From f3f6f7c0d66f136146a912e06c8fbe31ecfbc977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Fri, 20 Jan 2023 17:45:29 +0100 Subject: [PATCH] Key as dict entry - memory optimization for sets (#11595) If a dict has only keys, and no use of values, then a key can be stored directly in a dict's hashtable. The key replaces the dictEntry. To distinguish between a key and a dictEntry, we only use this optimization if the key is odd, i.e. if the key has the least significant bit set. This is true for sds strings, since the sds header is always an odd number of bytes. Dict entries are used as a fallback when there is a hash collision. A special dict entry without a value (only key and next) is used so we save one word in this case too. This saves 24 bytes per set element for larges sets, and also gains some speed improvement as a side effect (less allocations and cache misses). A quick test adding 1M elements to a set using the command below resulted in memory usage of 28.83M, compared to 46.29M on unstable. That's 18 bytes per set element on average. eval 'for i=1,1000000,1 do redis.call("sadd", "myset", "x"..i) end' 0 Other changes: Allocations are ensured to have at least 8 bits alignment on all systems. This affects 32-bit builds compiled without HAVE_MALLOC_SIZE (not jemalloc or glibc) in which Redis stores the size of each allocation, after this change in 8 bytes instead of previously 4 bytes per allocation. This is done so we can reliably use the 3 least significant bits in a pointer to encode stuff. --- src/defrag.c | 105 +++++---------- src/dict.c | 354 ++++++++++++++++++++++++++++++++++++++------------ src/dict.h | 22 +++- src/server.c | 9 +- src/t_set.c | 17 ++- src/zmalloc.c | 5 +- 6 files changed, 342 insertions(+), 170 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index b813ab74f..5b7fdb775 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -238,46 +238,26 @@ void activeDefragZsetEntry(zset *zs, dictEntry *de) { #define DEFRAG_SDS_DICT_VAL_VOID_PTR 3 #define DEFRAG_SDS_DICT_VAL_LUA_SCRIPT 4 -typedef struct { - dict *dict; - int val_type; -} activeDefragSdsDictData; - -void activeDefragSdsDictCallback(void *privdata, const dictEntry *_de) { - dictEntry *de = (dictEntry*)_de; - activeDefragSdsDictData *data = privdata; - dict *d = data->dict; - int val_type = data->val_type; - sds sdsele = dictGetKey(de), newsds; - if ((newsds = activeDefragSds(sdsele))) - dictSetKey(d, de, newsds); - /* defrag the value */ - if (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS) { - sdsele = dictGetVal(de); - if ((newsds = activeDefragSds(sdsele))) - dictSetVal(d, de, newsds); - } else if (val_type == DEFRAG_SDS_DICT_VAL_IS_STROB) { - robj *newele, *ele = dictGetVal(de); - if ((newele = activeDefragStringOb(ele))) - dictSetVal(d, de, newele); - } else if (val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR) { - void *newptr, *ptr = dictGetVal(de); - if ((newptr = activeDefragAlloc(ptr))) - dictSetVal(d, de, newptr); - } else if (val_type == DEFRAG_SDS_DICT_VAL_LUA_SCRIPT) { - void *newptr, *ptr = dictGetVal(de); - if ((newptr = activeDefragLuaScript(ptr))) - dictSetVal(d, de, newptr); - } +void activeDefragSdsDictCallback(void *privdata, const dictEntry *de) { + UNUSED(privdata); + UNUSED(de); } /* Defrag a dict with sds key and optional value (either ptr, sds or robj string) */ void activeDefragSdsDict(dict* d, int val_type) { - activeDefragSdsDictData data = {d, val_type}; unsigned long cursor = 0; + dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = (dictDefragAllocFunction *)activeDefragSds, + .defragVal = (val_type == DEFRAG_SDS_DICT_VAL_IS_SDS ? (dictDefragAllocFunction *)activeDefragSds : + val_type == DEFRAG_SDS_DICT_VAL_IS_STROB ? (dictDefragAllocFunction *)activeDefragStringOb : + val_type == DEFRAG_SDS_DICT_VAL_VOID_PTR ? (dictDefragAllocFunction *)activeDefragAlloc : + val_type == DEFRAG_SDS_DICT_VAL_LUA_SCRIPT ? (dictDefragAllocFunction *)activeDefragLuaScript : + NULL) + }; do { cursor = dictScanDefrag(d, cursor, activeDefragSdsDictCallback, - activeDefragAlloc, &data); + &defragfns, NULL); } while (cursor != 0); } @@ -407,19 +387,14 @@ void scanLaterZset(robj *ob, unsigned long *cursor) { zset *zs = (zset*)ob->ptr; dict *d = zs->dict; scanLaterZsetData data = {zs}; - *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, activeDefragAlloc, &data); + dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc}; + *cursor = dictScanDefrag(d, *cursor, scanLaterZsetCallback, &defragfns, &data); } -typedef struct { - dict *dict; -} scanLaterDictData; - -void scanLaterSetCallback(void *privdata, const dictEntry *_de) { - dictEntry *de = (dictEntry*)_de; - scanLaterDictData *data = privdata; - sds sdsele = dictGetKey(de), newsds; - if ((newsds = activeDefragSds(sdsele))) - dictSetKey(data->dict, de, newsds); +/* Used as scan callback when all the work is done in the dictDefragFunctions. */ +void scanCallbackCountScanned(void *privdata, const dictEntry *de) { + UNUSED(privdata); + UNUSED(de); server.stat_active_defrag_scanned++; } @@ -427,28 +402,23 @@ void scanLaterSet(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_SET || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; - scanLaterDictData data = {d}; - *cursor = dictScanDefrag(d, *cursor, scanLaterSetCallback, activeDefragAlloc, &data); -} - -void scanLaterHashCallback(void *privdata, const dictEntry *_de) { - dictEntry *de = (dictEntry*)_de; - scanLaterDictData *data = privdata; - sds sdsele = dictGetKey(de), newsds; - if ((newsds = activeDefragSds(sdsele))) - dictSetKey(data->dict, de, newsds); - sdsele = dictGetVal(de); - if ((newsds = activeDefragSds(sdsele))) - dictSetVal(data->dict, de, newsds); - server.stat_active_defrag_scanned++; + dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = (dictDefragAllocFunction *)activeDefragSds + }; + *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } void scanLaterHash(robj *ob, unsigned long *cursor) { if (ob->type != OBJ_HASH || ob->encoding != OBJ_ENCODING_HT) return; dict *d = ob->ptr; - scanLaterDictData data = {d}; - *cursor = dictScanDefrag(d, *cursor, scanLaterHashCallback, activeDefragAlloc, &data); + dictDefragFunctions defragfns = { + .defragAlloc = activeDefragAlloc, + .defragKey = (dictDefragAllocFunction *)activeDefragSds, + .defragVal = (dictDefragAllocFunction *)activeDefragSds + }; + *cursor = dictScanDefrag(d, *cursor, scanCallbackCountScanned, &defragfns, NULL); } void defragQuicklist(redisDb *db, dictEntry *kde) { @@ -788,14 +758,6 @@ void defragScanCallback(void *privdata, const dictEntry *de) { server.stat_active_defrag_scanned++; } -/* Dummy scan callback used when defragging the expire dictionary. We only - * defrag the entries, which is done per bucket. */ -void defragExpireScanCallback(void *privdata, const dictEntry *de) { - UNUSED(privdata); - UNUSED(de); - server.stat_active_defrag_scanned++; -} - /* Utility function to get the fragmentation ratio from jemalloc. * It is critical to do that by comparing only heap maps that belong to * jemalloc, and skip ones the jemalloc keeps as spare. Since we use this @@ -1003,6 +965,7 @@ void activeDefragCycle(void) { endtime = start + timelimit; latencyStartMonitor(latency); + dictDefragFunctions defragfns = {.defragAlloc = activeDefragAlloc}; do { /* if we're not continuing a scan from the last call or loop, start a new one */ if (!cursor && !expires_cursor) { @@ -1055,13 +1018,13 @@ void activeDefragCycle(void) { /* Scan the keyspace dict unless we're scanning the expire dict. */ if (!expires_cursor) cursor = dictScanDefrag(db->dict, cursor, defragScanCallback, - activeDefragAlloc, db); + &defragfns, db); /* When done scanning the keyspace dict, we scan the expire dict. */ if (!cursor) expires_cursor = dictScanDefrag(db->expires, expires_cursor, - defragExpireScanCallback, - activeDefragAlloc, NULL); + scanCallbackCountScanned, + &defragfns, NULL); /* Once in 16 scan iterations, 512 pointer reallocations. or 64 keys * (if we have a lot of pointers in one hash bucket or rehashing), diff --git a/src/dict.c b/src/dict.c index ec2d4dca0..d46e30588 100644 --- a/src/dict.c +++ b/src/dict.c @@ -74,12 +74,19 @@ struct dictEntry { * by dictType's dictEntryMetadataBytes(). */ }; +typedef struct { + void *key; + dictEntry *next; +} dictEntryNoValue; + /* -------------------------- private prototypes ---------------------------- */ static int _dictExpandIfNeeded(dict *d); static signed char _dictNextExp(unsigned long size); -static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing); static int _dictInit(dict *d, dictType *type); +static dictEntry *dictGetNext(const dictEntry *de); +static dictEntry **dictGetNextRef(dictEntry *de); +static void dictSetNext(dictEntry *de, dictEntry *next); /* -------------------------- hash functions -------------------------------- */ @@ -107,6 +114,63 @@ uint64_t dictGenCaseHashFunction(const unsigned char *buf, size_t len) { return siphash_nocase(buf,len,dict_hash_function_seed); } +/* --------------------- dictEntry pointer bit tricks ---------------------- */ + +/* The 3 least significant bits in a pointer to a dictEntry determines what the + * pointer actually points to. If the least bit is set, it's a key. Otherwise, + * the bit pattern of the least 3 significant bits mark the kind of entry. */ + +#define ENTRY_PTR_MASK 7 /* 111 */ +#define ENTRY_PTR_NORMAL 0 /* 000 */ +#define ENTRY_PTR_NO_VALUE 2 /* 010 */ + +/* Returns 1 if the entry pointer is a pointer to a key, rather than to an + * allocated entry. Returns 0 otherwise. */ +static inline int entryIsKey(const dictEntry *de) { + return (uintptr_t)(void *)de & 1; +} + +/* Returns 1 if the pointer is actually a pointer to a dictEntry struct. Returns + * 0 otherwise. */ +static inline int entryIsNormal(const dictEntry *de) { + return ((uintptr_t)(void *)de & ENTRY_PTR_MASK) == ENTRY_PTR_NORMAL; +} + +/* Returns 1 if the entry is a special entry with key and next, but without + * value. Returns 0 otherwise. */ +static inline int entryIsNoValue(const dictEntry *de) { + return ((uintptr_t)(void *)de & ENTRY_PTR_MASK) == ENTRY_PTR_NO_VALUE; +} + +/* Creates an entry without a value field. */ +static inline dictEntry *createEntryNoValue(void *key, dictEntry *next) { + dictEntryNoValue *entry = zmalloc(sizeof(*entry)); + entry->key = key; + entry->next = next; + return (dictEntry *)(void *)((uintptr_t)(void *)entry | ENTRY_PTR_NO_VALUE); +} + +static inline dictEntry *encodeMaskedPtr(const void *ptr, unsigned int bits) { + assert(((uintptr_t)ptr & ENTRY_PTR_MASK) == 0); + return (dictEntry *)(void *)((uintptr_t)ptr | bits); +} + +static inline void *decodeMaskedPtr(const dictEntry *de) { + assert(!entryIsKey(de)); + return (void *)((uintptr_t)(void *)de & ~ENTRY_PTR_MASK); +} + +/* Decodes the pointer to an entry without value, when you know it is an entry + * without value. Hint: Use entryIsNoValue to check. */ +static inline dictEntryNoValue *decodeEntryNoValue(const dictEntry *de) { + return decodeMaskedPtr(de); +} + +/* Returns 1 if the entry has a value field and 0 otherwise. */ +static inline int entryHasValue(const dictEntry *de) { + return entryIsNormal(de); +} + /* ----------------------------- API implementation ------------------------- */ /* Reset hash table parameters already initialized with _dictInit()*/ @@ -252,17 +316,42 @@ int dictRehash(dict *d, int n) { while(de) { uint64_t h; - nextde = de->next; + nextde = dictGetNext(de); + void *key = dictGetKey(de); /* Get the index in the new hash table */ if (d->ht_size_exp[1] > d->ht_size_exp[0]) { - h = dictHashKey(d, de->key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); + h = dictHashKey(d, key) & DICTHT_SIZE_MASK(d->ht_size_exp[1]); } else { /* We're shrinking the table. The tables sizes are powers of * two, so we simply mask the bucket index in the larger table * to get the bucket index in the smaller table. */ h = d->rehashidx & DICTHT_SIZE_MASK(d->ht_size_exp[1]); } - de->next = d->ht_table[1][h]; + if (d->type->no_value) { + if (d->type->keys_are_odd && !d->ht_table[1][h]) { + /* Destination bucket is empty and we can store the key + * directly without an allocated entry. Free the old entry + * if it's an allocated entry. + * + * TODO: Add a flag 'keys_are_even' and if set, we can use + * this optimization for these dicts too. We can set the LSB + * bit when stored as a dict entry and clear it again when + * we need the key back. */ + assert(entryIsKey(key)); + if (!entryIsKey(de)) zfree(decodeMaskedPtr(de)); + de = key; + } else if (entryIsKey(de)) { + /* We don't have an allocated entry but we need one. */ + de = createEntryNoValue(key, d->ht_table[1][h]); + } else { + /* Just move the existing entry to the destination table and + * update the 'next' field. */ + assert(entryIsNoValue(de)); + dictSetNext(de, d->ht_table[1][h]); + } + } else { + dictSetNext(de, d->ht_table[1][h]); + } d->ht_table[1][h] = de; d->ht_used[0]--; d->ht_used[1]++; @@ -334,7 +423,7 @@ int dictAdd(dict *d, void *key, void *val) dictEntry *entry = dictAddRaw(d,key,NULL); if (!entry) return DICT_ERR; - dictSetVal(d, entry, val); + if (!d->type->no_value) dictSetVal(d, entry, val); return DICT_OK; } @@ -358,33 +447,61 @@ int dictAdd(dict *d, void *key, void *val) */ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) { - long index; + /* Get the position for the new key or NULL if the key already exists. */ + void *position = dictFindPositionForInsert(d, key, existing); + if (!position) return NULL; + + /* Dup the key if necessary. */ + if (d->type->keyDup) key = d->type->keyDup(d, key); + + return dictInsertAtPosition(d, key, position); +} + +/* Adds a key in the dict's hashtable at the position returned by a preceding + * call to dictFindPositionForInsert. This is a low level function which allows + * splitting dictAddRaw in two parts. Normally, dictAddRaw or dictAdd should be + * used instead. */ +dictEntry *dictInsertAtPosition(dict *d, void *key, void *position) { + dictEntry **bucket = position; /* It's a bucket, but the API hides that. */ dictEntry *entry; - int htidx; - - if (dictIsRehashing(d)) _dictRehashStep(d); - - /* Get the index of the new element, or -1 if - * the element already exists. */ - if ((index = _dictKeyIndex(d, key, dictHashKey(d,key), existing)) == -1) - return NULL; - - /* Allocate the memory and store the new entry. - * Insert the element in top, with the assumption that in a database - * system it is more likely that recently added entries are accessed - * more frequently. */ - htidx = dictIsRehashing(d) ? 1 : 0; + /* If rehashing is ongoing, we insert in table 1, otherwise in table 0. + * Assert that the provided bucket is the right table. */ + int htidx = dictIsRehashing(d) ? 1 : 0; + assert(bucket >= &d->ht_table[htidx][0] && + bucket <= &d->ht_table[htidx][DICTHT_SIZE_MASK(d->ht_size_exp[htidx])]); size_t metasize = dictEntryMetadataSize(d); - entry = zmalloc(sizeof(*entry) + metasize); - if (metasize > 0) { - memset(dictEntryMetadata(entry), 0, metasize); + if (d->type->no_value) { + assert(!metasize); /* Entry metadata + no value not supported. */ + if (d->type->keys_are_odd && !*bucket) { + /* We can store the key directly in the destination bucket without the + * allocated entry. + * + * TODO: Add a flag 'keys_are_even' and if set, we can use this + * optimization for these dicts too. We can set the LSB bit when + * stored as a dict entry and clear it again when we need the key + * back. */ + entry = key; + assert(entryIsKey(entry)); + } else { + /* Allocate an entry without value. */ + entry = createEntryNoValue(key, *bucket); + } + } else { + /* Allocate the memory and store the new entry. + * Insert the element in top, with the assumption that in a database + * system it is more likely that recently added entries are accessed + * more frequently. */ + entry = zmalloc(sizeof(*entry) + metasize); + assert(entryIsNormal(entry)); /* Check alignment of allocation */ + if (metasize > 0) { + memset(dictEntryMetadata(entry), 0, metasize); + } + entry->key = key; + entry->next = *bucket; } - entry->next = d->ht_table[htidx][index]; - d->ht_table[htidx][index] = entry; + *bucket = entry; d->ht_used[htidx]++; - /* Set the hash entry fields. */ - dictSetKey(d, entry, key); return entry; } @@ -395,7 +512,7 @@ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) * operation. */ int dictReplace(dict *d, void *key, void *val) { - dictEntry *entry, *existing, auxentry; + dictEntry *entry, *existing; /* Try to add the element. If the key * does not exists dictAdd will succeed. */ @@ -410,9 +527,10 @@ int dictReplace(dict *d, void *key, void *val) * as the previous one. In this context, think to reference counting, * you want to increment (set), and then decrement (free), and not the * reverse. */ - auxentry = *existing; + void *oldval = dictGetVal(existing); dictSetVal(d, existing, val); - dictFreeVal(d, &auxentry); + if (d->type->valDestructor) + d->type->valDestructor(d, oldval); return 0; } @@ -448,12 +566,13 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { he = d->ht_table[table][idx]; prevHe = NULL; while(he) { - if (key==he->key || dictCompareKeys(d, key, he->key)) { + void *he_key = dictGetKey(he); + if (key == he_key || dictCompareKeys(d, key, he_key)) { /* Unlink the element from the list */ if (prevHe) - prevHe->next = he->next; + dictSetNext(prevHe, dictGetNext(he)); else - d->ht_table[table][idx] = he->next; + d->ht_table[table][idx] = dictGetNext(he); if (!nofree) { dictFreeUnlinkedEntry(d, he); } @@ -461,7 +580,7 @@ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { return he; } prevHe = he; - he = he->next; + he = dictGetNext(he); } if (!dictIsRehashing(d)) break; } @@ -505,7 +624,7 @@ void dictFreeUnlinkedEntry(dict *d, dictEntry *he) { if (he == NULL) return; dictFreeKey(d, he); dictFreeVal(d, he); - zfree(he); + if (!entryIsKey(he)) zfree(decodeMaskedPtr(he)); } /* Destroy an entire dictionary */ @@ -520,10 +639,10 @@ int _dictClear(dict *d, int htidx, void(callback)(dict*)) { if ((he = d->ht_table[htidx][i]) == NULL) continue; while(he) { - nextHe = he->next; + nextHe = dictGetNext(he); dictFreeKey(d, he); dictFreeVal(d, he); - zfree(he); + if (!entryIsKey(he)) zfree(decodeMaskedPtr(he)); d->ht_used[htidx]--; he = nextHe; } @@ -555,9 +674,10 @@ dictEntry *dictFind(dict *d, const void *key) idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); he = d->ht_table[table][idx]; while(he) { - if (key==he->key || dictCompareKeys(d, key, he->key)) + void *he_key = dictGetKey(he); + if (key == he_key || dictCompareKeys(d, key, he_key)) return he; - he = he->next; + he = dictGetNext(he); } if (!dictIsRehashing(d)) return NULL; } @@ -597,14 +717,15 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, for (table = 0; table <= 1; table++) { idx = h & DICTHT_SIZE_MASK(d->ht_size_exp[table]); dictEntry **ref = &d->ht_table[table][idx]; - while(*ref) { - if (key==(*ref)->key || dictCompareKeys(d, key, (*ref)->key)) { + while (ref && *ref) { + void *de_key = dictGetKey(*ref); + if (key == de_key || dictCompareKeys(d, key, de_key)) { *table_index = table; *plink = ref; dictPauseRehashing(d); return *ref; } - ref = &(*ref)->next; + ref = dictGetNextRef(*ref); } if (!dictIsRehashing(d)) return NULL; } @@ -614,14 +735,15 @@ dictEntry *dictTwoPhaseUnlinkFind(dict *d, const void *key, dictEntry ***plink, void dictTwoPhaseUnlinkFree(dict *d, dictEntry *he, dictEntry **plink, int table_index) { if (he == NULL) return; d->ht_used[table_index]--; - *plink = he->next; + *plink = dictGetNext(he); dictFreeKey(d, he); dictFreeVal(d, he); - zfree(he); + if (!entryIsKey(he)) zfree(decodeMaskedPtr(he)); dictResumeRehashing(d); } void dictSetKey(dict *d, dictEntry* de, void *key) { + assert(!d->type->no_value); if (d->type->keyDup) de->key = d->type->keyDup(d, key); else @@ -629,63 +751,104 @@ void dictSetKey(dict *d, dictEntry* de, void *key) { } void dictSetVal(dict *d, dictEntry *de, void *val) { + assert(entryHasValue(de)); de->v.val = d->type->valDup ? d->type->valDup(d, val) : val; } void dictSetSignedIntegerVal(dictEntry *de, int64_t val) { + assert(entryHasValue(de)); de->v.s64 = val; } void dictSetUnsignedIntegerVal(dictEntry *de, uint64_t val) { + assert(entryHasValue(de)); de->v.u64 = val; } void dictSetDoubleVal(dictEntry *de, double val) { + assert(entryHasValue(de)); de->v.d = val; } int64_t dictIncrSignedIntegerVal(dictEntry *de, int64_t val) { + assert(entryHasValue(de)); return de->v.s64 += val; } uint64_t dictIncrUnsignedIntegerVal(dictEntry *de, uint64_t val) { + assert(entryHasValue(de)); return de->v.u64 += val; } double dictIncrDoubleVal(dictEntry *de, double val) { + assert(entryHasValue(de)); return de->v.d += val; } /* A pointer to the metadata section within the dict entry. */ void *dictEntryMetadata(dictEntry *de) { + assert(entryHasValue(de)); return &de->metadata; } void *dictGetKey(const dictEntry *de) { + if (entryIsKey(de)) return (void*)de; + if (entryIsNoValue(de)) return decodeEntryNoValue(de)->key; return de->key; } void *dictGetVal(const dictEntry *de) { + assert(entryHasValue(de)); return de->v.val; } int64_t dictGetSignedIntegerVal(const dictEntry *de) { + assert(entryHasValue(de)); return de->v.s64; } uint64_t dictGetUnsignedIntegerVal(const dictEntry *de) { + assert(entryHasValue(de)); return de->v.u64; } double dictGetDoubleVal(const dictEntry *de) { + assert(entryHasValue(de)); return de->v.d; } /* Returns a mutable reference to the value as a double within the entry. */ double *dictGetDoubleValPtr(dictEntry *de) { + assert(entryHasValue(de)); return &de->v.d; } +/* Returns the 'next' field of the entry or NULL if the entry doesn't have a + * 'next' field. */ +static dictEntry *dictGetNext(const dictEntry *de) { + if (entryIsKey(de)) return NULL; /* there's no next */ + if (entryIsNoValue(de)) return decodeEntryNoValue(de)->next; + return de->next; +} + +/* Returns a pointer to the 'next' field in the entry or NULL if the entry + * doesn't have a next field. */ +static dictEntry **dictGetNextRef(dictEntry *de) { + if (entryIsKey(de)) return NULL; + if (entryIsNoValue(de)) return &decodeEntryNoValue(de)->next; + return &de->next; +} + +static void dictSetNext(dictEntry *de, dictEntry *next) { + assert(!entryIsKey(de)); + if (entryIsNoValue(de)) { + dictEntryNoValue *entry = decodeEntryNoValue(de); + entry->next = next; + } else { + de->next = next; + } +} + /* Returns the memory usage in bytes of the dict, excluding the size of the keys * and values. */ size_t dictMemUsage(const dict *d) { @@ -801,7 +964,7 @@ dictEntry *dictNext(dictIterator *iter) if (iter->entry) { /* We need to save the 'next' here, the iterator user * may delete the entry we are returning. */ - iter->nextEntry = iter->entry->next; + iter->nextEntry = dictGetNext(iter->entry); return iter->entry; } } @@ -847,12 +1010,12 @@ dictEntry *dictGetRandomKey(dict *d) listlen = 0; orighe = he; while(he) { - he = he->next; + he = dictGetNext(he); listlen++; } listele = random() % listlen; he = orighe; - while(listele--) he = he->next; + while(listele--) he = dictGetNext(he); return he; } @@ -936,7 +1099,7 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { * empty while iterating. */ *des = he; des++; - he = he->next; + he = dictGetNext(he); stored++; if (stored == count) return stored; } @@ -948,17 +1111,39 @@ unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count) { } -/* Reallocate the dictEntry allocations in a bucket using the provided - * allocation function in order to defrag them. */ -static void dictDefragBucket(dict *d, dictEntry **bucketref, dictDefragAllocFunction *allocfn) { +/* Reallocate the dictEntry, key and value allocations in a bucket using the + * provided allocation functions in order to defrag them. */ +static void dictDefragBucket(dict *d, dictEntry **bucketref, dictDefragFunctions *defragfns) { + dictDefragAllocFunction *defragalloc = defragfns->defragAlloc; + dictDefragAllocFunction *defragkey = defragfns->defragKey; + dictDefragAllocFunction *defragval = defragfns->defragVal; while (bucketref && *bucketref) { - dictEntry *de = *bucketref, *newde; - if ((newde = allocfn(de))) { + dictEntry *de = *bucketref, *newde = NULL; + void *newkey = defragkey ? defragkey(dictGetKey(de)) : NULL; + void *newval = defragval ? defragval(dictGetVal(de)) : NULL; + if (entryIsKey(de)) { + if (newkey) *bucketref = newkey; + assert(entryIsKey(*bucketref)); + } else if (entryIsNoValue(de)) { + dictEntryNoValue *entry = decodeEntryNoValue(de), *newentry; + if ((newentry = defragalloc(entry))) { + newde = encodeMaskedPtr(newentry, ENTRY_PTR_NO_VALUE); + entry = newentry; + } + if (newkey) entry->key = newkey; + } else { + assert(entryIsNormal(de)); + newde = defragalloc(de); + if (newde) de = newde; + if (newkey) de->key = newkey; + if (newval) de->v.val = newval; + } + if (newde) { *bucketref = newde; if (d->type->afterReplaceEntry) d->type->afterReplaceEntry(d, newde); } - bucketref = &(*bucketref)->next; + bucketref = dictGetNextRef(*bucketref); } } @@ -1094,14 +1279,14 @@ unsigned long dictScan(dict *d, * entries using the provided allocation function. This feature was added for * the active defrag feature. * - * The 'defracallocfn' callback is called with a pointer to memory that callback - * can reallocate. The callback should return a new memory address or NULL, + * The 'defragfns' callbacks are called with a pointer to memory that callback + * can reallocate. The callbacks should return a new memory address or NULL, * where NULL means that no reallocation happened and the old memory is still * valid. */ unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, - dictDefragAllocFunction *defragallocfn, + dictDefragFunctions *defragfns, void *privdata) { int htidx0, htidx1; @@ -1118,12 +1303,12 @@ unsigned long dictScanDefrag(dict *d, m0 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx0]); /* Emit entries at cursor */ - if (defragallocfn) { - dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragallocfn); + if (defragfns) { + dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragfns); } de = d->ht_table[htidx0][v & m0]; while (de) { - next = de->next; + next = dictGetNext(de); fn(privdata, de); de = next; } @@ -1151,12 +1336,12 @@ unsigned long dictScanDefrag(dict *d, m1 = DICTHT_SIZE_MASK(d->ht_size_exp[htidx1]); /* Emit entries at cursor */ - if (defragallocfn) { - dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragallocfn); + if (defragfns) { + dictDefragBucket(d, &d->ht_table[htidx0][v & m0], defragfns); } de = d->ht_table[htidx0][v & m0]; while (de) { - next = de->next; + next = dictGetNext(de); fn(privdata, de); de = next; } @@ -1165,12 +1350,12 @@ unsigned long dictScanDefrag(dict *d, * of the index pointed to by the cursor in the smaller table */ do { /* Emit entries at cursor */ - if (defragallocfn) { - dictDefragBucket(d, &d->ht_table[htidx1][v & m1], defragallocfn); + if (defragfns) { + dictDefragBucket(d, &d->ht_table[htidx1][v & m1], defragfns); } de = d->ht_table[htidx1][v & m1]; while (de) { - next = de->next; + next = dictGetNext(de); fn(privdata, de); de = next; } @@ -1241,36 +1426,39 @@ static signed char _dictNextExp(unsigned long size) } } -/* Returns the index of a free slot that can be populated with - * a hash entry for the given 'key'. - * If the key already exists, -1 is returned - * and the optional output parameter may be filled. - * - * Note that if we are in the process of rehashing the hash table, the - * index is always returned in the context of the second (new) hash table. */ -static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing) -{ +/* Finds and returns the position within the dict where the provided key should + * be inserted using dictInsertAtPosition if the key does not already exist in + * the dict. If the key exists in the dict, NULL is returned and the optional + * 'existing' entry pointer is populated, if provided. */ +void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) { unsigned long idx, table; dictEntry *he; + uint64_t hash = dictHashKey(d, key); if (existing) *existing = NULL; + if (dictIsRehashing(d)) _dictRehashStep(d); /* Expand the hash table if needed */ if (_dictExpandIfNeeded(d) == DICT_ERR) - return -1; + return NULL; for (table = 0; table <= 1; table++) { idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); /* Search if this slot does not already contain the given key */ he = d->ht_table[table][idx]; while(he) { - if (key==he->key || dictCompareKeys(d, key, he->key)) { + void *he_key = dictGetKey(he); + if (key == he_key || dictCompareKeys(d, key, he_key)) { if (existing) *existing = he; - return -1; + return NULL; } - he = he->next; + he = dictGetNext(he); } if (!dictIsRehashing(d)) break; } - return idx; + + /* If we are in the process of rehashing the hash table, the bucket is + * always returned in the context of the second (new) hash table. */ + dictEntry **bucket = &d->ht_table[dictIsRehashing(d) ? 1 : 0][idx]; + return bucket; } void dictEmpty(dict *d, void(callback)(dict*)) { @@ -1302,9 +1490,9 @@ dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]); he = d->ht_table[table][idx]; while(he) { - if (oldptr==he->key) + if (oldptr == dictGetKey(he)) return he; - he = he->next; + he = dictGetNext(he); } if (!dictIsRehashing(d)) return NULL; } @@ -1340,7 +1528,7 @@ size_t _dictGetStatsHt(char *buf, size_t bufsize, dict *d, int htidx) { he = d->ht_table[htidx][i]; while(he) { chainlen++; - he = he->next; + he = dictGetNext(he); } clvector[(chainlen < DICT_STATS_VECTLEN) ? chainlen : (DICT_STATS_VECTLEN-1)]++; if (chainlen > maxchainlen) maxchainlen = chainlen; diff --git a/src/dict.h b/src/dict.h index a8a76e69c..06d4bff78 100644 --- a/src/dict.h +++ b/src/dict.h @@ -56,6 +56,19 @@ typedef struct dictType { void (*keyDestructor)(dict *d, void *key); void (*valDestructor)(dict *d, void *obj); int (*expandAllowed)(size_t moreMem, double usedRatio); + /* Flags */ + /* The 'no_value' flag, if set, indicates that values are not used, i.e. the + * dict is a set. When this flag is set, it's not possible to access the + * value of a dictEntry and it's also impossible to use dictSetKey(). Entry + * metadata can also not be used. */ + unsigned int no_value:1; + /* If no_value = 1 and all keys are odd (LSB=1), setting keys_are_odd = 1 + * enables one more optimization: to store a key without an allocated + * dictEntry. */ + unsigned int keys_are_odd:1; + /* TODO: Add a 'keys_are_even' flag and use a similar optimization if that + * flag is set. */ + /* Allow each dict and dictEntry to carry extra caller-defined metadata. The * extra memory is initialized to 0 when allocated. */ size_t (*dictEntryMetadataBytes)(dict *d); @@ -100,6 +113,11 @@ typedef struct dictIterator { typedef void (dictScanFunction)(void *privdata, const dictEntry *de); typedef void *(dictDefragAllocFunction)(void *ptr); +typedef struct { + dictDefragAllocFunction *defragAlloc; /* Used for entries etc. */ + dictDefragAllocFunction *defragKey; /* Defrag-realloc keys (optional) */ + dictDefragAllocFunction *defragVal; /* Defrag-realloc values (optional) */ +} dictDefragFunctions; /* This is the initial size of every hash table */ #define DICT_HT_INITIAL_EXP 2 @@ -152,6 +170,8 @@ int dictTryExpand(dict *d, unsigned long size); void *dictMetadata(dict *d); int dictAdd(dict *d, void *key, void *val); dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing); +void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing); +dictEntry *dictInsertAtPosition(dict *d, void *key, void *position); dictEntry *dictAddOrFind(dict *d, void *key); int dictReplace(dict *d, void *key, void *val); int dictDelete(dict *d, const void *key); @@ -200,7 +220,7 @@ int dictRehashMilliseconds(dict *d, int ms); void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, void *privdata); -unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragAllocFunction *allocfn, void *privdata); +unsigned long dictScanDefrag(dict *d, unsigned long v, dictScanFunction *fn, dictDefragFunctions *defragfns, void *privdata); uint64_t dictGetHash(dict *d, const void *key); dictEntry *dictFindEntryByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); diff --git a/src/server.c b/src/server.c index 910f299f3..8b4ec28e4 100644 --- a/src/server.c +++ b/src/server.c @@ -448,7 +448,8 @@ dictType setDictType = { NULL, /* val dup */ dictSdsKeyCompare, /* key compare */ dictSdsDestructor, /* key destructor */ - NULL /* val destructor */ + .no_value = 1, /* no values in this dict */ + .keys_are_odd = 1 /* an SDS string is always an odd pointer */ }; /* Sorted sets hash (note: a skiplist is used in addition to the hash table) */ @@ -471,9 +472,9 @@ dictType dbDictType = { dictSdsDestructor, /* key destructor */ dictObjectDestructor, /* val destructor */ dictExpandAllowed, /* allow to expand */ - dbDictEntryMetadataSize, /* size of entry metadata in bytes */ - dbDictMetadataSize, /* size of dict metadata in bytes */ - dbDictAfterReplaceEntry /* notify entry moved/reallocated */ + .dictEntryMetadataBytes = dbDictEntryMetadataSize, + .dictMetadataBytes = dbDictMetadataSize, + .afterReplaceEntry = dbDictAfterReplaceEntry }; /* Db->expires */ diff --git a/src/t_set.c b/src/t_set.c index 6d8023f31..7b0dbed5a 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -122,17 +122,16 @@ int setTypeAddAux(robj *set, char *str, size_t len, int64_t llval, int str_is_sd /* Avoid duping the string if it is an sds string. */ sds sdsval = str_is_sds ? (sds)str : sdsnewlen(str, len); dict *ht = set->ptr; - dictEntry *de = dictAddRaw(ht,sdsval,NULL); - if (de && sdsval == str) { - /* String was added but we don't own this sds string. Dup it and - * replace it in the dict entry. */ - dictSetKey(ht,de,sdsdup((sds)str)); - dictSetVal(ht,de,NULL); - } else if (!de && sdsval != str) { - /* String was already a member. Free our temporary sds copy. */ + void *position = dictFindPositionForInsert(ht, sdsval, NULL); + if (position) { + /* Key doesn't already exist in the set. Add it but dup the key. */ + if (sdsval == str) sdsval = sdsdup(sdsval); + dictInsertAtPosition(ht, sdsval, position); + } else if (sdsval != str) { + /* String is already a member. Free our temporary sds copy. */ sdsfree(sdsval); } - return (de != NULL); + return (position != NULL); } else if (set->encoding == OBJ_ENCODING_LISTPACK) { unsigned char *lp = set->ptr; unsigned char *p = lpFirst(lp); diff --git a/src/zmalloc.c b/src/zmalloc.c index 6d02604f4..e7b426277 100644 --- a/src/zmalloc.c +++ b/src/zmalloc.c @@ -60,8 +60,9 @@ void zlibc_free(void *ptr) { #ifdef HAVE_MALLOC_SIZE #define PREFIX_SIZE (0) #else -#if defined(__sun) || defined(__sparc) || defined(__sparc__) -#define PREFIX_SIZE (sizeof(long long)) +/* Use at least 8 bits alignment on all systems. */ +#if SIZE_MAX < 0xffffffffffffffffull +#define PREFIX_SIZE 8 #else #define PREFIX_SIZE (sizeof(size_t)) #endif