Merge pull request #6903 from oranagra/defrag_lists

Defrag big lists in portions to avoid latency and freeze
Salvatore Sanfilippo 2020-02-20 13:00:39 +01:00 committed by GitHub
commit e741b0c257
4 changed files with 350 additions and 34 deletions

View File

@ -5,8 +5,8 @@
* We do that by scanning the keyspace and for each pointer we have, we can try to
* ask the allocator if moving it to a new address will help reduce fragmentation.
*
* Copyright (c) 2017, Oran Agra
* Copyright (c) 2017, Redis Labs, Inc
* Copyright (c) 2020, Oran Agra
* Copyright (c) 2020, Redis Labs, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@ -408,25 +408,32 @@ dictEntry* replaceSateliteDictKeyPtrAndOrDefragDictEntry(dict *d, sds oldkey, sd
return NULL;
}
long activeDefragQuickListNodes(quicklist *ql) {
quicklistNode *node = ql->head, *newnode;
long activeDefragQuickListNode(quicklist *ql, quicklistNode **node_ref) {
quicklistNode *newnode, *node = *node_ref;
long defragged = 0;
unsigned char *newzl;
if ((newnode = activeDefragAlloc(node))) {
if (newnode->prev)
newnode->prev->next = newnode;
else
ql->head = newnode;
if (newnode->next)
newnode->next->prev = newnode;
else
ql->tail = newnode;
*node_ref = node = newnode;
defragged++;
}
if ((newzl = activeDefragAlloc(node->zl)))
defragged++, node->zl = newzl;
return defragged;
}
long activeDefragQuickListNodes(quicklist *ql) {
quicklistNode *node = ql->head;
long defragged = 0;
while (node) {
if ((newnode = activeDefragAlloc(node))) {
if (newnode->prev)
newnode->prev->next = newnode;
else
ql->head = newnode;
if (newnode->next)
newnode->next->prev = newnode;
else
ql->tail = newnode;
node = newnode;
defragged++;
}
if ((newzl = activeDefragAlloc(node->zl)))
defragged++, node->zl = newzl;
defragged += activeDefragQuickListNode(ql, &node);
node = node->next;
}
return defragged;
@ -440,12 +447,48 @@ void defragLater(redisDb *db, dictEntry *kde) {
listAddNodeTail(db->defrag_later, key);
}
long scanLaterList(robj *ob) {
/* returns 0 if no more work needs to be done, and 1 if time is up and more work is needed. */
long scanLaterList(robj *ob, unsigned long *cursor, long long endtime, long long *defragged) {
quicklist *ql = ob->ptr;
quicklistNode *node;
long iterations = 0;
int bookmark_failed = 0;
if (ob->type != OBJ_LIST || ob->encoding != OBJ_ENCODING_QUICKLIST)
return 0;
server.stat_active_defrag_scanned+=ql->len;
return activeDefragQuickListNodes(ql);
if (*cursor == 0) {
/* if cursor is 0, we start a new iteration */
node = ql->head;
} else {
node = quicklistBookmarkFind(ql, "_AD");
if (!node) {
/* if the bookmark was deleted, it means we reached the end. */
*cursor = 0;
return 0;
}
node = node->next;
}
(*cursor)++;
while (node) {
(*defragged) += activeDefragQuickListNode(ql, &node);
server.stat_active_defrag_scanned++;
if (++iterations > 128 && !bookmark_failed) {
if (ustime() > endtime) {
if (!quicklistBookmarkCreate(&ql, "_AD", node)) {
bookmark_failed = 1;
} else {
ob->ptr = ql; /* bookmark creation may have re-allocated the quicklist */
return 1;
}
}
iterations = 0;
}
node = node->next;
}
quicklistBookmarkDelete(ql, "_AD");
*cursor = 0;
return bookmark_failed? 1: 0;
}
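
To make the resume protocol above concrete, here is a minimal caller sketch (hypothetical; the real driver is defragLaterItem/defragLaterStep further down). It assumes the conventions of this function: a cursor of 0 means "start from the head", a return value of 1 means the time budget ran out and the call should be repeated, and the "_AD" bookmark remembers the node to resume from between calls.

/* Hypothetical sketch only: repeatedly call scanLaterList() with a small time
 * budget until the whole list has been scanned. */
static void defrag_whole_list_in_portions(robj *ob) {
    unsigned long cursor = 0;
    long long defragged = 0;
    do {
        long long endtime = ustime() + 500; /* e.g. 500 microseconds per portion */
        if (!scanLaterList(ob, &cursor, endtime, &defragged))
            break; /* returned 0: the list is fully scanned, cursor is back to 0 */
        /* returned 1: deadline reached; the "_AD" bookmark marks the resume
         * point, so the next call continues where this one stopped. */
    } while (cursor != 0);
    server.stat_active_defrag_hits += defragged;
}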
typedef struct {
@ -638,7 +681,8 @@ int scanLaterStraemListpacks(robj *ob, unsigned long *cursor, long long endtime,
void *newdata = activeDefragAlloc(ri.data);
if (newdata)
raxSetData(ri.node, ri.data=newdata), (*defragged)++;
if (++iterations > 16) {
server.stat_active_defrag_scanned++;
if (++iterations > 128) {
if (ustime() > endtime) {
serverAssert(ri.key_len==sizeof(last));
memcpy(last,ri.key,ri.key_len);
@ -900,8 +944,7 @@ int defragLaterItem(dictEntry *de, unsigned long *cursor, long long endtime) {
if (de) {
robj *ob = dictGetVal(de);
if (ob->type == OBJ_LIST) {
server.stat_active_defrag_hits += scanLaterList(ob);
*cursor = 0; /* list has no scan, we must finish it in one go */
return scanLaterList(ob, cursor, endtime, &server.stat_active_defrag_hits);
} else if (ob->type == OBJ_SET) {
server.stat_active_defrag_hits += scanLaterSet(ob, cursor);
} else if (ob->type == OBJ_ZSET) {
@ -961,11 +1004,6 @@ int defragLaterStep(redisDb *db, long long endtime) {
if (defragLaterItem(de, &defrag_later_cursor, endtime))
quit = 1; /* time is up, we didn't finish all the work */
/* Don't start a new BIG key in this loop, this is because the
* next key can be a list, and scanLaterList must be done in one cycle */
if (!defrag_later_cursor)
quit = 1;
/* Once in 16 scan iterations, 512 pointer reallocations, or 64 fields
* (if we have a lot of pointers in one hash bucket, or rehashing),
* check if we reached the time limit. */

View File

@ -70,6 +70,12 @@ static const size_t optimization_level[] = {4096, 8192, 16384, 32768, 65536};
} while (0);
#endif
/* Bookmarks forward declarations */
#define QL_MAX_BM ((1 << QL_BM_BITS)-1)
quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name);
quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node);
void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm);
/* Simple way to give quicklistEntry structs default values with one call. */
#define initEntry(e) \
do { \
@ -100,10 +106,11 @@ quicklist *quicklistCreate(void) {
quicklist->count = 0;
quicklist->compress = 0;
quicklist->fill = -2;
quicklist->bookmark_count = 0;
return quicklist;
}
#define COMPRESS_MAX (1 << 16)
#define COMPRESS_MAX (1 << QL_COMP_BITS)
void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
if (compress > COMPRESS_MAX) {
compress = COMPRESS_MAX;
@ -113,7 +120,7 @@ void quicklistSetCompressDepth(quicklist *quicklist, int compress) {
quicklist->compress = compress;
}
#define FILL_MAX (1 << 15)
#define FILL_MAX (1 << (QL_FILL_BITS-1))
void quicklistSetFill(quicklist *quicklist, int fill) {
if (fill > FILL_MAX) {
fill = FILL_MAX;
@ -169,6 +176,7 @@ void quicklistRelease(quicklist *quicklist) {
quicklist->len--;
current = next;
}
quicklistBookmarksClear(quicklist);
zfree(quicklist);
}
@ -578,6 +586,15 @@ quicklist *quicklistCreateFromZiplist(int fill, int compress,
REDIS_STATIC void __quicklistDelNode(quicklist *quicklist,
quicklistNode *node) {
/* Update the bookmark if any */
quicklistBookmark *bm = _quicklistBookmarkFindByNode(quicklist, node);
if (bm) {
bm->node = node->next;
/* if the bookmark was to the last node, delete it. */
if (!bm->node)
_quicklistBookmarkDelete(quicklist, bm);
}
if (node->next)
node->next->prev = node->prev;
if (node->prev)
@ -1410,6 +1427,87 @@ void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
}
}
/* Create or update a bookmark in the list which will be updated to the next node
* automatically when the one referenced gets deleted.
* Returns 1 on success (creation of new bookmark or override of an existing one).
* Returns 0 on failure (reached the maximum supported number of bookmarks).
* NOTE: use short simple names, so that string compare on find is quick.
* NOTE: bookmark creation may re-allocate the quicklist, so the input pointer
may change, and it's the caller's responsibility to update the reference.
*/
int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node) {
quicklist *ql = *ql_ref;
if (ql->bookmark_count >= QL_MAX_BM)
return 0;
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (bm) {
bm->node = node;
return 1;
}
ql = zrealloc(ql, sizeof(quicklist) + (ql->bookmark_count+1) * sizeof(quicklistBookmark));
*ql_ref = ql;
ql->bookmarks[ql->bookmark_count].node = node;
ql->bookmarks[ql->bookmark_count].name = zstrdup(name);
ql->bookmark_count++;
return 1;
}
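
To illustrate the NOTE above about re-allocation, a small hedged sketch of the calling pattern (the owner object 'ob' is hypothetical; scanLaterList in defrag.c does exactly this with ob->ptr):

/* Sketch: place a resume bookmark and keep the owner's pointer valid,
 * since the bookmark array is realloc'ed onto the tail of the struct. */
quicklist *ql = ob->ptr;                      /* 'ob' is a hypothetical owner */
if (quicklistBookmarkCreate(&ql, "_AD", node)) {
    ob->ptr = ql;        /* the realloc may have moved the quicklist struct */
} else {
    /* all (1 << QL_BM_BITS) - 1 = 15 bookmark slots are in use; proceed
     * without a resume point */
}
/* later: resume from the bookmark, then drop it once the scan is complete */
quicklistNode *resume = quicklistBookmarkFind(ql, "_AD");
if (resume) { /* continue scanning from 'resume' */ }
quicklistBookmarkDelete(ql, "_AD");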
/* Find the quicklist node referenced by a named bookmark.
* When the bookmarked node is deleted the bookmark is updated to the next node,
* and if that's the last node, the bookmark is deleted (so find returns NULL). */
quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name) {
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (!bm) return NULL;
return bm->node;
}
/* Delete a named bookmark.
* returns 0 if bookmark was not found, and 1 if deleted.
* Note that the bookmark memory is not freed yet, and is kept for future use. */
int quicklistBookmarkDelete(quicklist *ql, const char *name) {
quicklistBookmark *bm = _quicklistBookmarkFindByName(ql, name);
if (!bm)
return 0;
_quicklistBookmarkDelete(ql, bm);
return 1;
}
quicklistBookmark *_quicklistBookmarkFindByName(quicklist *ql, const char *name) {
unsigned i;
for (i=0; i<ql->bookmark_count; i++) {
if (!strcmp(ql->bookmarks[i].name, name)) {
return &ql->bookmarks[i];
}
}
return NULL;
}
quicklistBookmark *_quicklistBookmarkFindByNode(quicklist *ql, quicklistNode *node) {
unsigned i;
for (i=0; i<ql->bookmark_count; i++) {
if (ql->bookmarks[i].node == node) {
return &ql->bookmarks[i];
}
}
return NULL;
}
void _quicklistBookmarkDelete(quicklist *ql, quicklistBookmark *bm) {
int index = bm - ql->bookmarks;
zfree(bm->name);
ql->bookmark_count--;
memmove(bm, bm+1, (ql->bookmark_count - index)* sizeof(*bm));
/* NOTE: We do not shrink (realloc) the quicklist yet, to avoid resonance;
* the bookmark slot may be re-used later (a call to realloc may then be a no-op). */
}
void quicklistBookmarksClear(quicklist *ql) {
while (ql->bookmark_count)
zfree(ql->bookmarks[--ql->bookmark_count].name);
/* NOTE: We do not shrink (realloc) the quicklist. The main use case for this
* function is just before releasing the allocation. */
}
/* The rest of this file is test cases and test helpers. */
#ifdef REDIS_TEST
#include <stdint.h>
@ -2641,6 +2739,54 @@ int quicklistTest(int argc, char *argv[]) {
printf("Compressions: %0.2f seconds.\n", (float)(stop - start) / 1000);
printf("\n");
TEST("bookmark get updated to next item") {
quicklist *ql = quicklistNew(1, 0);
quicklistPushTail(ql, "1", 1);
quicklistPushTail(ql, "2", 1);
quicklistPushTail(ql, "3", 1);
quicklistPushTail(ql, "4", 1);
quicklistPushTail(ql, "5", 1);
assert(ql->len==5);
/* add two bookmarks, one pointing to the node before the last. */
assert(quicklistBookmarkCreate(&ql, "_dummy", ql->head->next));
assert(quicklistBookmarkCreate(&ql, "_test", ql->tail->prev));
/* test that the bookmark returns the right node, delete it and see that the bookmark points to the last node */
assert(quicklistBookmarkFind(ql, "_test") == ql->tail->prev);
assert(quicklistDelRange(ql, -2, 1));
assert(quicklistBookmarkFind(ql, "_test") == ql->tail);
/* delete the last node, and see that the bookmark was deleted. */
assert(quicklistDelRange(ql, -1, 1));
assert(quicklistBookmarkFind(ql, "_test") == NULL);
/* test that other bookmarks aren't affected */
assert(quicklistBookmarkFind(ql, "_dummy") == ql->head->next);
assert(quicklistBookmarkFind(ql, "_missing") == NULL);
assert(ql->len==3);
quicklistBookmarksClear(ql); /* for coverage */
assert(quicklistBookmarkFind(ql, "_dummy") == NULL);
quicklistRelease(ql);
}
TEST("bookmark limit") {
int i;
quicklist *ql = quicklistNew(1, 0);
quicklistPushHead(ql, "1", 1);
for (i=0; i<QL_MAX_BM; i++)
assert(quicklistBookmarkCreate(&ql, genstr("",i), ql->head));
/* when all bookmarks are used, creation fails */
assert(!quicklistBookmarkCreate(&ql, "_test", ql->head));
/* delete one and see that we can now create another */
assert(quicklistBookmarkDelete(ql, "0"));
assert(quicklistBookmarkCreate(&ql, "_test", ql->head));
/* delete one and see that the rest survive */
assert(quicklistBookmarkDelete(ql, "_test"));
for (i=1; i<QL_MAX_BM; i++)
assert(quicklistBookmarkFind(ql, genstr("",i)) == ql->head);
/* make sure the deleted ones are indeed gone */
assert(!quicklistBookmarkFind(ql, "0"));
assert(!quicklistBookmarkFind(ql, "_test"));
quicklistRelease(ql);
}
if (!err)
printf("ALL TESTS PASSED!\n");
else

View File

@ -28,6 +28,8 @@
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdint.h> // for UINTPTR_MAX
#ifndef __QUICKLIST_H__
#define __QUICKLIST_H__
@ -64,19 +66,51 @@ typedef struct quicklistLZF {
char compressed[];
} quicklistLZF;
/* Bookmarks are padded with realloc at the end of the quicklist struct.
* They should only be used for very big lists of thousands of nodes, where the
* excess memory usage is negligible, and there's a real need to iterate on them
* in portions.
* When not used, they don't add any memory overhead, but when used and then
* deleted, some overhead remains (to avoid resonance).
* The number of bookmarks used should be kept to a minimum since it also adds
* overhead on node deletion (searching for a bookmark to update). */
typedef struct quicklistBookmark {
quicklistNode *node;
char *name;
} quicklistBookmark;
#if UINTPTR_MAX == 0xffffffff
/* 32-bit */
# define QL_FILL_BITS 14
# define QL_COMP_BITS 14
# define QL_BM_BITS 4
#elif UINTPTR_MAX == 0xffffffffffffffff
/* 64-bit */
# define QL_FILL_BITS 16
# define QL_COMP_BITS 16
# define QL_BM_BITS 4 /* we could encode more, but we'd rather limit the user,
since bookmarks cause performance degradation. */
#else
# error unknown arch bits count
#endif
/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist.
* 'count' is the number of total entries.
* 'len' is the number of quicklist nodes.
* 'compress' is: -1 if compression disabled, otherwise it's the number
* of quicklistNodes to leave uncompressed at ends of quicklist.
* 'fill' is the user-requested (or default) fill factor. */
* 'fill' is the user-requested (or default) fill factor.
* 'bookmarks' are an optional feature that is realloc'ed onto the end of this
* struct, so that they don't consume memory when not used. */
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all ziplists */
unsigned long len; /* number of quicklistNodes */
int fill : 16; /* fill factor for individual nodes */
unsigned int compress : 16; /* depth of end nodes not to compress;0=off */
int fill : QL_FILL_BITS; /* fill factor for individual nodes */
unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
unsigned int bookmark_count: QL_BM_BITS;
quicklistBookmark bookmarks[];
} quicklist;
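
As a sanity check on the layout comment above, here is a small standalone sketch (an assumption about typical 64-bit ABIs, not a guarantee of the C standard): the three bit-fields (16+16+4 bits) pack into one 8-byte unit, the flexible bookmarks[] member adds nothing to sizeof, and at most (1 << QL_BM_BITS) - 1 = 15 bookmarks can be appended per quicklist.

#include <assert.h>
#include <stdio.h>
#include "quicklist.h"   /* the header shown in this diff */

int main(void) {
    /* Assumed for common 64-bit builds: 4 pointer/long fields (32 bytes) plus
     * the packed bit-fields keep the base struct at the documented 40 bytes. */
    printf("sizeof(quicklist) = %zu\n", sizeof(quicklist));
    printf("max bookmarks     = %d\n", (1 << QL_BM_BITS) - 1);
    assert(sizeof(quicklist) == 40);
    return 0;
}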
typedef struct quicklistIter {
@ -158,6 +192,12 @@ unsigned long quicklistCount(const quicklist *ql);
int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len);
size_t quicklistGetLzf(const quicklistNode *node, void **data);
/* bookmarks */
int quicklistBookmarkCreate(quicklist **ql_ref, const char *name, quicklistNode *node);
int quicklistBookmarkDelete(quicklist *ql, const char *name);
quicklistNode *quicklistBookmarkFind(quicklist *ql, const char *name);
void quicklistBookmarksClear(quicklist *ql);
#ifdef REDIS_TEST
int quicklistTest(int argc, char *argv[]);
#endif

View File

@ -209,5 +209,97 @@ start_server {tags {"defrag"}} {
assert {$digest eq $newdigest}
r save ;# saving an rdb iterates over all the data / pointers
} {OK}
test "Active defrag big list" {
r flushdb
r config resetstat
r config set save "" ;# prevent bgsave from interfering with save below
r config set hz 100
r config set activedefrag no
r config set active-defrag-max-scan-fields 1000
r config set active-defrag-threshold-lower 5
r config set active-defrag-cycle-min 65
r config set active-defrag-cycle-max 75
r config set active-defrag-ignore-bytes 2mb
r config set maxmemory 0
r config set list-max-ziplist-size 5 ;# list of 500k items will have 100k quicklist nodes
# create two big lists with 500k items each
set rd [redis_deferring_client]
set expected_frag 1.7
# add a mass of list nodes to two lists (allocations are interlaced)
set val [string repeat A 100] ;# 5 items of 100 bytes puts us in the 640 bytes bin, which has 32 regs, so high potential for fragmentation
for {set j 0} {$j < 500000} {incr j} {
$rd lpush biglist1 $val
$rd lpush biglist2 $val
}
for {set j 0} {$j < 500000} {incr j} {
$rd read ; # Discard replies
$rd read ; # Discard replies
}
# create some fragmentation
r del biglist2
# start defrag
after 120 ;# serverCron only updates the info once in 100ms
set frag [s allocator_frag_ratio]
if {$::verbose} {
puts "frag $frag"
}
assert {$frag >= $expected_frag}
r config set latency-monitor-threshold 5
r latency reset
set digest [r debug digest]
catch {r config set activedefrag yes} e
if {![string match {DISABLED*} $e]} {
# wait for the active defrag to start working (decision once a second)
wait_for_condition 50 100 {
[s active_defrag_running] ne 0
} else {
fail "defrag not started."
}
# wait for the active defrag to stop working
wait_for_condition 500 100 {
[s active_defrag_running] eq 0
} else {
after 120 ;# serverCron only updates the info once in 100ms
puts [r info memory]
puts [r info stats]
puts [r memory malloc-stats]
fail "defrag didn't stop."
}
# test that the fragmentation is lower
after 120 ;# serverCron only updates the info once in 100ms
set frag [s allocator_frag_ratio]
set max_latency 0
foreach event [r latency latest] {
lassign $event eventname time latency max
if {$eventname == "active-defrag-cycle"} {
set max_latency $max
}
}
if {$::verbose} {
puts "frag $frag"
puts "max latency $max_latency"
puts [r latency latest]
puts [r latency history active-defrag-cycle]
}
assert {$frag < 1.1}
# due to high fragmentation, 100hz, and active-defrag-cycle-max set to 75,
# we expect max latency to be not much higher than 7.5ms
assert {$max_latency <= 12}
}
# verify the data isn't corrupted or changed
set newdigest [r debug digest]
assert {$digest eq $newdigest}
r save ;# saving an rdb iterates over all the data / pointers
r del biglist1 ;# coverage for quicklistBookmarksClear
} {1}
}
}