redict/src/rdb.h

164 lines
7.2 KiB
C
Raw Normal View History

/*
* Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
2015-07-27 03:41:48 -04:00
#ifndef __RDB_H
#define __RDB_H
#include <stdio.h>
#include "rio.h"
/* TBD: include only necessary headers. */
#include "server.h"
/* The current RDB version. When the format changes in a way that is no longer
* backward compatible this number gets incremented. */
2018-03-16 08:48:44 -04:00
#define RDB_VERSION 9
2011-05-13 17:24:19 -04:00
/* Defines related to the dump file format. To store 32 bits lengths for short
* keys requires a lot of space, so we check the most significant 2 bits of
* the first byte to interpreter the length:
*
* 00|XXXXXX => if the two MSB are 00 the len is the 6 bits of this byte
* 01|XXXXXX XXXXXXXX => 01, the len is 14 byes, 6 bits + 8 bits of next byte
* 10|000000 [32 bit integer] => A full 32 bit len in net byte order will follow
* 10|000001 [64 bit integer] => A full 64 bit len in net byte order will follow
* 11|OBKIND this means: specially encoded object will follow. The six bits
2011-05-13 17:24:19 -04:00
* number specify the kind of object that follows.
2015-07-27 03:41:48 -04:00
* See the RDB_ENC_* defines.
2011-05-13 17:24:19 -04:00
*
2013-01-16 12:00:20 -05:00
* Lengths up to 63 are stored using a single byte, most DB keys, and may
2011-05-13 17:24:19 -04:00
* values, will fit inside. */
2015-07-27 03:41:48 -04:00
#define RDB_6BITLEN 0
#define RDB_14BITLEN 1
#define RDB_32BITLEN 0x80
#define RDB_64BITLEN 0x81
2015-07-27 03:41:48 -04:00
#define RDB_ENCVAL 3
2016-06-01 14:18:28 -04:00
#define RDB_LENERR UINT64_MAX
2011-05-13 17:24:19 -04:00
/* When a length of a string object stored on disk has the first two bits
* set, the remaining six bits specify a special encoding for the object
2011-05-13 17:24:19 -04:00
* accordingly to the following defines: */
2015-07-27 03:41:48 -04:00
#define RDB_ENC_INT8 0 /* 8 bit signed integer */
#define RDB_ENC_INT16 1 /* 16 bit signed integer */
#define RDB_ENC_INT32 2 /* 32 bit signed integer */
#define RDB_ENC_LZF 3 /* string compressed with FASTLZ */
2011-05-13 17:24:19 -04:00
/* Map object types to RDB object types. Macros starting with OBJ_ are for
* memory storage and may change. Instead RDB types must be fixed because
* we store them on disk. */
2015-07-27 03:41:48 -04:00
#define RDB_TYPE_STRING 0
#define RDB_TYPE_LIST 1
#define RDB_TYPE_SET 2
#define RDB_TYPE_ZSET 3
#define RDB_TYPE_HASH 4
#define RDB_TYPE_ZSET_2 5 /* ZSET version 2 with doubles stored in binary. */
#define RDB_TYPE_MODULE 6
RDB modules values serialization format version 2. The original RDB serialization format was not parsable without the module loaded, becuase the structure was managed only by the module itself. Moreover RDB is a streaming protocol in the sense that it is both produce di an append-only fashion, and is also sometimes directly sent to the socket (in the case of diskless replication). The fact that modules values cannot be parsed without the relevant module loaded is a problem in many ways: RDB checking tools must have loaded modules even for doing things not involving the value at all, like splitting an RDB into N RDBs by key or alike, or just checking the RDB for sanity. In theory module values could be just a blob of data with a prefixed length in order for us to be able to skip it. However prefixing the values with a length would mean one of the following: 1. To be able to write some data at a previous offset. This breaks stremaing. 2. To bufferize values before outputting them. This breaks performances. 3. To have some chunked RDB output format. This breaks simplicity. Moreover, the above solution, still makes module values a totally opaque matter, with the fowllowing problems: 1. The RDB check tool can just skip the value without being able to at least check the general structure. For datasets composed mostly of modules values this means to just check the outer level of the RDB not actually doing any checko on most of the data itself. 2. It is not possible to do any recovering or processing of data for which a module no longer exists in the future, or is unknown. So this commit implements a different solution. The modules RDB serialization API is composed if well defined calls to store integers, floats, doubles or strings. After this commit, the parts generated by the module API have a one-byte prefix for each of the above emitted parts, and there is a final EOF byte as well. So even if we don't know exactly how to interpret a module value, we can always parse it at an high level, check the overall structure, understand the types used to store the information, and easily skip the whole value. The change is backward compatible: older RDB files can be still loaded since the new encoding has a new RDB type: MODULE_2 (of value 7). The commit also implements the ability to check RDB files for sanity taking advantage of the new feature.
2017-06-27 07:09:33 -04:00
#define RDB_TYPE_MODULE_2 7 /* Module value with annotations for parsing without
the generating module being loaded. */
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
/* Object types for encoded objects. */
2015-07-27 03:41:48 -04:00
#define RDB_TYPE_HASH_ZIPMAP 9
#define RDB_TYPE_LIST_ZIPLIST 10
#define RDB_TYPE_SET_INTSET 11
#define RDB_TYPE_ZSET_ZIPLIST 12
#define RDB_TYPE_HASH_ZIPLIST 13
#define RDB_TYPE_LIST_QUICKLIST 14
#define RDB_TYPE_STREAM_LISTPACKS 15
/* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */
/* Test if a type is an object type. */
#define rdbIsObjectType(t) ((t >= 0 && t <= 7) || (t >= 9 && t <= 15))
/* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). */
#define RDB_OPCODE_MODULE_AUX 247 /* Module auxiliary data. */
#define RDB_OPCODE_IDLE 248 /* LRU idle time. */
#define RDB_OPCODE_FREQ 249 /* LFU frequency. */
#define RDB_OPCODE_AUX 250 /* RDB aux field. */
#define RDB_OPCODE_RESIZEDB 251 /* Hash table resize hint. */
#define RDB_OPCODE_EXPIRETIME_MS 252 /* Expire time in milliseconds. */
#define RDB_OPCODE_EXPIRETIME 253 /* Old expire time in seconds. */
#define RDB_OPCODE_SELECTDB 254 /* DB number of the following keys. */
#define RDB_OPCODE_EOF 255 /* End of the RDB file. */
RDB modules values serialization format version 2. The original RDB serialization format was not parsable without the module loaded, becuase the structure was managed only by the module itself. Moreover RDB is a streaming protocol in the sense that it is both produce di an append-only fashion, and is also sometimes directly sent to the socket (in the case of diskless replication). The fact that modules values cannot be parsed without the relevant module loaded is a problem in many ways: RDB checking tools must have loaded modules even for doing things not involving the value at all, like splitting an RDB into N RDBs by key or alike, or just checking the RDB for sanity. In theory module values could be just a blob of data with a prefixed length in order for us to be able to skip it. However prefixing the values with a length would mean one of the following: 1. To be able to write some data at a previous offset. This breaks stremaing. 2. To bufferize values before outputting them. This breaks performances. 3. To have some chunked RDB output format. This breaks simplicity. Moreover, the above solution, still makes module values a totally opaque matter, with the fowllowing problems: 1. The RDB check tool can just skip the value without being able to at least check the general structure. For datasets composed mostly of modules values this means to just check the outer level of the RDB not actually doing any checko on most of the data itself. 2. It is not possible to do any recovering or processing of data for which a module no longer exists in the future, or is unknown. So this commit implements a different solution. The modules RDB serialization API is composed if well defined calls to store integers, floats, doubles or strings. After this commit, the parts generated by the module API have a one-byte prefix for each of the above emitted parts, and there is a final EOF byte as well. So even if we don't know exactly how to interpret a module value, we can always parse it at an high level, check the overall structure, understand the types used to store the information, and easily skip the whole value. The change is backward compatible: older RDB files can be still loaded since the new encoding has a new RDB type: MODULE_2 (of value 7). The commit also implements the ability to check RDB files for sanity taking advantage of the new feature.
2017-06-27 07:09:33 -04:00
/* Module serialized values sub opcodes */
#define RDB_MODULE_OPCODE_EOF 0 /* End of module value. */
#define RDB_MODULE_OPCODE_SINT 1 /* Signed integer. */
#define RDB_MODULE_OPCODE_UINT 2 /* Unsigned integer. */
#define RDB_MODULE_OPCODE_FLOAT 3 /* Float. */
#define RDB_MODULE_OPCODE_DOUBLE 4 /* Double. */
#define RDB_MODULE_OPCODE_STRING 5 /* String. */
/* rdbLoad...() functions flags. */
#define RDB_LOAD_NONE 0
#define RDB_LOAD_ENC (1<<0)
#define RDB_LOAD_PLAIN (1<<1)
#define RDB_LOAD_SDS (1<<2)
/* flags on the purpose of rdb save or load */
#define RDBFLAGS_NONE 0
#define RDBFLAGS_AOF_PREAMBLE (1<<0)
#define RDBFLAGS_REPLICATION (1<<1)
2016-08-09 05:07:32 -04:00
2011-05-13 17:24:19 -04:00
int rdbSaveType(rio *rdb, unsigned char type);
int rdbLoadType(rio *rdb);
int rdbSaveTime(rio *rdb, time_t t);
time_t rdbLoadTime(rio *rdb);
int rdbSaveLen(rio *rdb, uint64_t len);
int rdbSaveMillisecondTime(rio *rdb, long long t);
long long rdbLoadMillisecondTime(rio *rdb, int rdbver);
uint64_t rdbLoadLen(rio *rdb, int *isencoded);
int rdbLoadLenByRef(rio *rdb, int *isencoded, uint64_t *lenptr);
2011-05-13 17:24:19 -04:00
int rdbSaveObjectType(rio *rdb, robj *o);
int rdbLoadObjectType(rio *rdb);
int rdbLoad(char *filename, rdbSaveInfo *rsi, int rdbflags);
PSYNC2: different improvements to Redis replication. The gist of the changes is that now, partial resynchronizations between slaves and masters (without the need of a full resync with RDB transfer and so forth), work in a number of cases when it was impossible in the past. For instance: 1. When a slave is promoted to mastrer, the slaves of the old master can partially resynchronize with the new master. 2. Chained slalves (slaves of slaves) can be moved to replicate to other slaves or the master itsef, without requiring a full resync. 3. The master itself, after being turned into a slave, is able to partially resynchronize with the new master, when it joins replication again. In order to obtain this, the following main changes were operated: * Slaves also take a replication backlog, not just masters. * Same stream replication for all the slaves and sub slaves. The replication stream is identical from the top level master to its slaves and is also the same from the slaves to their sub-slaves and so forth. This means that if a slave is later promoted to master, it has the same replication backlong, and can partially resynchronize with its slaves (that were previously slaves of the old master). * A given replication history is no longer identified by the `runid` of a Redis node. There is instead a `replication ID` which changes every time the instance has a new history no longer coherent with the past one. So, for example, slaves publish the same replication history of their master, however when they are turned into masters, they publish a new replication ID, but still remember the old ID, so that they are able to partially resynchronize with slaves of the old master (up to a given offset). * The replication protocol was slightly modified so that a new extended +CONTINUE reply from the master is able to inform the slave of a replication ID change. * REPLCONF CAPA is used in order to notify masters that a slave is able to understand the new +CONTINUE reply. * The RDB file was extended with an auxiliary field that is able to select a given DB after loading in the slave, so that the slave can continue receiving the replication stream from the point it was disconnected without requiring the master to insert "SELECT" statements. This is useful in order to guarantee the "same stream" property, because the slave must be able to accumulate an identical backlog. * Slave pings to sub-slaves are now sent in a special form, when the top-level master is disconnected, in order to don't interfer with the replication stream. We just use out of band "\n" bytes as in other parts of the Redis protocol. An old design document is available here: https://gist.github.com/antirez/ae068f95c0d084891305 However the implementation is not identical to the description because during the work to implement it, different changes were needed in order to make things working well.
2016-11-09 05:31:06 -05:00
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi);
int rdbSaveToSlavesSockets(rdbSaveInfo *rsi);
void rdbRemoveTempFile(pid_t childpid);
PSYNC2: different improvements to Redis replication. The gist of the changes is that now, partial resynchronizations between slaves and masters (without the need of a full resync with RDB transfer and so forth), work in a number of cases when it was impossible in the past. For instance: 1. When a slave is promoted to mastrer, the slaves of the old master can partially resynchronize with the new master. 2. Chained slalves (slaves of slaves) can be moved to replicate to other slaves or the master itsef, without requiring a full resync. 3. The master itself, after being turned into a slave, is able to partially resynchronize with the new master, when it joins replication again. In order to obtain this, the following main changes were operated: * Slaves also take a replication backlog, not just masters. * Same stream replication for all the slaves and sub slaves. The replication stream is identical from the top level master to its slaves and is also the same from the slaves to their sub-slaves and so forth. This means that if a slave is later promoted to master, it has the same replication backlong, and can partially resynchronize with its slaves (that were previously slaves of the old master). * A given replication history is no longer identified by the `runid` of a Redis node. There is instead a `replication ID` which changes every time the instance has a new history no longer coherent with the past one. So, for example, slaves publish the same replication history of their master, however when they are turned into masters, they publish a new replication ID, but still remember the old ID, so that they are able to partially resynchronize with slaves of the old master (up to a given offset). * The replication protocol was slightly modified so that a new extended +CONTINUE reply from the master is able to inform the slave of a replication ID change. * REPLCONF CAPA is used in order to notify masters that a slave is able to understand the new +CONTINUE reply. * The RDB file was extended with an auxiliary field that is able to select a given DB after loading in the slave, so that the slave can continue receiving the replication stream from the point it was disconnected without requiring the master to insert "SELECT" statements. This is useful in order to guarantee the "same stream" property, because the slave must be able to accumulate an identical backlog. * Slave pings to sub-slaves are now sent in a special form, when the top-level master is disconnected, in order to don't interfer with the replication stream. We just use out of band "\n" bytes as in other parts of the Redis protocol. An old design document is available here: https://gist.github.com/antirez/ae068f95c0d084891305 However the implementation is not identical to the description because during the work to implement it, different changes were needed in order to make things working well.
2016-11-09 05:31:06 -05:00
int rdbSave(char *filename, rdbSaveInfo *rsi);
2016-11-30 14:47:02 -05:00
ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key);
size_t rdbSavedObjectLen(robj *o);
2016-11-30 14:47:02 -05:00
robj *rdbLoadObject(int type, rio *rdb, robj *key);
void backgroundSaveDoneHandler(int exitcode, int bysignal);
int rdbSaveKeyValuePair(rio *rdb, robj *key, robj *val, long long expiretime);
ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt);
robj *rdbLoadStringObject(rio *rdb);
ssize_t rdbSaveStringObject(rio *rdb, robj *obj);
ssize_t rdbSaveRawString(rio *rdb, unsigned char *s, size_t len);
void *rdbGenericLoadStringObject(rio *rdb, int flags, size_t *lenptr);
int rdbSaveBinaryDoubleValue(rio *rdb, double val);
int rdbLoadBinaryDoubleValue(rio *rdb, double *val);
int rdbSaveBinaryFloatValue(rio *rdb, float val);
int rdbLoadBinaryFloatValue(rio *rdb, float *val);
int rdbLoadRio(rio *rdb, int rdbflags, rdbSaveInfo *rsi);
int rdbSaveRio(rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi);
rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi);
#endif