Redis数据结构底层源码简析

声明:以下是基于Redis7.2进行源代码的分析

dict

结构体定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/* 一个dict结构由两个ht_table组成,方便来进行rehash */
struct dict {
dictType *type;

dictEntry **ht_table[2];
unsigned long ht_used[2];

long rehashidx; /* rehashing not in progress if rehashidx == -1 */

/* Keep small vars at end for optimal (minimal) struct padding */
int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */

void *metadata[]; /* An arbitrary number of bytes (starting at a
* pointer-aligned address) of size as defined
* by dictType's dictEntryBytes. */
};

struct dictEntry {
void *key;
union {
void *val;
uint64_t u64;
int64_t s64;
double d;
} v;
struct dictEntry *next; /* Next entry in the same hash bucket. */
void *metadata[]; /* An arbitrary number of bytes (starting at a
* pointer-aligned address) of size as returned
* by dictType's dictEntryMetadataBytes(). */
};

操作

dictCreate(初始化)

  • 初始化时不分配内存

    1
    2
    3
    4
    5
    6
    7
    /* Reset hash table parameters already initialized with _dictInit()*/
    static void _dictReset(dict *d, int htidx)
    {
    d->ht_table[htidx] = NULL;
    d->ht_size_exp[htidx] = -1;
    d->ht_used[htidx] = 0;
    }
  • 初始化会初始化两个dictht数组

    1
    2
    3
    4
    5
    6
    7
    8
    9
    int _dictInit(dict *d, dictType *type)
    {
    _dictReset(d, 0);
    _dictReset(d, 1);
    d->type = type;
    d->rehashidx = -1;
    d->pauserehash = 0;
    return DICT_OK;
    }

dictAdd

resize(扩容)

插入时如何判断什么时候进行resize?

  • 允许自动扩容:已插入的元素数量/容量 >= 1
  • 手动扩容:已插入元素数量/容量 >= threshold(默认为5)
  • 扩容后扩为原来内存的2倍
  • 扩容后标志rehash位,表示开始rehash了
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* 1. 寻找key应该被插入的位置的函数 */
void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) {
unsigned long idx, table;
dictEntry *he;
uint64_t hash = dictHashKey(d, key);
if (existing) *existing = NULL;
/* 判断是否正在进行rehashing */
if (dictIsRehashing(d)) _dictRehashStep(d);
/* 判断是否需要扩容 */
if (_dictExpandIfNeeded(d) == DICT_ERR)
return NULL;
...
}

/* 2. 判断是否需要扩容函数 */
static int _dictExpandIfNeeded(dict *d) {
...
/* 初始化时依据ht_size_exp的值进行初始化 */
if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

/* If we reached the 1:1 ratio, and we are allowed to resize the hash
* table (global setting) or we should avoid it but the ratio between
* elements/buckets is over the "safe" threshold, we resize doubling
* the number of buckets. */
if ((dict_can_resize == DICT_RESIZE_ENABLE &&
d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0])) ||
(dict_can_resize != DICT_RESIZE_FORBID &&
d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio))
{
...
return dictExpand(d, d->ht_used[0] + 1);
}
}
/* 3. 扩容函数 */
/* 第一次扩容 */
dictExpand(d, d->ht_used[0] + 1);
/* 函数 */
int _dictExpand(dict *d, unsigned long size, int* malloc_failed) {
/* new_ht_size_exp = 3 */
/* 扩容后为2的倍数:size = 5 扩容函数:return 8*sizeof(long) - __builtin_clzl(size-1); */
signed char new_ht_size_exp = _dictNextExp(size);
/* newsize = 8 */
size_t newsize = 1ul<<new_ht_size_exp;
...
/* 初始化新的table */
new_ht_table = zcalloc(newsize*sizeof(dictEntry*));
new_ht_used = 0;
...
/* Prepare a second hash table for incremental rehashing */
d->ht_size_exp[1] = new_ht_size_exp;
d->ht_used[1] = new_ht_used;
d->ht_table[1] = new_ht_table;
/* 扩容后会将rehashidx设为0,表示开始进行rehash了*/
d->rehashidx = 0;
return DICT_OK;
}

rehash

渐进式hash:每个读写操作都会判断是否需要进行rehash

什么时候进行rehash?:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/* rehash操作 */
if (dictIsRehashing(d)) _dictRehashStep(d);
/* 移动元素的操作 */
d->ht_table[1][h] = de;
d->ht_used[0]--;
d->ht_used[1]++;
de = nextde;
}
d->ht_table[0][d->rehashidx] = NULL;
d->rehashidx++;
/* 移动结束,切换指针 */
/* Check if we already rehashed the whole table... */
if (d->ht_used[0] == 0) {
zfree(d->ht_table[0]);
/* Copy the new ht onto the old one */
d->ht_table[0] = d->ht_table[1];
d->ht_used[0] = d->ht_used[1];
d->ht_size_exp[0] = d->ht_size_exp[1];
_dictReset(d, 1);
d->rehashidx = -1;
return 0;
}

判断key是否存在

什么时候会返回当前key?什么时候把key插入到当前entry的链表中?

1
2
3
4
5
6
7
8
9
10
/* Search if this slot does not already contain the given key */
void *he_key = dictGetKey(he);
/* 如果key == he_key,则判断existing是否为空,如果不为空,赋值*existing,调用方可以通过*existing获取当前节点*/
/* 如果key != he_key, 则通过next指针进行往下找*/
if (key == he_key || dictCompareKeys(d, key, he_key)) {
if (existing) *existing = he;
return NULL;
}
he = dictGetNext(he);

SDS

Redis中一般的key都会采用SDS进行存储,SDS会根据key的长度来选择5种header来一种来存储,达到优化内存的效果

3种编码

  • int编码 【内存连续存储】:元数据(8字节)、int
  • embStr编码【内存连续存储】:元数据(8字节)、header
  • raw编码 【前两项连续存储】:元数据(8字节)、*ptr(8字节)、header

5种header

  • sdshdr5

    1
    2
    3
    4
    struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
    };
  • sdshdr8

    1
    2
    3
    4
    5
    6
    struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
    };
  • sdshdr16

    1
    2
    3
    4
    5
    6
    struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
    };
  • sdshdr32

    1
    2
    3
    4
    5
    6
    struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
    };
  • sdshdr64

    1
    2
    3
    4
    5
    6
    struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
    };

Robj

因为Redis的value会采用不同的数据结构来存储,因此抽象出了Redis Object来表示不同的数据结构

结构体定义

1
2
3
4
5
6
7
8
9
struct redisObject {
unsigned type:4;
unsigned encoding:4;
unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
* LFU data (least significant 8 bits frequency
* and most significant 16 bits access time). */
int refcount;
void *ptr;
};

Type

代表各种数据类型。包含5种,分别如下:

1
2
3
4
5
6
/* The actual Redis Object */
#define OBJ_STRING 0 /* String object. */
#define OBJ_LIST 1 /* List object. */
#define OBJ_SET 2 /* Set object. */
#define OBJ_ZSET 3 /* Sorted set object. */
#define OBJ_HASH 4 /* Hash object. */

Encoding

代表底层数据结构

常用的包含:SDS(RAW、INT、EMBSTR)、INTSET、HT(hash table)、QUICKLIST、LISTPACK(同ziplist)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/* Objects encoding. Some kind of objects like Strings and Hashes can be
* internally represented in multiple ways. The 'encoding' field of the object
* is set to one of this fields for this object. */
#define OBJ_ENCODING_RAW 0 /* Raw representation */
#define OBJ_ENCODING_INT 1 /* Encoded as integer */
#define OBJ_ENCODING_HT 2 /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3 /* No longer used: old hash encoding. */
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* No longer used: old list/hash/zset encoding. */
#define OBJ_ENCODING_INTSET 6 /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7 /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8 /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
#define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */

Ziplist

压缩列表是一块连续的内存,每一项都存在前后连续的地址空间里

压缩列表采用变长的编码。

​ 如果是小整数则采用少一些字节来存储,如果是大整数则采用大的字节来存储

数据结构定义

从宏观上看,ziplist的内存结构如下:

1
<zlbytes><zltail><zllen><entry>...<entry><zlend>

zlbytes:压缩列表所占用的字节数(包含zlbytes的字节书)

zltail:最后一项entry在压缩列表中偏移量,方便在尾端执行pop和push操作

zllen:压缩列表中元素的个数

zlend:一个字节,结束标记,值固定等于255

每一个数据项<entry>的构成:

1
<prevrawlen><len><encoding><data>

<prevrawlen>:1个字节或5个字节,前一个元素的长度

<len>:4个字节,当前元素的长度

<encoding>:1个字节

<data>:具体的数据

应用

hash:当元素较少的时候,hash结构会采用listpacks进行存储,元素较多会采用dict

quicklist:底层基于ziplist组合起来的的双向链表

sortedset:当元素较少时底层采用ziplist进行存储,元素较多时采用dict+skiplist

Quicklist

源代码描述:quicklist.c - A doubly linked list of listpacks

quicklist是一个listpacks(压缩列表)的双向链表,包含一些复杂的压缩算法、和listpacks的大小的规定来中和quicklist的性能和占用空间

结构体定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist */
typedef struct quicklist {
quicklistNode *head;
quicklistNode *tail;
unsigned long count; /* total count of all entries in all listpacks */
unsigned long len; /* number of quicklistNodes */
signed int fill : QL_FILL_BITS; /* fill factor for individual nodes */
unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
unsigned int bookmark_count: QL_BM_BITS;
quicklistBookmark bookmarks[];
} quicklist;

/* quicklistNode is a 32 byte struct describing a listpack for a quicklist */
typedef struct quicklistNode {
struct quicklistNode *prev;
struct quicklistNode *next;
unsigned char *entry;
size_t sz; /* entry size in bytes */
unsigned int count : 16; /* count of items in listpack */
unsigned int encoding : 2; /* RAW==1 or LZF==2 */
unsigned int container : 2; /* PLAIN==1 or PACKED==2 */
unsigned int recompress : 1; /* was this node previous compressed? */
unsigned int attempted_compress : 1; /* node can't compress; too small */
unsigned int dont_compress : 1; /* prevent compression of entry that will be used later */
unsigned int extra : 9; /* more bits to steal for future usage */
} quicklistNode;

Create

1
2
3
4
5
6
7
8
9
10
11
12
quicklist *quicklistCreate(void) {
struct quicklist *quicklist;

quicklist = zmalloc(sizeof(*quicklist));
quicklist->head = quicklist->tail = NULL;
quicklist->len = 0;
quicklist->count = 0;
quicklist->compress = 0;
quicklist->fill = -2;
quicklist->bookmark_count = 0;
return quicklist;
}

PUSH(插入)

根据条件来判断是插入到某个ziplist(quicklistNodeUpdateSz())中还是新建一个quicklistnode

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/* Wrapper to allow argument-based switching between HEAD/TAIL pop */
void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
int where) {
/* The head and tail should never be compressed (we don't attempt to decompress them) */
if (quicklist->head)
assert(quicklist->head->encoding != QUICKLIST_NODE_ENCODING_LZF);
if (quicklist->tail)
assert(quicklist->tail->encoding != QUICKLIST_NODE_ENCODING_LZF);

if (where == QUICKLIST_HEAD) {
quicklistPushHead(quicklist, value, sz);
} else if (where == QUICKLIST_TAIL) {
quicklistPushTail(quicklist, value, sz);
}
}

/* Add new entry to head node of quicklist.
*
* Returns 0 if used existing head.
* Returns 1 if new head created. */
int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) {
quicklistNode *orig_head = quicklist->head;

if (unlikely(isLargeElement(sz))) {
__quicklistInsertPlainNode(quicklist, quicklist->head, value, sz, 0);
return 1;
}

if (likely(
_quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) {
quicklist->head->entry = lpPrepend(quicklist->head->entry, value, sz);
quicklistNodeUpdateSz(quicklist->head);
} else {
quicklistNode *node = quicklistCreateNode();
node->entry = lpPrepend(lpNew(0), value, sz);

quicklistNodeUpdateSz(node);
_quicklistInsertNodeBefore(quicklist, quicklist->head, node);
}
quicklist->count++;
quicklist->head->count++;
return (orig_head != quicklist->head);
}

Intset

为了实现集合(set)这种数据结构,设计了intset数据结构,元素是有序不重复的,并且包含了一些并集、交集、差集的操作

intset:当set存储的元素较少

dict:元素较多时、插入字符串

元素少or多根据系统参数来进行控制以达到最好的性能

是一块连续的存储空间

针对于大整数、小整数采用不通的编码来优化存储空间,如果编码变了,则之前的也需要统一调整编码

结构体定义

1
2
3
4
5
6
7
8
9
10
11
typedef struct intset {
uint32_t encoding;
uint32_t length;
int8_t contents[];
} intset;

/* Note that these encodings are ordered, so:
* INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))

intset与ziplist相比

  • inset只能存储整数、ziplist可以存储任意字符

  • inset元素是不重复、有序的,ziplist是无序的

  • inset会根据元素的大小调整统一编码,ziplist可以采用变长编码

集合的计算

交集

第一步:检查各个集合,对于不存在的集合当做空集来处理。一旦出现空集,则不用继续计算了,最终的交集就是空集。

第二步:按照集合元素个数进行排序

第三步:遍历最小的集合,拿每一个元素跟其他的集合进行寻找,如果其他集合存在,则插入到结果集合中

差集

第一种:将第一个集合中元素存到中间集合中,遍历其他集合,遇到存在的则删除,时间复杂度O(N)

第二种:遍历其他集合,只有所有集合中都找不到的元素,才加入到最后集合中

并集

遍历所有集合,将所有元素都添加到最终集合中

Skiplist

sorted set是基于ziplist、dict+skiplist构建的

skiplist时如何建立的

  • 每插入一个节点,根据算法算出来一个随机层数

  • 每次插入和删除的过程,首先根据skiplist数据结构找到对应的位置,然后通过改变指针的方式,来实现插入和删除

  • 随机层数计算算法:

1
2
3
4
5
6
randomLevel()
level := 1
// random()返回一个[0...1)的随机数
while random() < p and level < MaxLevel do
level := level + 1
return level

skiplist和平衡数的比较

  • 插入的性能:平衡树每次的插入和删除涉及到树结构的变更,skiplist不涉及
  • 占用空间:skiplist内存占用更少
  • 范围查找性能:平衡数找到最小值后,找最大值还是比较麻烦的,skiplist找到最小值后,直接根据第一层来往后找就行

Sorted Set底层结构

ziplist、zset(dict+skiplist)

  • 当元素较少时(可通过参数来控制),会采用ziplist进行存储
  • 当元素较多时,数据会存在dict中,分数会存在skiplist中

总结

  1. redis底层是用dict来进行存储dictEntry的
  • *key(8个字节) -> RedisObject

  • *value(8个字节)-> RedisObject

  • *next(8个字节)-> dictEntry

  1. Redis所有的类型的数据都是采用RedisObject来进行存储的
  • 元数据(8个字节)<encoding、refcount(访问次数)、lru(最后一次访问时间)、type>

  • ptr* (8个字节、针对于SDS有优化)


参考链接:

Redis内部数据结构详解


Redis数据结构源码简析
http://example.com/2024/12/30/2025-01-15-Redis数据结构源码分析/
作者
wyx-98
发布于
2024年12月30日
许可协议