Redis数据结构底层源码简析

声明：以下是基于Redis7.2进行源代码的分析

dict

结构体定义

/* 一个dict结构由两个ht_table组成，方便来进行rehash */
struct dict {
    dictType *type;

    dictEntry **ht_table[2];
    unsigned long ht_used[2];

    long rehashidx; /* rehashing not in progress if rehashidx == -1 */

    /* Keep small vars at end for optimal (minimal) struct padding */
    int16_t pauserehash; /* If >0 rehashing is paused (<0 indicates coding error) */
    signed char ht_size_exp[2]; /* exponent of size. (size = 1<<exp) */

    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as defined
                                 * by dictType's dictEntryBytes. */
};

struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;     /* Next entry in the same hash bucket. */
    void *metadata[];           /* An arbitrary number of bytes (starting at a
                                 * pointer-aligned address) of size as returned
                                 * by dictType's dictEntryMetadataBytes(). */
};

操作

dictCreate（初始化）

初始化时不分配内存

/* Reset hash table parameters already initialized with _dictInit()*/
static void _dictReset(dict *d, int htidx)
{
    d->ht_table[htidx] = NULL;
    d->ht_size_exp[htidx] = -1;
    d->ht_used[htidx] = 0;
}

初始化会初始化两个dictht数组

int _dictInit(dict *d, dictType *type)
{
    _dictReset(d, 0);
    _dictReset(d, 1);
    d->type = type;
    d->rehashidx = -1;
    d->pauserehash = 0;
    return DICT_OK;
}

dictAdd

resize（扩容）

插入时如何判断什么时候进行resize？

允许自动扩容：已插入的元素数量/容量 >= 1
手动扩容：已插入元素数量/容量 >= threshold（默认为5）
扩容后扩为原来内存的2倍
扩容后标志rehash位，表示开始rehash了

/* 1. 寻找key应该被插入的位置的函数 */
void *dictFindPositionForInsert(dict *d, const void *key, dictEntry **existing) {
    unsigned long idx, table;
    dictEntry *he;
    uint64_t hash = dictHashKey(d, key);
    if (existing) *existing = NULL;
    /* 判断是否正在进行rehashing */
    if (dictIsRehashing(d)) _dictRehashStep(d);
    /* 判断是否需要扩容 */
    if (_dictExpandIfNeeded(d) == DICT_ERR)
        return NULL;
    ...
}

/* 2. 判断是否需要扩容函数 */
static int _dictExpandIfNeeded(dict *d) {
    ...
    /* 初始化时依据ht_size_exp的值进行初始化 */
    if (DICTHT_SIZE(d->ht_size_exp[0]) == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if ((dict_can_resize == DICT_RESIZE_ENABLE &&
         d->ht_used[0] >= DICTHT_SIZE(d->ht_size_exp[0])) ||
        (dict_can_resize != DICT_RESIZE_FORBID &&
         d->ht_used[0] / DICTHT_SIZE(d->ht_size_exp[0]) > dict_force_resize_ratio))
    {
        ...
        return dictExpand(d, d->ht_used[0] + 1);
    }
}
/* 3. 扩容函数 */
/* 第一次扩容 */
dictExpand(d, d->ht_used[0] + 1);
/* 函数 */
int _dictExpand(dict *d, unsigned long size, int* malloc_failed) {
    /* new_ht_size_exp = 3 */
    /* 扩容后为2的倍数：size = 5 扩容函数：return 8*sizeof(long) - __builtin_clzl(size-1); */
    signed char new_ht_size_exp = _dictNextExp(size);
    /* newsize = 8 */
    size_t newsize = 1ul<<new_ht_size_exp;
    ...
    /* 初始化新的table */
    new_ht_table = zcalloc(newsize*sizeof(dictEntry*));
    new_ht_used = 0;
    ...
    /* Prepare a second hash table for incremental rehashing */
    d->ht_size_exp[1] = new_ht_size_exp;
    d->ht_used[1] = new_ht_used;
    d->ht_table[1] = new_ht_table;
  	/* 扩容后会将rehashidx设为0，表示开始进行rehash了*/
    d->rehashidx = 0;
    return DICT_OK;
}

rehash

渐进式hash：每个读写操作都会判断是否需要进行rehash

什么时候进行rehash？：

/* rehash操作 */
if (dictIsRehashing(d)) _dictRehashStep(d);
/* 移动元素的操作 */
d->ht_table[1][h] = de;
            d->ht_used[0]--;
            d->ht_used[1]++;
            de = nextde;
        }
        d->ht_table[0][d->rehashidx] = NULL;
        d->rehashidx++;
/* 移动结束，切换指针 */
/* Check if we already rehashed the whole table... */
    if (d->ht_used[0] == 0) {
        zfree(d->ht_table[0]);
        /* Copy the new ht onto the old one */
        d->ht_table[0] = d->ht_table[1];
        d->ht_used[0] = d->ht_used[1];
        d->ht_size_exp[0] = d->ht_size_exp[1];
        _dictReset(d, 1);
        d->rehashidx = -1;
        return 0;
    }

判断key是否存在

什么时候会返回当前key？什么时候把key插入到当前entry的链表中？

/* Search if this slot does not already contain the given key */
void *he_key = dictGetKey(he);
						/* 如果key == he_key，则判断existing是否为空，如果不为空，赋值*existing，调用方可以通过*existing获取当前节点*/
            /* 如果key != he_key, 则通过next指针进行往下找*/
            if (key == he_key || dictCompareKeys(d, key, he_key)) {
                if (existing) *existing = he;
                return NULL;
            }
            he = dictGetNext(he);

SDS

Redis中一般的key都会采用SDS进行存储，SDS会根据key的长度来选择5种header来一种来存储，达到优化内存的效果

3种编码

int编码【内存连续存储】：元数据（8字节）、int
embStr编码【内存连续存储】：元数据（8字节）、header
raw编码【前两项连续存储】：元数据（8字节）、*ptr（8字节）、header

sdshdr5

struct __attribute__ ((__packed__)) sdshdr5 {
    unsigned char flags; /* 3 lsb of type, and 5 msb of string length */
    char buf[];
};

sdshdr8

struct __attribute__ ((__packed__)) sdshdr8 {
    uint8_t len; /* used */
    uint8_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

sdshdr16

struct __attribute__ ((__packed__)) sdshdr16 {
    uint16_t len; /* used */
    uint16_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

sdshdr32

struct __attribute__ ((__packed__)) sdshdr32 {
    uint32_t len; /* used */
    uint32_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

sdshdr64

struct __attribute__ ((__packed__)) sdshdr64 {
    uint64_t len; /* used */
    uint64_t alloc; /* excluding the header and null terminator */
    unsigned char flags; /* 3 lsb of type, 5 unused bits */
    char buf[];
};

Robj

因为Redis的value会采用不同的数据结构来存储，因此抽象出了Redis Object来表示不同的数据结构

结构体定义

struct redisObject {
    unsigned type:4;
    unsigned encoding:4;
    unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or
                            * LFU data (least significant 8 bits frequency
                            * and most significant 16 bits access time). */
    int refcount;
    void *ptr;
};

Type

代表各种数据类型。包含5种，分别如下：

/* The actual Redis Object */
#define OBJ_STRING 0    /* String object. */
#define OBJ_LIST 1      /* List object. */
#define OBJ_SET 2       /* Set object. */
#define OBJ_ZSET 3      /* Sorted set object. */
#define OBJ_HASH 4      /* Hash object. */

Encoding

代表底层数据结构

常用的包含：SDS（RAW、INT、EMBSTR）、INTSET、HT（hash table）、QUICKLIST、LISTPACK（同ziplist）

/* Objects encoding. Some kind of objects like Strings and Hashes can be
 * internally represented in multiple ways. The 'encoding' field of the object
 * is set to one of this fields for this object. */
#define OBJ_ENCODING_RAW 0     /* Raw representation */
#define OBJ_ENCODING_INT 1     /* Encoded as integer */
#define OBJ_ENCODING_HT 2      /* Encoded as hash table */
#define OBJ_ENCODING_ZIPMAP 3  /* No longer used: old hash encoding. */
#define OBJ_ENCODING_LINKEDLIST 4 /* No longer used: old list encoding. */
#define OBJ_ENCODING_ZIPLIST 5 /* No longer used: old list/hash/zset encoding. */
#define OBJ_ENCODING_INTSET 6  /* Encoded as intset */
#define OBJ_ENCODING_SKIPLIST 7  /* Encoded as skiplist */
#define OBJ_ENCODING_EMBSTR 8  /* Embedded sds string encoding */
#define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of listpacks */
#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */
#define OBJ_ENCODING_LISTPACK 11 /* Encoded as a listpack */

Ziplist

压缩列表是一块连续的内存，每一项都存在前后连续的地址空间里

压缩列表采用变长的编码。

如果是小整数则采用少一些字节来存储，如果是大整数则采用大的字节来存储

数据结构定义

从宏观上看，ziplist的内存结构如下：

1	`<zlbytes><zltail><zllen><entry>...<entry><zlend>`

zlbytes：压缩列表所占用的字节数（包含zlbytes的字节书）

zltail：最后一项entry在压缩列表中偏移量，方便在尾端执行pop和push操作

zllen：压缩列表中元素的个数

zlend：一个字节，结束标记，值固定等于255

每一个数据项<entry>的构成：

1	`<prevrawlen><len><encoding><data>`

<prevrawlen>：1个字节或5个字节，前一个元素的长度

<len>：4个字节，当前元素的长度

<encoding>：1个字节

<data>：具体的数据

应用

hash：当元素较少的时候，hash结构会采用listpacks进行存储，元素较多会采用dict

quicklist：底层基于ziplist组合起来的的双向链表

sortedset：当元素较少时底层采用ziplist进行存储，元素较多时采用dict+skiplist

Quicklist

源代码描述：quicklist.c - A doubly linked list of listpacks

quicklist是一个listpacks（压缩列表）的双向链表，包含一些复杂的压缩算法、和listpacks的大小的规定来中和quicklist的性能和占用空间

结构体定义

/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist */
typedef struct quicklist {
    quicklistNode *head;
    quicklistNode *tail;
    unsigned long count;        /* total count of all entries in all listpacks */
    unsigned long len;          /* number of quicklistNodes */
    signed int fill : QL_FILL_BITS;       /* fill factor for individual nodes */
    unsigned int compress : QL_COMP_BITS; /* depth of end nodes not to compress;0=off */
    unsigned int bookmark_count: QL_BM_BITS;
    quicklistBookmark bookmarks[];
} quicklist;

/* quicklistNode is a 32 byte struct describing a listpack for a quicklist */
typedef struct quicklistNode {
    struct quicklistNode *prev;
    struct quicklistNode *next;
    unsigned char *entry;
    size_t sz;             /* entry size in bytes */
    unsigned int count : 16;     /* count of items in listpack */
    unsigned int encoding : 2;   /* RAW==1 or LZF==2 */
    unsigned int container : 2;  /* PLAIN==1 or PACKED==2 */
    unsigned int recompress : 1; /* was this node previous compressed? */
    unsigned int attempted_compress : 1; /* node can't compress; too small */
    unsigned int dont_compress : 1; /* prevent compression of entry that will be used later */
    unsigned int extra : 9; /* more bits to steal for future usage */
} quicklistNode;

Create

quicklist *quicklistCreate(void) {
    struct quicklist *quicklist;

    quicklist = zmalloc(sizeof(*quicklist));
    quicklist->head = quicklist->tail = NULL;
    quicklist->len = 0;
    quicklist->count = 0;
    quicklist->compress = 0;
    quicklist->fill = -2;
    quicklist->bookmark_count = 0;
    return quicklist;
}

PUSH（插入）

根据条件来判断是插入到某个ziplist（quicklistNodeUpdateSz（））中还是新建一个quicklistnode

/* Wrapper to allow argument-based switching between HEAD/TAIL pop */
void quicklistPush(quicklist *quicklist, void *value, const size_t sz,
                   int where) {
    /* The head and tail should never be compressed (we don't attempt to decompress them) */
    if (quicklist->head)
        assert(quicklist->head->encoding != QUICKLIST_NODE_ENCODING_LZF);
    if (quicklist->tail)
        assert(quicklist->tail->encoding != QUICKLIST_NODE_ENCODING_LZF);

    if (where == QUICKLIST_HEAD) {
        quicklistPushHead(quicklist, value, sz);
    } else if (where == QUICKLIST_TAIL) {
        quicklistPushTail(quicklist, value, sz);
    }
}

/* Add new entry to head node of quicklist.
 *
 * Returns 0 if used existing head.
 * Returns 1 if new head created. */
int quicklistPushHead(quicklist *quicklist, void *value, size_t sz) {
    quicklistNode *orig_head = quicklist->head;

    if (unlikely(isLargeElement(sz))) {
        __quicklistInsertPlainNode(quicklist, quicklist->head, value, sz, 0);
        return 1;
    }

    if (likely(
            _quicklistNodeAllowInsert(quicklist->head, quicklist->fill, sz))) {
        quicklist->head->entry = lpPrepend(quicklist->head->entry, value, sz);
        quicklistNodeUpdateSz(quicklist->head);
    } else {
        quicklistNode *node = quicklistCreateNode();
        node->entry = lpPrepend(lpNew(0), value, sz);

        quicklistNodeUpdateSz(node);
        _quicklistInsertNodeBefore(quicklist, quicklist->head, node);
    }
    quicklist->count++;
    quicklist->head->count++;
    return (orig_head != quicklist->head);
}

Intset

为了实现集合（set）这种数据结构，设计了intset数据结构，元素是有序、不重复的，并且包含了一些并集、交集、差集的操作

intset：当set存储的元素较少时

dict：当元素较多时、插入字符串时

元素少or多根据系统参数来进行控制以达到最好的性能

是一块连续的存储空间

针对于大整数、小整数采用不通的编码来优化存储空间，如果编码变了，则之前的也需要统一调整编码

结构体定义

typedef struct intset {
    uint32_t encoding;
    uint32_t length;
    int8_t contents[];
} intset;

/* Note that these encodings are ordered, so:
 * INTSET_ENC_INT16 < INTSET_ENC_INT32 < INTSET_ENC_INT64. */
#define INTSET_ENC_INT16 (sizeof(int16_t))
#define INTSET_ENC_INT32 (sizeof(int32_t))
#define INTSET_ENC_INT64 (sizeof(int64_t))

intset与ziplist相比

inset只能存储整数、ziplist可以存储任意字符
inset元素是不重复、有序的，ziplist是无序的
inset会根据元素的大小调整统一编码，ziplist可以采用变长编码

集合的计算

交集

第一步：检查各个集合，对于不存在的集合当做空集来处理。一旦出现空集，则不用继续计算了，最终的交集就是空集。

第二步：按照集合元素个数进行排序

第三步：遍历最小的集合，拿每一个元素跟其他的集合进行寻找，如果其他集合存在，则插入到结果集合中

差集

第一种：将第一个集合中元素存到中间集合中，遍历其他集合，遇到存在的则删除，时间复杂度O（N）

第二种：遍历其他集合，只有所有集合中都找不到的元素，才加入到最后集合中

并集

遍历所有集合，将所有元素都添加到最终集合中

Skiplist

sorted set是基于ziplist、dict+skiplist构建的

skiplist时如何建立的

每插入一个节点，根据算法算出来一个随机层数
每次插入和删除的过程，首先根据skiplist数据结构找到对应的位置，然后通过改变指针的方式，来实现插入和删除
随机层数计算算法：

randomLevel()
    level := 1
    // random()返回一个[0...1)的随机数
    while random() < p and level < MaxLevel do
        level := level + 1
    return level

skiplist和平衡数的比较

插入的性能：平衡树每次的插入和删除涉及到树结构的变更，skiplist不涉及
占用空间：skiplist内存占用更少
范围查找性能：平衡数找到最小值后，找最大值还是比较麻烦的，skiplist找到最小值后，直接根据第一层来往后找就行

Sorted Set底层结构

ziplist、zset（dict+skiplist）

当元素较少时（可通过参数来控制），会采用ziplist进行存储
当元素较多时，数据会存在dict中，分数会存在skiplist中

总结

redis底层是用dict来进行存储dictEntry的

*key（8个字节） -> RedisObject
*value（8个字节）-> RedisObject
*next（8个字节）-> dictEntry

Redis所有的类型的数据都是采用RedisObject来进行存储的

元数据（8个字节）<encoding、refcount（访问次数）、lru（最后一次访问时间）、type>
ptr* （8个字节、针对于SDS有优化）

参考链接：

Redis内部数据结构详解

数据库

#Redis

Redis数据结构源码简析

http://example.com/2024/12/30/2025-01-15-Redis数据结构源码分析/

作者

wyx-98

发布于

2024年12月30日

许可协议

SSL/TLS协议运行机制上一篇

Mysql事务和锁下一篇