Linux 6.12 内存管理详解 - 第8章：Slab 分配器与 PCP

基于 Linux 6.12.38 源码

8.1 概述：内存分配层次结构

Linux 内核的内存管理采用分层架构，从底层到上层依次为：

┌─────────────────────────────────────────────────────────────────┐
│                        内核内存分配层次                          │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │  专用分配器 (Specialized Allocators)                     │  │
│  │  • kmem_cache_alloc - 固定大小对象                       │  │
│  │  • kmalloc/kzalloc - 通用小对象分配                     │  │
│  │  • vmalloc - 非连续物理内存                              │  │
│  └──────────────────────────────────────────────────────────┘  │
│                              │                                  │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │  Slab 分配器 (SLUB)                                       │  │
│  │  • 管理小对象缓存                                         │  │
│  │  • 减少内存碎片                                           │  │
│  │  • 提高缓存利用率                                         │  │
│  └──────────────────────────────────────────────────────────┘  │
│                              │                                  │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │  PCP (Per-CPU Pageframe Allocator)                       │  │
│  │  • Per-CPU 页面缓存                                       │  │
│  │  • 减少自旋锁竞争                                         │  │
│  │  • 提高分配性能                                           │  │
│  └──────────────────────────────────────────────────────────┘  │
│                              │                                  │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │  伙伴系统 (Buddy System)                                  │  │
│  │  • 管理物理页面                                           │  │
│  │  • 最小单位: 1 页 (4KB)                                   │  │
│  │  • 2^n 页面块分配                                         │  │
│  └──────────────────────────────────────────────────────────┘  │
│                              │                                  │
│  ┌──────────────────────────────────────────────────────────┐  │
│  │  物理内存                                                 │  │
│  └──────────────────────────────────────────────────────────┘  │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

为什么需要多层分配器？

层次	问题	解决方案
伙伴系统	最小分配单位 4KB，小对象浪费严重	-
PCP	频繁获取 zone->lock，竞争激烈	Per-CPU 缓存减少锁竞争
Slab	对象构造/析构开销，内存碎片	对象池，减少碎片，缓存热点对象

8.2 PCP (Per-CPU Pageframe Allocator)

8.2.1 PCP 设计目的

问题：
在 PCP 引入之前，每次分配和释放页面都需要获取 zone->lock，这在多处理器系统上导致严重的锁竞争。

解决方案：
为每个 CPU 的每个 zone 维护独立的页面缓存，大多数分配/释放可以在无锁或少锁的情况下完成。

位置： mm/page_alloc.c, include/linux/mmzone.h

8.2.2 PCP 数据结构

/* Location: include/linux/mmzone.h:683 */
/*
 * Per-CPU page cache descriptor: bookkeeping counters and watermarks plus
 * the page lists themselves.
 */
struct per_cpu_pages {
    spinlock_t lock;        /* protects the lists field */
    int count;              /* number of pages held on the lists */
    int high;               /* high watermark; pages must be released once reached */
    int high_min;           /* lower bound for the high watermark */
    int high_max;           /* upper bound for the high watermark */
    int batch;              /* chunk size for bulk add/remove */
    u8 flags;               /* flag bits */
    u8 alloc_factor;        /* batch scaling factor applied on allocation */
#ifdef CONFIG_NUMA
    u8 expire;              /* when 0, the remote pageset is drained */
#endif
    short free_count;       /* count of consecutive frees */

    /* Page lists, NR_PCP_LISTS in total (split by migrate type) */
    struct list_head lists[NR_PCP_LISTS];
} ____cacheline_aligned_in_smp;

PCP 列表数量：

/* Location: include/linux/mmzone.h:665 */
/* Low-order lists: MIGRATE_PCPTYPES lists for each order 0..PAGE_ALLOC_COSTLY_ORDER */
#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
/* Total number of PCP lists: the low-order lists plus the THP lists */
#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)

/* MIGRATE_PCPTYPES = 3 (MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE) */
/* PAGE_ALLOC_COSTLY_ORDER = 3 */
/* For THP, 2 additional lists */

zone 中的 PCP：

/* Location: include/linux/mmzone.h:843 */
/* Excerpt of struct zone showing only the PCP-related members. */
struct zone {
    /* ... other fields ... */
    struct per_cpu_pages __percpu *per_cpu_pageset;     /* per-CPU page caches of this zone */
    struct per_cpu_zonestat __percpu *per_cpu_zonestats; /* per-CPU zone statistics */

    /* high and batch values are copied into each pageset for faster access */
    int pageset_high_min;
    int pageset_high_max;
    int pageset_batch;
};

8.2.3 PCP 工作原理

水位管理：

        high
         │─────────────────────────────────────
         │
         │
         │   正常工作区
         │
         │
         │
         │─────────────────────────────────────
         0

• count < high:    正常状态
• count >= high:   需要释放一批页面到伙伴系统
• count 为 0:      需要从伙伴系统批量分配

batch 动态调整：

/* 位置: mm/page_alloc.c:2973 */
static int nr_pcp_alloc(struct per_cpu_pages *pcp, struct zone *zone,
                        unsigned int order)
{
    int batch = READ_ONCE(pcp->batch);
    int max_nr_alloc;

    /* 根据分配频率调整 */
    if (pcp->free_count) {
        /* 有连续释放时，增加批量大小 */
        max_nr_alloc = batch << min(pcp->free_count,
                                    CONFIG_PCP_BATCH_SCALE_MAX);
        if (batch <= max_nr_alloc &&
            pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
            pcp->alloc_factor++;
        batch = min(batch, max_nr_alloc);
    }

    /* 根据 order 调整 */
    if (batch > 1)
        batch = max(batch >> order, 2);

    return batch;
}

8.2.4 PCP 分配流程

完整分配路径：

__alloc_pages()
    │
    ▼
rmqueue()
    │
    ▼
┌─────────────────────────────────────────────────────────────┐
│ rmqueue_pcplist() - 从 PCP 分配                              │
│ 位置: mm/page_alloc.c:3052                                   │
├─────────────────────────────────────────────────────────────┤
│ 1. 尝试获取 PCP 锁                                           │
│    pcp_spin_trylock(zone->per_cpu_pageset)                  │
│    │                                                         │
│    ├─ 成功 ──────────────────────────────────┐              │
│    │                                          │              │
│    └─ 失败 → 返回 NULL                        │              │
│                                               │              │
│ 2. 减少释放批处理因子                         │              │
│    pcp->free_count >>= 1                     │              │
│                                               │              │
│ 3. 选择对应的迁移类型列表                     │              │
│    list = &pcp->lists[order_to_pindex(...)]  │              │
│                                               │              │
│ 4. __rmqueue_pcplist()                        │              │
│    │                                           │              │
│    ├─ 列表为空?                                │              │
│    │   │                                       │              │
│    │   └─ 是 → rmqueue_bulk()                 │              │
│    │          从伙伴系统批量分配页面            │              │
│    │          添加到 PCP 列表                  │              │
│    │                                           │              │
│    └─ 从列表取出第一个页面                    │              │
│                                               │              │
│ 5. 释放 PCP 锁                                │              │
│                                               │              │
│ 6. 返回页面                                   │              │
└─────────────────────────────────────────────────────────────┘
    │
    ▼
__rmqueue_pcplist() - 位置: mm/page_alloc.c:3021
    │
    ├─ 列表为空?
    │   │
    │   └─ 是 → rmqueue_bulk()
    │              │
    │              ▼
    │         ┌─────────────────────────────────────┐
    │         │ 从伙伴系统分配 batch 个页面          │
    │         │ 持有 zone->lock                     │
    │         │ 循环调用 __rmqueue()                │
    │         └─────────────────────────────────────┘
    │
    └─ list_first_entry() 取出页面
        list_del(&page->pcp_list)
        pcp->count -= 1 << order
        return page

关键代码分析：

/* Location: mm/page_alloc.c:3021 */
/*
 * Take one block of the given order off a PCP list, refilling the list from
 * the buddy allocator first when it is empty.
 *
 * Returns the page, or NULL when the list is still empty after the refill
 * attempt. Pages rejected by check_new_pages() are skipped and the loop
 * takes the next one.
 */
static inline struct page *__rmqueue_pcplist(struct zone *zone,
                                             unsigned int order,
                                             int migratetype,
                                             unsigned int alloc_flags,
                                             struct per_cpu_pages *pcp,
                                             struct list_head *list)
{
    struct page *page;

    do {
        /* Refill only when the list has nothing left */
        if (list_empty(list)) {
            /* Work out how many blocks to request */
            int batch = nr_pcp_alloc(pcp, zone, order);
            int alloced;

            /* Bulk-allocate from the buddy allocator onto this list */
            alloced = rmqueue_bulk(zone, order,
                                   batch, list,
                                   migratetype, alloc_flags);

            /* count is kept in pages, so scale the block count by the order */
            pcp->count += alloced << order;

            /* The refill produced nothing: give up */
            if (unlikely(list_empty(list)))
                return NULL;
        }

        /* Detach the first page on the list */
        page = list_first_entry(list, struct page, pcp_list);
        list_del(&page->pcp_list);
        pcp->count -= 1 << order;
    } while (check_new_pages(page, order));

    return page;
}

8.2.5 PCP 释放流程

完整释放路径：

__free_pages() / put_page()
    │
    ▼
free_unref_page() - 位置: mm/page_alloc.c:2675
    │
    ├─ pcp_allowed_order(order)?
    │   │
    │   ├─ 否 → __free_pages_ok() 直接到伙伴系统
    │   │
    │   └─ 是 → 继续
    │
    ├─ free_pages_prepare() 页面检查和清理
    │
    ├─ 获取页面迁移类型
    │   migratetype = get_pfnblock_migratetype(page, pfn)
    │
    ├─ 尝试获取 PCP 锁
    │   pcp = pcp_spin_trylock(zone->per_cpu_pageset)
    │   │
    │   ├─ 成功 → free_unref_page_commit()
    │   │          │
    │   │          ├─ 添加到 PCP 列表
    │   │          │   list_add(&page->pcp_list, &pcp->lists[pindex])
    │   │          │   pcp->count += 1 << order
    │   │          │
    │   │          ├─ 检查是否超过高水位
    │   │          │   if (pcp->count >= high)
    │   │          │       free_pcppages_bulk()
    │   │          │
    │   │          └─ 释放锁
    │   │
    │   └─ 失败 → free_one_page() 直接到伙伴系统

释放提交代码：

/* Location: mm/page_alloc.c:2624 */
/*
 * free_unref_page_commit - put a freed block on the PCP list and drain the
 * PCP to the buddy allocator when it exceeds its high watermark.
 *
 * @zone:        zone the page belongs to
 * @pcp:         per-CPU page cache receiving the page
 * @page:        first page of the block being freed
 * @migratetype: migrate type used to select the PCP list
 * @order:       order of the block
 */
static void free_unref_page_commit(struct zone *zone,
                                   struct per_cpu_pages *pcp,
                                   struct page *page,
                                   int migratetype,
                                   unsigned int order)
{
    int high, batch;
    int pindex;
    bool free_high = false;

    /* A free halves the allocation-side batch scaling factor */
    pcp->alloc_factor >>= 1;

    /* Account the freed pages */
    __count_vm_events(PGFREE, 1 << order);

    /* Select the list for this (migratetype, order) pair */
    pindex = order_to_pindex(migratetype, order);

    /* Queue the page; count is kept in pages */
    list_add(&page->pcp_list, &pcp->lists[pindex]);
    pcp->count += 1 << order;

    batch = READ_ONCE(pcp->batch);

    /*
     * High-order (non-THP) pages: request an aggressive drain when the
     * previous free was high-order too and frees have been continuous.
     * The "previous free was high-order" flag must be dropped again as
     * soon as a free of any other order is seen, otherwise it would stay
     * set forever after the first high-order free.
     */
    if (order && order <= PAGE_ALLOC_COSTLY_ORDER) {
        free_high = (pcp->free_count >= batch &&
                     (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) &&
                     (!(pcp->flags & PCPF_FREE_HIGH_BATCH) ||
                      pcp->count >= READ_ONCE(batch)));
        pcp->flags |= PCPF_PREV_FREE_HIGH_ORDER;
    } else if (pcp->flags & PCPF_PREV_FREE_HIGH_ORDER) {
        pcp->flags &= ~PCPF_PREV_FREE_HIGH_ORDER;
    }

    /* Track consecutive frees, saturating at batch << SCALE_MAX */
    if (pcp->free_count < (batch << CONFIG_PCP_BATCH_SCALE_MAX))
        pcp->free_count += (1 << order);

    /* Effective high watermark for this free */
    high = nr_pcp_high(pcp, zone, batch, free_high);

    /* Over the watermark: return a batch of pages to the buddy allocator */
    if (pcp->count >= high) {
        free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
                           pcp, pindex);
        /* Clear ZONE_BELOW_HIGH once the zone is back above its high watermark */
        if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
            zone_watermark_ok(zone, 0, high_wmark_pages(zone),
                              ZONE_MOVABLE, 0))
            clear_bit(ZONE_BELOW_HIGH, &zone->flags);
    }
}

8.2.6 PCP 批量释放到伙伴系统

/* Location: mm/page_alloc.c:1171 */
/*
 * Return up to @count pages from the PCP lists to the buddy allocator,
 * starting with list @pindex and then walking the lists round-robin.
 * zone->lock is held (IRQs saved) for the whole drain.
 */
static void free_pcppages_bulk(struct zone *zone, int count,
                               struct per_cpu_pages *pcp,
                               int pindex)
{
    unsigned long flags;
    unsigned int order;
    struct page *page;

    /*
     * Never try to free more than the PCP holds; otherwise the
     * list_empty() search loop below could not terminate.
     */
    count = min(pcp->count, count);

    /* Step back by one so the pre-increment below lands on the requested pindex first */
    pindex = pindex - 1;

    spin_lock_irqsave(&zone->lock, flags);

    while (count > 0) {
        struct list_head *list;
        int nr_pages;

        /* Pick the next non-empty list, wrapping around (round-robin) */
        do {
            if (++pindex > NR_PCP_LISTS - 1)
                pindex = 0;
            list = &pcp->lists[pindex];
        } while (list_empty(list));

        order = pindex_to_order(pindex);
        nr_pages = 1 << order;

        do {
            unsigned long pfn;
            int mt;

            /* Take pages from the tail of the list */
            page = list_last_entry(list, struct page, pcp_list);
            pfn = page_to_pfn(page);
            mt = get_pfnblock_migratetype(page, pfn);

            /* Must unlink first to avoid corrupting the pcp list */
            list_del(&page->pcp_list);
            count -= nr_pages;
            pcp->count -= nr_pages;

            /* Hand the block back to the buddy allocator */
            __free_one_page(page, pfn, zone, order, mt, FPI_NONE);
            trace_mm_page_pcpu_drain(page, order, mt);
        } while (count > 0 && !list_empty(list));
    }

    spin_unlock_irqrestore(&zone->lock, flags);
}

8.2.7 PCP 水位衰减

/* Location: mm/page_alloc.c:2366 */
/*
 * Lower pcp->high one step towards high_min and drain whatever then sits
 * above the watermark.
 *
 * Returns a non-zero value when work was done or remains to be done
 * (high is still above high_min, or pages were drained), 0 otherwise.
 */
int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp)
{
    int high_min, to_drain, batch;
    int todo = 0;

    high_min = READ_ONCE(pcp->high_min);
    batch = READ_ONCE(pcp->batch);

    /*
     * Periodically lower pcp->high to release possibly idle PCP pages.
     * max3() bounds a single step: high drops by at most 1/8 of itself,
     * not more than (batch << SCALE_MAX) below count, never below high_min.
     */
    if (pcp->high > high_min) {
        pcp->high = max3(pcp->count - (batch << CONFIG_PCP_BATCH_SCALE_MAX),
                         pcp->high - (pcp->high >> 3), high_min);
        if (pcp->high > high_min)
            todo++;
    }

    /* Drain the pages that now exceed the watermark */
    to_drain = pcp->count - pcp->high;
    if (to_drain > 0) {
        spin_lock(&pcp->lock);
        free_pcppages_bulk(zone, to_drain, pcp, 0);
        spin_unlock(&pcp->lock);
        todo++;
    }

    return todo;
}

8.2.8 PCP 支持的分配阶数

/* Location: mm/page_alloc.c:542 */
/*
 * pcp_allowed_order - can blocks of this order be cached on the PCP lists?
 *
 * Returns true for orders 0..PAGE_ALLOC_COSTLY_ORDER (inclusive) and, when
 * THP is configured, for the PMD-sized huge page order, which is served by
 * the dedicated NR_PCP_THP lists. Returns false for every other order.
 */
static inline bool pcp_allowed_order(unsigned int order)
{
    if (order <= PAGE_ALLOC_COSTLY_ORDER)
        return true;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    if (order == HPAGE_PMD_ORDER)
        return true;
#endif
    return false;
}

/*
 * PAGE_ALLOC_COSTLY_ORDER = 3, i.e. regular allocations of up to 8 pages
 * (32KB with 4KB pages); THP-order blocks are the only larger ones allowed.
 */

8.3 SLUB 分配器 (默认实现)

8.3.1 SLUB 概述

设计目标：

简化数据结构
减少 Cache Line 占用
提高分配性能
更好的 NUMA 支持

位置： mm/slub.c

SLUB 特点：

特性	SLAB	SLUB
Per-CPU 缓存	array_cache (对象指针数组)	单个 freelist (无锁快速路径)
节点 slab 列表	3个列表 (full, partial, free)	仅 partial 列表 (full 列表只在调试时维护)
对象指针	间接引用	直接引用
内存开销	较高	较低
代码复杂度	高	低
调试功能	有限	丰富

8.3.2 SLUB 数据结构

kmem_cache 结构：

/* Location: mm/slab.h (include/linux/slub_def.h was folded into it in v6.8) */
/*
 * SLUB cache descriptor. Each member appears exactly once; members that
 * exist only under a config option are shown inside the matching #ifdef.
 */
struct kmem_cache {
#ifndef CONFIG_SLUB_TINY
    /* Per-CPU data (fast path state) */
    struct kmem_cache_cpu __percpu *cpu_slab;
#endif
    /* Used for retrieving partial slabs, etc. */
    slab_flags_t flags;           /* cache flags */
    unsigned long min_partial;    /* minimum partial slabs kept per node */
    unsigned int size;            /* object size including metadata */
    unsigned int object_size;     /* object size without metadata */
    struct reciprocal_value reciprocal_size; /* for fast division by size */
    unsigned int offset;          /* free pointer offset within an object */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    unsigned int cpu_partial;        /* per-CPU partial objects to keep around */
    unsigned int cpu_partial_slabs;  /* per-CPU partial slabs to keep around */
#endif
    struct kmem_cache_order_objects oo;   /* preferred slab order / object count */

    /* Allocation and freeing of slabs */
    struct kmem_cache_order_objects min;  /* fallback (minimum) order / object count */
    gfp_t allocflags;             /* gfp flags to use on each slab allocation */
    int refcount;                 /* refcount for cache destroy */
    void (*ctor)(void *object);   /* object constructor */
    unsigned int inuse;           /* offset to metadata */
    unsigned int align;           /* alignment */
    unsigned int red_left_pad;    /* left redzone padding size */
    const char *name;             /* name (for display only) */
    struct list_head list;        /* list of slab caches */

#ifdef CONFIG_HARDENED_USERCOPY
    unsigned int useroffset;      /* usercopy region offset */
    unsigned int usersize;        /* usercopy region size */
#endif

    /* Per-node data */
    struct kmem_cache_node *node[MAX_NUMNODES];

    /* ... more config-dependent fields (sysfs, KASAN, freelist hardening) ... */
};

Per-CPU 数据：

/* Location: mm/slub.c:384 */
/*
 * Per-CPU SLUB state. freelist and tid share a union with freelist_tid so
 * the pair can be handled as a single value.
 */
struct kmem_cache_cpu {
    union {
        struct {
            void **freelist;    /* pointer to the next available object */
            unsigned long tid;  /* globally unique transaction ID */
        };
        freelist_aba_t freelist_tid;
    };
    struct slab *slab;          /* the slab currently being allocated from */
#ifdef CONFIG_SLUB_CPU_PARTIAL
    struct slab *partial;       /* partially allocated slabs */
#endif
    local_lock_t lock;          /* protects the fields above */
#ifdef CONFIG_SLUB_STATS
    unsigned int stat[NR_SLUB_STAT_ITEMS];
#endif
};

节点数据：

/* Location: mm/slub.c:425 */
/* Per-node SLUB state: the partial slab list and, with debugging, totals. */
struct kmem_cache_node {
    spinlock_t list_lock;
    unsigned long nr_partial;   /* number of partial slabs */
    struct list_head partial;   /* list of partial slabs */
#ifdef CONFIG_SLUB_DEBUG
    atomic_long_t nr_slabs;     /* total number of slabs */
    atomic_long_t total_objects; /* total number of objects */
    struct list_head full;      /* list of full slabs */
#endif
};

8.3.3 SLUB Slab 状态

Slab 四种状态：

┌─────────────────────────────────────────────────────────────────┐
│                    SLUB Slab 状态管理                            │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  1. CPU Slab (当前 slab)                                        │
│     ┌────────────────────────────────────┐                     │
│     │ frozen = true                      │                     │
│     │ PG_workingset = 0                  │                     │
│     │                                     │                     │
│     │ 当前 CPU 正在从中分配对象的 slab     │                     │
│     │ 不在任何列表中                      │                     │
│     └────────────────────────────────────┘                     │
│                                                                 │
│  2. CPU Partial (Per-CPU 部分 slab)                            │
│     ┌────────────────────────────────────┐                     │
│     │ frozen = false                     │                     │
│     │ PG_workingset = 0                  │                     │
│     │                                     │                     │
│     │ Per-CPU 缓存的部分空 slab           │                     │
│     │ 用于减少 node list_lock 竞争        │                     │
│     └────────────────────────────────────┘                     │
│                                                                 │
│  3. Node Partial (节点部分 slab)                               │
│     ┌────────────────────────────────────┐                     │
│     │ frozen = false                     │                     │
│     │ PG_workingset = 1                  │                     │
│     │                                     │                     │
│     │ 节点的部分 slab 列表                │                     │
│     │ 受 node->list_lock 保护             │                     │
│     └────────────────────────────────────┘                     │
│                                                                 │
│  4. Full (满 slab)                                             │
│     ┌────────────────────────────────────┐                     │
│     │ frozen = false                     │                     │
│     │ PG_workingset = 0                  │                     │
│     │                                     │                     │
│     │ 所有对象都被分配                    │                     │
│     │ 不在任何活跃列表中                  │                     │
│     └────────────────────────────────────┘                     │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

8.3.4 SLUB 分配流程

完整分配路径：

kmem_cache_alloc(s, flags)
    │
    ▼
slab_alloc_node() - 位置: mm/slub.c:4127
    │
    ├─ slab_pre_alloc_hook() - 分配前钩子
    │
    ├─ kfence_alloc() - KFENCE 采样
    │
    ▼
__slab_alloc_node() - 位置: mm/slub.c:3934
    │
    ├─ 获取 Per-CPU 数据
    │   c = raw_cpu_ptr(s->cpu_slab)
    │   tid = READ_ONCE(c->tid)
    │
    ├─ 快速路径检查
    │   │
    │   ├─ object = c->freelist
    │   │   slab = c->slab
    │   │
    │   └─ USE_LOCKLESS_FAST_PATH() && object && slab && node_match()
    │       │
    │       ├─ 是 → 快速路径
    │       │   │
    │       │   ├─ next_object = get_freepointer_safe(s, object)
    │       │   │
    │       │   ├─ __update_cpu_freelist_fast()
    │       │   │   原子更新: (freelist, tid)
    │       │   │
    │       │   └─ 返回 object
    │       │
    │       └─ 否 → 慢速路径
    │           │
    │           └─ __slab_alloc()
    │               │
    │               ├─ 检查是否需要新 slab
    │               │
    │               ├─ get_partial() - 获取部分 slab
    │               │
    │               └─ new_slab() - 分配新 slab
    │
    └─ slab_post_alloc_hook() - 分配后钩子

快速路径代码：

/* Location: mm/slub.c:3934 */
/*
 * Allocate one object from cache @s: take it from the per-CPU freelist when
 * possible, otherwise fall back to __slab_alloc().
 *
 * Returns the object pointer produced by whichever path ran.
 */
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
                                               gfp_t gfpflags, int node,
                                               unsigned long addr,
                                               size_t orig_size)
{
    struct kmem_cache_cpu *c;
    struct slab *slab;
    unsigned long tid;
    void *object;

redo:
    /* Snapshot this CPU's slab state; the tid is read first */
    c = raw_cpu_ptr(s->cpu_slab);
    tid = READ_ONCE(c->tid);

    /*
     * Compiler barrier (not a CPU memory barrier): keeps the tid read
     * ahead of the freelist/slab reads below.
     */
    barrier();

    object = c->freelist;
    slab = c->slab;

    /* Slow path unless lockless mode is usable, an object is available and the node matches */
    if (!USE_LOCKLESS_FAST_PATH() ||
        unlikely(!object || !slab || !node_match(slab, node))) {
        /* Slow path */
        object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
    } else {
        void *next_object = get_freepointer_safe(s, object);

        /*
         * Double-word compare-and-exchange on (freelist, tid):
         * 1. verifies the tid is unchanged (no preemption or migration)
         * 2. verifies the freelist is unchanged
         * 3. installs the new freelist and tid
         * On failure the whole sequence is retried from redo.
         */
        if (unlikely(!__update_cpu_freelist_fast(s, object,
                                                  next_object, tid))) {
            note_cmpxchg_failure("slab_alloc", s, tid);
            goto redo;
        }
        prefetch_freepointer(s, next_object);
        stat(s, ALLOC_FASTPATH);
    }

    return object;
}

TID (Transaction ID) 机制：

TID 机制确保无锁路径的正确性:

1. 分配前:
   c->freelist = object1
   c->tid = 5

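2. 发生抢占/迁移:
   - 每个 CPU 的 tid 初值不同，且每次操作按固定步长递增，因此各 CPU 的 tid 取值互不重叠
   - 若任务被抢占 (期间本 CPU 的 freelist/tid 被其他任务更新) 或迁移到另一个 CPU，
     之前读到的 (freelist, tid) 就不再与 cmpxchg 所操作的 per-CPU 数据一致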

3. cmpxchg 检查:
   - 同时比较 freelist 和 tid
   - 如果任一改变，操作失败
   - 重新获取数据并重试

这确保了:
- 不会使用过期 CPU 的数据
- 并发操作被正确检测

8.3.5 SLUB 释放流程

完整释放路径：

kmem_cache_free(s, object)
    │
    ▼
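do_slab_free() - 位置: mm/slub.c (由 slab_free() 在释放钩子之后调用)
    │
    ├─ 获取 CPU slab
    │   c = raw_cpu_ptr(s->cpu_slab)
    │   tid = READ_ONCE(c->tid)
    │
    ├─ 快速路径 (对象所属 slab == c->slab)
    │   │
    │   ├─ 将被释放对象的空闲指针指向 c->freelist
    │   │   set_freepointer(s, tail, c->freelist)
    │   │
    │   └─ 原子更新 (freelist, tid)
    │       __update_cpu_freelist_fast()，失败则重试
    │
    └─ 慢速路径 (对象所属 slab != c->slab)
        │
        ├─ __slab_free()
        │   │
        │   ├─ 用 cmpxchg 把对象挂回 slab->freelist 并更新 inuse 计数
        │   │
        │   ├─ slab 原先已满 -> 加入 CPU partial 或节点 partial 列表
        │   │
        │   └─ slab 变空且节点 partial 数量充足 -> 移除并释放该 slab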

8.3.6 SLUB slab 分配

从伙伴系统分配 slab：

/* Location: mm/slub.c */
/*
 * new_slab - allocate a fresh slab for cache @s on @node.
 *
 * Thin wrapper: it sanitizes the caller's GFP flags and delegates the actual
 * page allocation and slab initialization to allocate_slab().
 * Returns the new slab, or NULL when allocate_slab() fails.
 */
static struct slab *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
    /* Repair flag combinations that are not valid for slab allocations */
    if (unlikely(flags & GFP_SLAB_BUG_MASK))
        flags = kmalloc_fix_flags(flags);

    /* A constructor and __GFP_ZERO contradict each other */
    WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO));

    /* Only reclaim and placement constraints are passed down to the page allocator */
    return allocate_slab(s,
        flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
}

8.3.7 SLUB 对象布局

┌────────────────────────────────────────────────────────────────┐
│                    SLUB Slab 布局                               │
├────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Slab (1 或多个页面)                                            │
│  ┌────────────────────────────────────────────────────────┐    │
│  │  Object 0    Object 1    Object 2    ...    Object N   │    │
│  │  ┌─────────┐ ┌─────────┐ ┌─────────┐         ┌─────────┐│    │
│  │  │         │ │         │ │         │         │         ││    │
│  │  │ Object  │ │ Object  │ │ Object  │         │ Object  ││    │
│  │  │ Data    │ │ Data    │ │ Data    │         │ Data    ││    │
│  │  │         │ │         │ │         │         │         ││    │
│  │  └─────────┘ └─────────┘ └─────────┘         └─────────┘│    │
│  │       │            │                              │       │    │
│  │       ▼            ▼                              ▼       │    │
│  │   Freelist     Freelist                      Freelist    │    │
│  │                                                             │    │
│  │  空闲对象内部 (偏移 s->offset 处) 存储指向下一个空闲对象的指针 │    │
│  └────────────────────────────────────────────────────────┘    │
│                                                                 │
│  元数据位置:                                                     │
│  • offset = s->offset (空闲指针相对对象起始位置的偏移)          │
│  • 自 5.7 起空闲指针默认放在对象中部；有构造函数等情况下放在对象之后 │
│                                                                 │
└────────────────────────────────────────────────────────────────┘

8.4 SLAB 分配器 (传统实现)

8.4.1 SLAB 概述

特点：

原始的 slab 分配器实现
三队列管理 (full, partial, free)
更复杂但功能完整
支持着色 (coloring) 优化缓存行使用

注意： SLAB 在 Linux 6.5 中被标记为过时，并在 Linux 6.8 中被彻底移除；Linux 6.12 中只剩 SLUB 一种实现，本节内容仅作历史参考。

8.4.2 SLAB 数据结构

/* 位置: include/linux/slab_def.h */
struct kmem_cache {
    /* 对象大小 */
    unsigned int object_size;
    unsigned int size;        /* 完整对象大小 */
    unsigned int align;       /* 对齐 */

    /* slab 管理 */
    unsigned int num;         /* 每个 slab 的对象数 */
    unsigned int gfporder;    /* 分配阶数 */

    /* Per-CPU 数据 */
    struct kmem_cache_cpu __percpu *cpu_slab;

    /* 节点数据 */
    struct kmem_cache_node *node[MAX_NUMNODES];

    /* 构造函数 */
    void (*ctor)(void *object);

    /* ... */
};

/* Per-CPU slab 数据 */
struct kmem_cache_cpu {
    void **freelist;         /* 空闲对象列表 */
    struct slab *slab;       /* 当前 slab */
    struct slab *partial;    /* 部分 slab */
};

/* 节点 slab 数据 */
struct kmem_cache_node {
    spinlock_t list_lock;
    struct list_head slabs_full;     /* 满 slab */
    struct list_head slabs_partial;  /* 部分 slab */
    struct list_head slabs_free;     /* 空 slab */
};

8.4.3 SLAB 三队列管理

┌─────────────────────────────────────────────────────────────────┐
│                    SLAB 三队列管理                               │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│  Full Slabs (满)                                                │
│  ┌───────────┐  ┌───────────┐  ┌───────────┐                   │
│  │ □ □ □ □ │  │ □ □ □ □ │  │ □ □ □ □ │                       │
│  │ □ □ □ □ │  │ □ □ □ □ │  │ □ □ □ □ │  所有对象已分配        │
│  │ □ □ □ □ │  │ □ □ □ □ │  │ □ □ □ □ │                       │
│  └───────────┘  └───────────┘  └───────────┘                   │
│                                                                 │
│  Partial Slabs (部分)                                           │
│  ┌───────────┐  ┌───────────┐  ┌───────────┐                   │
│  │ □ □ □ □ │  │ □ □ □ □ │  │ □ □ □ □ │  部分对象已分配        │
│  │ □ □ □ □ │  │ □ □ □ □ │  │ □ □ □ □ │                       │
│  │ . . □ □ │  │ . . . □ │  │ □ . □ □ │                       │
│  └───────────┘  └───────────┘  └───────────┘                   │
│                                                                 │
│  Free Slabs (空)                                                │
│  ┌───────────┐  ┌───────────┐  ┌───────────┐                   │
│  │ . . . . │  │ . . . . │  │ . . . . │  所有对象空闲           │
│  │ . . . . │  │ . . . . │  │ . . . . │                       │
│  │ . . . . │  │ . . . . │  │ . . . . │                       │
│  └───────────┘  └───────────┘  └───────────┘                   │
│                                                                 │
│  分配流程:                                                      │
│  1. 从 partial 首部分配                                        │
│  2. 若 partial 为空，从 free 移动一个到 partial                 │
│  3. 若 free 为空，分配新 slab 到 free                          │
│                                                                 │
│  释放流程:                                                      │
│  1. 释放到 partial                                             │
│  2. 若 partial 变满，移动到 full                               │
│  3. 若 partial 变空，移动到 free                               │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

8.5 SLOB 分配器 (已移除)

8.5.1 SLOB 概述

注意： SLOB 已在 Linux 6.4 中被移除，Linux 6.12 中不再存在。

历史特点：

极简的 slab 实现
适合资源受限的嵌入式系统
使用简单的链表管理空闲块
内存开销最小，但性能较低

移除原因：

SLUB 已经足够高效
维护三种实现的成本高
SLUB tiny 配置可以满足大多数嵌入式需求

8.6 kmalloc 实现

8.6.1 kmalloc 缓存数组

kmalloc 大小类别：

/* Location: mm/slab_common.c */
/*
 * kmalloc size-class table (simplified: upstream builds the entries with the
 * INIT_KMALLOC_INFO() macro and stores one name per kmalloc type).
 *
 * The array index is the kmalloc index: slots 1 and 2 hold the two
 * non-power-of-two classes (96 and 192 bytes); from slot 3 on, the class
 * size is 1 << index. The table ends at 2MB, the largest size the kmalloc
 * index covers.
 */
struct kmalloc_info {
    const char *name;     /* cache name as shown in /proc/slabinfo */
    unsigned long size;   /* object size of the class in bytes */
};

static const struct kmalloc_info kmalloc_info[] = {
    { NULL,                0 },   /*  0 */
    { "kmalloc-96",       96 },   /*  1 */
    { "kmalloc-192",     192 },   /*  2 */
    { "kmalloc-8",         8 },   /*  3 */
    { "kmalloc-16",       16 },   /*  4 */
    { "kmalloc-32",       32 },   /*  5 */
    { "kmalloc-64",       64 },   /*  6 */
    { "kmalloc-128",     128 },   /*  7 */
    { "kmalloc-256",     256 },   /*  8 */
    { "kmalloc-512",     512 },   /*  9 */
    { "kmalloc-1k",     1024 },   /* 10 */
    { "kmalloc-2k",     2048 },   /* 11 */
    { "kmalloc-4k",     4096 },   /* 12 */
    { "kmalloc-8k",     8192 },   /* 13 */
    { "kmalloc-16k",   16384 },   /* 14 */
    { "kmalloc-32k",   32768 },   /* 15 */
    { "kmalloc-64k",   65536 },   /* 16 */
    { "kmalloc-128k", 131072 },   /* 17 */
    { "kmalloc-256k", 262144 },   /* 18 */
    { "kmalloc-512k", 524288 },   /* 19 */
    { "kmalloc-1M",  1048576 },   /* 20 */
    { "kmalloc-2M",  2097152 },   /* 21 */
};

8.6.2 kmalloc 实现

/* Location: mm/slub.c:4263 */
/*
 * __do_kmalloc_node - common kmalloc backend.
 *
 * @size:   requested size in bytes
 * @b:      kmalloc bucket set to pick the cache from
 * @flags:  GFP flags
 * @node:   NUMA node to allocate on
 * @caller: return address of the original caller (for tracing/debugging)
 *
 * Requests above KMALLOC_MAX_CACHE_SIZE bypass the slab caches and go to
 * the page allocator. A zero-size request returns ZERO_SIZE_PTR. Everything
 * else is served from the kmalloc cache selected by kmalloc_slab().
 */
static __always_inline
void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags,
                        int node, unsigned long caller)
{
    struct kmem_cache *s;
    void *ret;

    /* Large allocation: hand straight to the page allocator */
    if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
        ret = ___kmalloc_large_node(size, flags, node);
        trace_kmalloc(caller, ret, size,
                      PAGE_SIZE << get_order(size), flags, node);
        return ret;
    }

    /* kmalloc(0) yields a special non-NULL marker pointer */
    if (unlikely(!size))
        return ZERO_SIZE_PTR;

    /* Small allocation: pick the size-class cache from the bucket set */
    s = kmalloc_slab(size, b, flags, caller);

    ret = slab_alloc_node(s, NULL, flags, node, caller, size);
    /* KASAN needs the cache to compute the redzone around @size */
    ret = kasan_kmalloc(s, ret, size, flags);
    trace_kmalloc(caller, ret, size, s->size, flags, node);
    return ret;
}

8.6.3 kmalloc 大小选择

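size == 0:                               返回 ZERO_SIZE_PTR
size <= 192:                             通过大小索引表选择缓存 (包含非 2 的幂的 kmalloc-96、kmalloc-192)
192 < size <= KMALLOC_MAX_CACHE_SIZE:    向上取整到 2 的幂，使用对应的 kmalloc 缓存
size > KMALLOC_MAX_CACHE_SIZE:           绕过 slab，直接从伙伴系统分配页面
                                         (SLUB 中该阈值为 2 个页面，4KB 页时即 8KB)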

8.7 调试与监控

8.7.1 Slab 调试选项

# SLUB debugging
CONFIG_SLUB_DEBUG=y
CONFIG_SLUB_DEBUG_ON=y      # enable debugging at boot
CONFIG_SLUB_STATS=y         # statistics

# Object debugging
CONFIG_SLUB_RCU_DEBUG=y     # RCU debugging
CONFIG_DEBUG_KMEMLEAK=y     # memory leak detection
CONFIG_DEBUG_OBJECTS=y      # object state debugging

# Others
CONFIG_KASAN=y              # Kernel Address Sanitizer
CONFIG_KFENCE=y             # low-overhead memory error detection
# CONFIG_DEBUG_SLAB is gone: it was removed together with the SLAB allocator (v6.8)

8.7.2 调试接口

# Show slab information
cat /proc/slabinfo

# Live monitoring
slabtop

# SLUB debugfs files (call-site summaries; user tracking must be enabled)
cat /sys/kernel/debug/slab/*/alloc_traces
cat /sys/kernel/debug/slab/*/free_traces

# Slab counts are a sysfs attribute, not a debugfs file
cat /sys/kernel/slab/*/slabs

# Information about one specific cache
cat /sys/kernel/slab/kmalloc-256/objects
cat /sys/kernel/slab/kmalloc-256/objs_per_slab
cat /sys/kernel/slab/kmalloc-256/order

8.7.3 调试示例

查找内存泄漏：

# 1. Record the initial state
cat /proc/slabinfo > /tmp/slab_before

# 2. Run the test program

# 3. Record the final state
cat /proc/slabinfo > /tmp/slab_after

# 4. Compare the two snapshots
diff /tmp/slab_before /tmp/slab_after

# 5. Use kmemleak: trigger a scan, then read the report
echo scan > /sys/kernel/debug/kmemleak
cat /sys/kernel/debug/kmemleak

跟踪分配：

# Enable tracing (needs CONFIG_SLUB_DEBUG; events go to the kernel log)
echo 1 > /sys/kernel/slab/dentry/trace

# Read the trace events from the kernel log
dmesg | tail

# Allocation call-site summary lives in debugfs, not sysfs
# (the cache needs user tracking, e.g. slab_debug=U,dentry)
cat /sys/kernel/debug/slab/dentry/alloc_traces

# Disable tracing
echo 0 > /sys/kernel/slab/dentry/trace

8.7.4 SLUB 统计

/* Location: mm/slub.c */
/*
 * SLUB statistics counters (CONFIG_SLUB_STATS). The final enumerator,
 * NR_SLUB_STAT_ITEMS, is the item count used to size the per-CPU stat[]
 * array in struct kmem_cache_cpu.
 */
enum stat_item {
    ALLOC_FASTPATH,         /* allocation from the fast path */
    ALLOC_SLOWPATH,         /* allocation from the slow path */
    FREE_FASTPATH,          /* free via the fast path */
    FREE_SLOWPATH,          /* free via the slow path */
    FREE_FROZEN,            /* free to a frozen slab */
    FREE_ADD_PARTIAL,       /* free moved the slab onto a partial list */
    FREE_REMOVE_PARTIAL,    /* free removed the slab from a partial list */
    ALLOC_FROM_PARTIAL,     /* CPU slab acquired from the node partial list */
    ALLOC_SLAB,             /* new slab allocated */
    ALLOC_REFILL,           /* CPU freelist refilled from the slab's freelist */
    ALLOC_NODE_MISMATCH,    /* CPU slab was on the wrong node */
    FREE_SLAB,              /* slab freed */
    CPUSLAB_FLUSH,          /* CPU slab flushed */
    DEACTIVATE_FULL,        /* deactivated CPU slab was full */
    DEACTIVATE_EMPTY,       /* deactivated CPU slab was empty */
    DEACTIVATE_TO_HEAD,     /* deactivated slab put at head of partial list */
    DEACTIVATE_TO_TAIL,     /* deactivated slab put at tail of partial list */
    DEACTIVATE_REMOTE_FREES,/* slab had remotely freed objects */
    DEACTIVATE_BYPASS,      /* deactivation bypassed */
    ORDER_FALLBACK,         /* allocation fell back to a smaller order */
    CMPXCHG_DOUBLE_CPU_FAIL,/* per-CPU (freelist, tid) cmpxchg failed */
    CMPXCHG_DOUBLE_FAIL,    /* slab freelist update failed */
    CPU_PARTIAL_ALLOC,      /* CPU partial slab used on allocation */
    CPU_PARTIAL_FREE,       /* CPU partial list refilled on free */
    CPU_PARTIAL_NODE,       /* CPU partial list refilled from node partial */
    CPU_PARTIAL_DRAIN,      /* CPU partial list drained to node partial */
    NR_SLUB_STAT_ITEMS      /* number of items; must stay last */
};

/* 查看统计: 需要 CONFIG_SLUB_STATS，每个统计项对应一个独立的 sysfs 文件 */
cat /sys/kernel/slab/<cache>/alloc_fastpath
cat /sys/kernel/slab/<cache>/alloc_slowpath

8.8 性能优化

8.8.1 Per-CPU 优化

/* Define a per-CPU variable */
static DEFINE_PER_CPU(unsigned long, counters);

/* Access */
this_cpu_inc(counters);              /* lock-free and safe against preemption */
__this_cpu_inc(counters);             /* faster, but the caller must provide the protection */

/*
 * With preemption already disabled the cheaper __this_cpu_inc() is the
 * right call; this_cpu_inc() needs no such bracket.
 */
preempt_disable();
__this_cpu_inc(counters);
preempt_enable();

8.8.2 批量操作

/* Bulk allocation: request 10 objects in one call; n is the call's result */
void *objects[10];
int n = kmem_cache_alloc_bulk(cache, GFP_KERNEL, 10, objects);

/* Bulk free: release the n objects only when the call reported any */
if (n > 0) {
    kmem_cache_free_bulk(cache, n, objects);
}

8.8.3 NUMA 优化

/* Allocate from the local node */
void *obj = kmem_cache_alloc_node(cache, GFP_KERNEL, numa_node_id());

/* Allocate from a specific node (distinct name: both lines can share one scope) */
void *buf = kmalloc_node(size, GFP_KERNEL, node);

8.8.4 对象构造

/* Cache whose objects are initialized through a constructor */
struct kmem_cache *cache;

/*
 * Constructor handed to kmem_cache_create() below: sets up the list head
 * embedded in the object.
 */
void ctor(void *obj)
{
    struct my_struct *item = obj;

    INIT_LIST_HEAD(&item->list);
    /* further initialization goes here */
}

/* Arguments: name, object size, alignment, flags (none), constructor */
cache = kmem_cache_create("my_cache", sizeof(struct my_struct),
                          __alignof__(struct my_struct), 0, ctor);

8.9 常见问题

8.9.1 Slab 泄漏

症状：

系统内存持续增长
/proc/meminfo 中 Slab 持续增加
系统性能下降

诊断：

# Rank all caches by active object count (column 2), largest first.
# Skip the two header lines and sort BEFORE truncating, so the top 20
# come from the whole file rather than from its first 20 lines.
watch -n 1 'tail -n +3 /proc/slabinfo | sort -k2 -nr | head -20'

# Use kmemleak
echo scan > /sys/kernel/debug/kmemleak
cat /sys/kernel/debug/kmemleak

8.9.2 Slab 碎片

症状：

大量 partial slabs
内存利用率低

解决：

# Cache merging is decided at boot (slab_merge / slab_nomerge on the kernel
# command line); there is no runtime /sys/kernel/slab/merge switch.
# To release the empty slabs of one cache, use its shrink attribute:
echo 1 > /sys/kernel/slab/dentry/shrink

# Reclaim slab objects (dentries and inodes)
echo 2 > /proc/sys/vm/drop_caches

8.9.3 性能问题

症状：

分配/释放慢
高 CPU 使用率

解决：

检查是否使用了正确的分配器
考虑使用批量分配
优化对象大小
检查是否有缓存抖动

8.10 本章小结

本章详细介绍了 Linux 6.12 的内存分配器系统：

8.10.1 PCP (Per-CPU Pageframe Allocator)

设计目的: 减少 zone->lock 竞争
关键结构: struct per_cpu_pages
工作原理: Per-CPU 页面缓存，批量操作
支持阶数: 0 到 PAGE_ALLOC_COSTLY_ORDER (3)

8.10.2 SLUB 分配器 (默认)

设计目标: 简化结构，提高性能
关键结构: struct kmem_cache_cpu, struct kmem_cache_node
Slab 状态: CPU Slab, CPU Partial, Node Partial, Full
无锁分配: 基于 TID 的 lockless fastpath

8.10.3 SLAB 分配器 (传统)

特点: 三队列管理，功能完整
状态: 6.5 标记为过时，6.8 已移除
适用: 仅作历史参考 (Linux 6.12 中已不存在)

8.10.4 SLOB 分配器

状态: 已在 Linux 6.4 中移除
原因: SLUB 已足够高效

8.10.5 分配层次

用户接口 (kmalloc/kmem_cache_alloc)
    │
    ▼
Slab 分配器 (SLUB)
    │
    ▼
PCP (Per-CPU Pageframe Allocator)
    │
    ▼
伙伴系统 (Buddy System)
    │
    ▼
物理内存

8.10.6 关键函数位置

函数	位置
`__rmqueue_pcplist`	mm/page_alloc.c:3021
`rmqueue_pcplist`	mm/page_alloc.c:3052
`free_unref_page`	mm/page_alloc.c:2675
`free_pcppages_bulk`	mm/page_alloc.c:1171
`slab_alloc_node`	mm/slub.c:4127
`__slab_alloc_node`	mm/slub.c:3934
`kmem_cache_alloc`	mm/slub.c:4158

下一章将介绍缺页异常处理。

Linux内核分析之内存管理-08

Linux 6.12 内存管理详解 - 第8章：Slab 分配器与 PCP

8.1 概述：内存分配层次结构

8.2 PCP (Per-CPU Pageframe Allocator)

8.2.1 PCP 设计目的

8.2.2 PCP 数据结构

8.2.3 PCP 工作原理

8.2.4 PCP 分配流程

8.2.5 PCP 释放流程

8.2.6 PCP 批量释放到伙伴系统

8.2.7 PCP 水位衰减

8.2.8 PCP 支持的分配阶数

8.3 SLUB 分配器 (默认实现)

8.3.1 SLUB 概述

8.3.2 SLUB 数据结构

8.3.3 SLUB Slab 状态

8.3.4 SLUB 分配流程

8.3.5 SLUB 释放流程

8.3.6 SLUB slab 分配

8.3.7 SLUB 对象布局

8.4 SLAB 分配器 (传统实现)

8.4.1 SLAB 概述

8.4.2 SLAB 数据结构

8.4.3 SLAB 三队列管理

8.5 SLOB 分配器 (已移除)

8.5.1 SLOB 概述

8.6 kmalloc 实现

8.6.1 kmalloc 缓存数组

8.6.2 kmalloc 实现

8.6.3 kmalloc 大小选择

8.7 调试与监控

8.7.1 Slab 调试选项

8.7.2 调试接口

8.7.3 调试示例

8.7.4 SLUB 统计

8.8 性能优化

8.8.1 Per-CPU 优化

8.8.2 批量操作

8.8.3 NUMA 优化

8.8.4 对象构造

8.9 常见问题

8.9.1 Slab 泄漏

8.9.2 Slab 碎片

8.9.3 性能问题

8.10 本章小结

8.10.1 PCP (Per-CPU Pageframe Allocator)

8.10.2 SLUB 分配器 (默认)

8.10.3 SLAB 分配器 (传统)

8.10.4 SLOB 分配器

8.10.5 分配层次

8.10.6 关键函数位置