第13章 CFS 完全公平调度器详解

基于 Linux 6.12.38 源码分析

13.1 CFS 设计理念

13.1.1 完全公平的概念

传统调度器的问题：

基于时间片的调度器需要维护复杂的时间片计算
优先级静态，无法动态调整
进程数增多时时间片变短，上下文切换开销增大

CFS 的解决方案：

没有传统意义上的时间片概念
基于虚拟运行时间 (vruntime) 进行调度
vruntime 最小的任务优先获得 CPU

13.1.2 理想公平模型

理想公平调度器模型:

在 N 个 CPU 上运行 N 个可运行任务，每个任务获得:
理想 CPU 时间 = 实际时间 / N

CFS 目标:
让每个任务的虚拟运行时间 (vruntime) 尽可能接近

13.1.3 CFS 核心思想

/*
 * 核心方程: vruntime += delta_exec * (NICE_0_LOAD / weight)
 *
 * delta_exec: 实际运行时间
 * weight: 任务权重 (基于 nice 值)
 * NICE_0_LOAD: nice 0 的基准权重
 *
 * 权重越高，vruntime 增长越慢 -> 获得更多 CPU 时间
 */

13.2 调度实体 (sched_entity)

13.2.1 数据结构

位置： include/linux/sched.h

struct sched_entity {
    /* 负载权重相关 */
    struct load_weight      load;           /* 任务权重 */
    struct rb_node          run_node;       /* 红黑树节点 */

    /* 虚拟时间相关 */
    u64                     vruntime;        /* 虚拟运行时间 */
    u64                     exec_start;      /* 执行开始时间 */
    u64                     sum_exec_runtime;/* 总运行时间 */
    u64                     prev_sum_exec_runtime; /* 上次总运行时间 */

    /* 队列相关 */
    struct list_head        group_node;     /* 组调度链表 */
    unsigned int            on_rq;          /* 是否在运行队列 */

    /* EEVDF 相关 */
    u64                     deadline;       /* 虚拟截止时间 */
    u64                     min_vruntime;   /* 最小 vruntime */
    s64                     vlag;           /* 虚拟滞后 */
    u64                     slice;          /* 分配的时间片 */

#ifdef CONFIG_FAIR_GROUP_SCHED
    struct sched_entity     *parent;        /* 父实体 */
    struct cfs_rq           *my_q;          /* 自有 cfs_rq */
#endif

#ifdef CONFIG_SMP
    struct sched_avg        avg;            /* PELT 负载平均 */
#endif
};

13.2.2 权重计算

// kernel/sched/core.c
/*
 * nice 值到权重的映射
 * 每增加 1 个 nice 值，权重乘以约 1.25
 */
static const int prio_to_weight[40] = {
 /* -20 */     88761,     71755,     56483,     46273,     36291,
 /* -15 */     29154,     23254,     18705,     14949,     11916,
 /* -10 */      9548,      7620,      6100,      4904,      3906,
 /*  -5 */      3121,      2501,      1991,      1586,      1277,
 /*   0 */      1024,       820,       655,       526,       423,
 /*  +5 */       336,       272,       215,       172,       137,
 /* +10 */       110,        87,        70,        56,        45,
 /* +15 */        36,        29,        23,        18,        15,
};

// nice 0 的基准权重
#define NICE_0_LOAD     1024

13.2.3 权重到优先级的转换

/*
 * 权重与 nice 的关系:
 *
 * weight = 1024 / (1.25 ^ nice)
 *
 * 示例:
 * nice -20 -> weight = 88761  (最高优先级)
 * nice   0 -> weight = 1024
 * nice +19 -> weight = 15     (最低优先级)
 */

13.3 虚拟运行时间 (vruntime)

13.3.1 vruntime 更新

位置： kernel/sched/fair.c

/*
 * 计算 delta 对应的虚拟时间
 *
 * delta_exec: 实际运行时间
 * weight: 任务权重
 * lw: 运行队列权重
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight,
                       struct load_weight *lw)
{
    u64 fact = scale_load_down(weight);
    u32 fact_hi;
    int shift = WMULT_SHIFT;

    __update_inv_weight(lw);

    // 处理高 32 位
    if (unlikely(fact_hi = (u32)(fact >> 32))) {
        int fs = fls(fact_hi);
        shift -= fs;
        fact >>= fs;
    }

    // 乘法
    fact = mul_u32_u32(fact, lw->inv_weight);

    // 处理可能的溢出
    if (fact_hi = (u32)(fact >> 32)) {
        int fs = fls(fact_hi);
        shift -= fs;
        fact >>= fs;
    }

    return mul_u64_u32_shr(delta_exec, fact, shift);
}

/*
 * 计算 fair delta
 * 将实际运行时间转换为虚拟运行时间
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    if (unlikely(se->load.weight != NICE_0_LOAD))
        delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}

13.3.2 vruntime 更新流程

/*
 * 更新当前任务的 vruntime
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    struct rq *rq = rq_of(cfs_rq);
    u64 now = rq_clock_task(rq);
    u64 delta_exec;

    if (unlikely(!curr))
        return;

    // 计算实际运行时间
    delta_exec = now - curr->exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;

    // 更新执行开始时间
    curr->exec_start = now;

    // 更新 vruntime
    curr->vruntime += calc_delta_fair(delta_exec, curr);

    // 更新统计信息
    update_min_vruntime(cfs_rq);
}

13.4 CFS 运行队列

13.4.1 cfs_rq 结构

位置： kernel/sched/sched.h

/* CFS 运行队列 */
struct cfs_rq {
    struct load_weight      load;           /* 队列总负载 */
    unsigned long           nr_running;     /* 运行任务数 */
    u64                     exec_clock;      /* 执行时钟 */

    /* 红黑树 */
    struct rb_root_cached   timeline;        /* 按 timeline 排序的任务树 */

    /* vruntime 相关 */
    u64                     min_vruntime;    /* 最小 vruntime */
    u64                     min_vruntime_copy;
    struct sched_entity     *curr;          /* 当前运行实体 */
    struct sched_entity     *next;          /* 下一个实体 */
    struct sched_entity     *last;          /* 上一个实体 */
    struct sched_entity     *skip;          /* 跳过的实体 */

#ifdef CONFIG_FAIR_GROUP_SCHED
    struct rq               *rq;            /* 所属运行队列 */
    struct task_group       *tg;            /* 任务组 */
    struct cfs_rq           *my_q;          /* 自有队列 */
#endif
};

13.4.2 红黑树组织

CFS 运行队列红黑树结构:

┌─────────────────────────────────────────────────────────────┐
│                    cfs_rq->timeline                         │
│                    (rb_root_cached)                         │
└─────────────────────────────────────────────────────────────┘
                          │
                          ↓
         ┌────────────────────────────────────┐
         │              左子树                 │
         │  vruntime 较大 (较晚需要 CPU)       │
         └────────────────────────────────────┘
                          │
         ┌────────────────────────────────────┐
         │           rb_leftmost               │ ← 最小 vruntime
         │       (下一个被调度任务)            │
         │    se->vruntime = min_vruntime      │
         └────────────────────────────────────┘
                          │
         ┌────────────────────────────────────┐
         │              右子树                 │
         │  vruntime 最大 (最晚需要 CPU)       │
         └────────────────────────────────────┘

属性:
- 按 se->vruntime 排序
- 左节点 vruntime < 父节点 vruntime < 右节点 vruntime
- rb_leftmost 指向最小 vruntime 节点
- O(log n) 插入/删除
- O(1) 查找最小节点

13.4.3 入队操作

/*
 * 将调度实体加入运行队列
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    // 添加到红黑树，按 vruntime/deadline 排序
    rb_add_cached(&se->run_node, &cfs_rq->timeline,
                  __entity_less);
}

/*
 * 比较函数
 */
static inline bool __entity_less(struct rb_node *a, struct rb_node *b)
{
    struct sched_entity *se_a = __node_2_se(a);
    struct sched_entity *se_b = __node_2_se(b);

    return (s64)(se_a->deadline - se_b->deadline) < 0;
}

13.5 任务选择

13.5.1 pick_next_entity

位置： kernel/sched/fair.c

static struct sched_entity *pick_next_entity(struct rq *rq,
                                            struct cfs_rq *cfs_rq)
{
    struct sched_entity *se;

    // 获取最左节点 (最小 vruntime/deadline)
    se = __pick_first_entity(cfs_rq);
    if (!se)
        return NULL;

    // 检查是否应该跳过
    if (cfs_rq->skip == se) {
        struct sched_entity *second;

        // 如果第一个是 skip，获取下一个
        if (se == cfs_rq->curr)
            second = __pick_first_entity(cfs_rq);
        else
            second = __pick_next_entity(se);

        if (second && wakeup_preempt_entity(second, se) < 1)
            se = second;
    }

    return se;
}

/*
 * 获取红黑树最左节点
 */
static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *left = rb_first_cached(&cfs_rq->timeline);

    if (!left)
        return NULL;

    return __node_2_se(left);
}

13.5.2 唤醒抢占检查

/*
 * 检查新任务是否应该抢占当前任务
 */
static int wakeup_preempt_entity(struct sched_entity *curr,
                                struct sched_entity *se)
{
    s64 vdiff = curr->vruntime - se->vruntime;

    // 如果新任务的 vruntime 明显更小，抢占
    if (vdiff <= 0)
        return 1;

    // 检查是否超过抢占阈值
    vdiff = vdiff >> 1;  // 右移 1 位 = 除以 2

    return vdiff > 0;
}

13.6 时间片分配

13.6.1 目标延迟

位置： kernel/sched/fair.c

/*
 * 目标延迟 (sched_latency)
 * - 系统期望每个可运行任务在目标延迟内至少运行一次
 * - 默认约 6ms，随 CPU 数量对数增长
 */
unsigned int sysctl_sched_base_slice = 700000ULL; /* 700us */
static unsigned int normalized_sysctl_sched_base_slice = 700000ULL;

static unsigned int get_update_sysctl_factor(void)
{
    unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
    unsigned int factor;

    switch (sysctl_sched_tunable_scaling) {
    case SCHED_TUNABLESCALING_NONE:
        factor = 1;
        break;
    case SCHED_TUNABLESCALING_LINEAR:
        factor = cpus;
        break;
    case SCHED_TUNABLESCALING_LOG:
    default:
        factor = 1 + ilog2(cpus);  /* 对数缩放 */
        break;
    }

    return factor;
}

13.6.2 计算时间片

/*
 * 计算任务的时间片
 *
 * slice = sched_latency * (load_weight / total_load)
 *
 * 权重越高，时间片越大
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

    for_each_sched_entity(se) {
        struct load_weight *load;
        struct load_weight lw;

        cfs_rq = cfs_rq_of(se);
        load = &cfs_rq->load;

        if (unlikely(!se->on_rq)) {
            lw = cfs_rq->load;
            update_load_add(&lw, se->load.weight);
            load = &lw;
        }

        slice = __calc_delta(slice, se->load.weight, load);
    }

    return slice;
}

/*
 * 计算调度周期
 */
static u64 __sched_period(unsigned long nr_running)
{
    if (nr_running > sched_nr_latency)
        return nr_running * sysctl_sched_base_slice;

    return sysctl_sched_base_slice * (1 + ilog2(nr_running));
}

13.7 负载计算

13.7.1 load_weight 结构

struct load_weight {
    unsigned long           weight;         /* 权重值 */
    unsigned long           inv_weight;     /* 权重的倒数 (用于优化) */
};

/*
 * inv_weight 用于优化除法运算
 * 除法: x / weight
 * 转换: x * inv_weight >> WMULT_SHIFT
 */

13.7.2 权重更新

/*
 * 更新负载权重
 */
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
    lw->weight += inc;
    lw->inv_weight = 0;  /* 触发重新计算 */
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
    lw->weight -= dec;
    lw->inv_weight = 0;
}

static void __update_inv_weight(struct load_weight *lw)
{
    unsigned long w;

    if (likely(lw->inv_weight))
        return;

    w = scale_load_down(lw->weight);

    if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
        lw->inv_weight = 1;
    else if (unlikely(!w))
        lw->inv_weight = WMULT_CONST;
    else
        lw->inv_weight = WMULT_CONST / w;
}

13.8 组调度 (CFS Group Scheduling)

13.8.1 任务组

struct task_group {
    struct cfs_rq           **cfs_rq;       /* 每个 CPU 的 cfs_rq */
    struct rt_rq            **rt_rq;        /* 每个 CPU 的 rt_rq */

    struct list_head        list;           /* 任务组链表 */
    struct task_group       *parent;        /* 父组 */
    struct list_head        siblings;       /* 兄弟组 */
    struct list_head        children;       /* 子组 */

    struct sched_entity     **se;           /* 每个 CPU 的调度实体 */

    /* 共享运行队列 */
    struct rq               **rq;           /* 每个 CPU 的运行队列 */

    /* 带宽控制 */
    struct cfs_bandwidth    cfs_bandwidth;
};

13.8.2 层级结构

任务组层级结构:

根任务组 (root_task_group)
    │
    ├─ 系统服务组
    │   ├─ systemd
    │   └─ cron
    │
    ├─ 用户会话组
    │   ├─ 浏览器组
    │   │   ├─ chrome
    │   │   └─ firefox
    │   └─ 终端组
    │       ├─ bash
    │       └─ vim
    │
    └─ 后台任务组
        ├─ make
        └─ gcc

每个组有自己的 CPU 份额 (shares)

13.9 常用调优参数

13.9.1 /proc/sys/kernel 参数

# 基础时间片 (纳秒)
/proc/sys/kernel/sched_base_slice
# 默认: 700000 (700us)
# 影响: 每个任务的最小执行时间

# 缩放策略
/proc/sys/kernel/sched_tunable_scaling
# 0 = 不缩放
# 1 = 对数缩放 (默认)
# 2 = 线性缩放

# 迁移成本 (纳秒)
/proc/sys/kernel/sched_migration_cost
# 默认: 500000 (500us)
# 影响: 任务迁移的阈值

# CFS 带宽时间片 (微秒)
/proc/sys/kernel/sched_cfs_bandwidth_slice_us
# 默认: 5000 (5ms)

13.9.2 cgroup 控制

# 设置 CPU 份额
echo 1024 > /sys/fs/cgroup/cpu/mygroup/cpu.shares

# 设置带宽限制
echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us   # 100ms
echo 1000000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us # 1s

13.10 本章小结

本章详细介绍了 Linux CFS 完全公平调度器：

设计理念：基于 vruntime 的完全公平调度
调度实体：sched_entity 结构与权重计算
虚拟时间：vruntime 的更新与计算
运行队列：红黑树组织与管理
任务选择：pick_next_entity 与抢占机制
时间片分配：目标延迟与动态计算
负载计算：load_weight 与优化技巧
组调度：任务组层级结构

下一章将介绍 EEVDF 调度器的实现。

Linux内核分析之进程管理-13

第13章 CFS 完全公平调度器详解

13.1 CFS 设计理念

13.1.1 完全公平的概念

13.1.2 理想公平模型

13.1.3 CFS 核心思想

13.2 调度实体 (sched_entity)

13.2.1 数据结构

13.2.2 权重计算

13.2.3 权重到优先级的转换

13.3 虚拟运行时间 (vruntime)

13.3.1 vruntime 更新

13.3.2 vruntime 更新流程

13.4 CFS 运行队列

13.4.1 cfs_rq 结构

13.4.2 红黑树组织

13.4.3 入队操作

13.5 任务选择

13.5.1 pick_next_entity

13.5.2 唤醒抢占检查

13.6 时间片分配

13.6.1 目标延迟

13.6.2 计算时间片

13.7 负载计算

13.7.1 load_weight 结构

13.7.2 权重更新

13.8 组调度 (CFS Group Scheduling)

13.8.1 任务组

13.8.2 层级结构

13.9 常用调优参数

13.9.1 /proc/sys/kernel 参数

13.9.2 cgroup 控制

13.10 本章小结