linux内核的进程调度—调度策略

(注:下文代码片段均出自linux内核版本4.1.15


在linux内核中,默认实现了5种调度策略:

(1)stop调度策略

(2)deadline调度策略

(3)realtime调度策略

(4)CFS调度策略

(5)idle调度策略

以上5种调度策略通过.next指针串联在一起。如下代码片段(/kernel/sched/sched.h):

#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
   for (class = sched_class_highest; class; class = class->next)

for_each_class宏在check_preempt_curr()函数pick_next_task()函数中被使用,从而遍历由5个调度策略组成的调度器链表。

对于调度策略的选择,根据:不同的进程采用不同的调度策略

源码中,5种调度策略的具体实例都使用struct sched_class来定义调度类(/kernel/sched/sched.h):

struct sched_class {
	const struct sched_class *next;

	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
	void (*yield_task) (struct rq *rq);
	bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);

	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

	/*
	 * It is the responsibility of the pick_next_task() method that will
	 * return the next task to call put_prev_task() on the @prev task or
	 * something equivalent.
	 *
	 * May return RETRY_TASK when it finds a higher prio class has runnable
	 * tasks.
	 */
	struct task_struct * (*pick_next_task) (struct rq *rq,
						struct task_struct *prev);
	void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
	int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

	void (*post_schedule) (struct rq *this_rq);
	void (*task_waking) (struct task_struct *task);
	void (*task_woken) (struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);
#endif

	void (*set_curr_task) (struct rq *rq);
	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
	void (*task_fork) (struct task_struct *p);
	void (*task_dead) (struct task_struct *p);

	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
	 * cannot assume the switched_from/switched_to pair is serliazed by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
	void (*switched_from) (struct rq *this_rq, struct task_struct *task);
	void (*switched_to) (struct rq *this_rq, struct task_struct *task);
	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
			     int oldprio);

	unsigned int (*get_rr_interval) (struct rq *rq,
					 struct task_struct *task);

	void (*update_curr) (struct rq *rq);

#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};

【设置调度策略】

用户空间程序可以使用sched_setscheduler()来设定用户进程的调度策略。(/include/uapi/linux/sched.h)

/*
 * Scheduling policies
 */
#define SCHED_NORMAL		0
#define SCHED_FIFO		1
#define SCHED_RR		2
#define SCHED_BATCH		3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE		5
#define SCHED_DEADLINE		6
SCHED_NORMAL和SCHED_BATCH使用CFS调度器
   
 SCHED_FIFO和SCHED_RR使用realtime调度器
 
 SCHED_IDILE使用idle调度器
 
 SCHED_DEADLINE使用deadline调度器
一、调度策略
(1-1)stop调度策略

​ 优先级最高的调度类,特点是:可以抢占其他所有进程,不能被其他进程所抢占。如下为stop调度器的具体实例stop_sched_class(/kernel/sched/stop_task.c):

const struct sched_class stop_sched_class = {
	.next			= &dl_sched_class,

	.enqueue_task		= enqueue_task_stop,
	.dequeue_task		= dequeue_task_stop,
	.yield_task		= yield_task_stop,

	.check_preempt_curr	= check_preempt_curr_stop,

	.pick_next_task		= pick_next_task_stop,
	.put_prev_task		= put_prev_task_stop,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_stop,
#endif

	.set_curr_task          = set_curr_task_stop,
	.task_tick		= task_tick_stop,

	.get_rr_interval	= get_rr_interval_stop,

	.prio_changed		= prio_changed_stop,
	.switched_to		= switched_to_stop,
	.update_curr		= update_curr_stop,
};
(1-2)deadline调度策略

​ 使用红黑树,把进程按照绝对截止期限进行排序,选择最小期限进程调度运行。如下为deadline调度策略的具体实例dl_sched_class(/kernel/sched/deadline.c):

const struct sched_class dl_sched_class = {
	.next			= &rt_sched_class,
	.enqueue_task		= enqueue_task_dl,
	.dequeue_task		= dequeue_task_dl,
	.yield_task		= yield_task_dl,

	.check_preempt_curr	= check_preempt_curr_dl,

	.pick_next_task		= pick_next_task_dl,
	.put_prev_task		= put_prev_task_dl,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_dl,
	.set_cpus_allowed       = set_cpus_allowed_dl,
	.rq_online              = rq_online_dl,
	.rq_offline             = rq_offline_dl,
	.post_schedule		= post_schedule_dl,
	.task_woken		= task_woken_dl,
#endif

	.set_curr_task		= set_curr_task_dl,
	.task_tick		= task_tick_dl,
	.task_fork              = task_fork_dl,
	.task_dead		= task_dead_dl,

	.prio_changed           = prio_changed_dl,
	.switched_from		= switched_from_dl,
	.switched_to		= switched_to_dl,

	.update_curr		= update_curr_dl,
};
(1-3)realtime调度策略

​ 为每个优先级维护一个队列。如下为realtime调度策略的具体实例(/kernel/sched/rt.c):

const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed       = set_cpus_allowed_rt,
	.rq_online              = rq_online_rt,
	.rq_offline             = rq_offline_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task          = set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};
(1-4)CFS调度策略

​ 采用完全公平调度算法,引入虚拟运行时间概念。如下fair_sched_class的cfs具体实例(/kernel/sched/fair.c):

const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,
	.migrate_task_rq	= migrate_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

	.update_curr		= update_curr_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
(1-5)idle调度策略

​ 每个CPU都会有一个idle线程,当没有其他进程可以调度时,调度运行idle线程。idle_sched_classidle调度器的具体实例(/kernel/sched/idle_task.h):

const struct sched_class idle_sched_class = {
	/* .next is NULL */
	/* no enqueue/yield_task for idle tasks */

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,

	.check_preempt_curr	= check_preempt_curr_idle,

	.pick_next_task		= pick_next_task_idle,
	.put_prev_task		= put_prev_task_idle,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_idle,
#endif

	.set_curr_task          = set_curr_task_idle,
	.task_tick		= task_tick_idle,

	.get_rr_interval	= get_rr_interval_idle,

	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
	.update_curr		= update_curr_idle,
};

注:在(/kernel/sched/core.c)init_idle()函数中为指定cpu进行idle调度器的初始化

二、调度时刻

​ 调度的本质是:选择下一个进程,然后进行线程切换。在执行调度之前需要设置调度标记TIF_NEED_RESCHED,然后在调度的时候判断进程是否被设置TIF_NEED_RESCHED,如果设置则调用schedule来进行调度。

​ (2-1)设置调度标记

​ CPU上正在运行的进程由thread_info(/arch/arm/include/asm/thread_info.h)描述,设置调度标记本质就是设置thread_info结构体中的flags成员为TIF_NEED_RESCHED

​ 在以下五个时刻会设置TIF_NEED_RESCHED

​ 1、scheduler_tick时钟中断

​ 2、wake_up_process唤醒进程的时刻

​ 3、使用do_fork创建新进程的时候

​ 4、set_user_nice修改进程nice值的时候

​ 5、smp_send_reschedule负载均衡的时候

三、执行调度

​ linux kernel会判当前今标记是否是TIF_NEED_RESCHED,如果是,则会调用schedule()函数执行调度,切换上下文。

​ 那么在哪些情况下会执行schedule()函数呢?

​ (1)阻塞操作:互斥量(Mutex)、信号量(Semaphore)、等待队列(waitqueue)等。

​ (2)在中断返回前和系统调用返回用户空间时,去检查TIF_NEED_RESCHED标志位判断是否需要调度。

​ (3)将要被唤醒的进程不会马上调用schedule()要求被调度,而是会被添加到CFS就绪队列中,并且设置TIF_NEED_RESCHED标志位。那么唤醒进程什么时候被调度呢?这要根据内核是否具体可抢占功能(CONFIG_PREEMPT=y)分两种情况:

​ 【如果内核不可抢占】,则:

  • ​ 当前进程调用cond_resched()时会检查是否要调度。
  • ​ 主动调度调用schedule()
  • ​ 系统调用或者异常处理返回用户空间时
  • ​ 中断处理完成返回用户空间时

​ 【如果内核可抢占】,则:

  • 如果唤醒动作发生在系统调用或异常处理上下文中,在下一次调用preempt_enable()时会检查是否需要抢占调度。
  • 如果唤醒动作发生在硬件中断处理上下文中,硬件中断处理返回前夕会检查是否要抢占当前进程。

硬件中断返回前夕硬件中断返回用户空间前前夕是两个不同的概念。硬件中断返回前夕是每次返回前夕都会检查是否进程需要被抢占调度,不管中断发生点是在内核空间,还是用户空间;硬件中断返回用户空间前夕是只有中断发生点在用户空间才会检查。


本人由于知识与精力有限,如果分享的文章存在有不妥的地方,请多多批评!哈哈。


搜索/关注【嵌入式小生vx公众号,获取更多精彩内容>>>>
请添加图片描述

Logo

为开发者提供学习成长、分享交流、生态实践、资源工具等服务,帮助开发者快速成长。

更多推荐