kvm虚拟化 steal time 机制详解

steal time 是指在虚拟化的环境下，管理机（host os，如 linux）窃取的虚拟机中的时间（虚拟机上的一个 vcpu 对应主机上一个线程，当该线程未在运行时，则是主机窃取的虚机时间），即 vcpu 没有在运行的时间。在虚机中执行 top 命令，其中有一个 st 字段，该字段数据则是描述主机窃取时间占比的。ps：对于真实物理机器：该字段永远为 0，如果是 guest os：该字段可能不为 0。

内核新视界

1521人浏览 · 2024-01-05 11:10:37

内核新视界 · 2024-01-05 11:10:37 发布

文章目录

1 steal time 机制介绍

steal time 是指在虚拟化的环境下，管理机（host os，如 linux）窃取的虚拟机中的时间（虚拟机上的一个 vcpu 对应主机上一个线程，当该线程未在运行时，则是主机窃取的虚机时间），即 vcpu 没有在运行的时间。

在虚机中执行 top 命令，其中有一个 st 字段，该字段数据则是描述主机窃取时间占比的。
ps：对于真实物理机器：该字段永远为 0，如果是 guest os：该字段可能不为0，如果该字段占比较大，说明主机任务比较繁忙，该虚机被调度较少。
st 数据可以让 guest 看到自己真正占用的 cpu 时间比例；如果 st 值较高，则说明主机窃取时间多，主机管理端繁忙。

要支持 steal time 机制，需要内核启用 CONFIG_PARAVIRT 半虚拟化配置，这样内核可以在运行时检测自己处于虚拟机状态。

（下面的讲解都以 x86 的硬件支持为例）
steal time 机制又由 MSR_KVM_STEAL_TIME 特性支持，虚拟机可以通过读取特定字段来判断自己在虚机中是否支持 steal time 特性。该特性由下面的数据结构实现支持：

#define MSR_KVM_STEAL_TIME  0x4b564d03

/*
 * Steal-time record shared between host and guest. The guest registers the
 * address of one such structure per vCPU through MSR_KVM_STEAL_TIME and the
 * host fills it in.
 */
struct kvm_steal_time {
	/* Time this vCPU was not running, in nanoseconds. */
	__u64 steal;
	/*
	 * Sequence counter. The guest reads it before and after reading the
	 * record; the two values must match and be even. An odd value means
	 * the host is in the middle of an update and the read must be retried.
	 */
	__u32 version;
	/* Currently unused; intended to signal future changes to this layout. */
	__u32 flags;
	/* Preemption state of the vCPU plus related bits (e.g. TLB flush request). */
	__u8  preempted;
	/* Padding. */
	__u8  u8_pad[3];
	/* Padding, reserved for future extension. */
	__u32 pad[11];
};

主机管理机定期更新填充该数据结构，每个 vcpu 只需要一次写操作或注册。
此结构的更新间隔不固定，会根据任务调度、出队入队等事件不定时更新。
主机可能在上述任意时刻更新该结构，直到虚机向该 MSR 写入 0（即清除使能位）为止，此时该功能被禁用。
version 是一个序列计数器，虚机在读取该结构数据之前和之后需要检查该字段，
并且确保前后两者值相等，并且为偶数，如果为奇数则说明正在更新，需要重新获取。
flags 目前未使用，将来将会用来表示该结构的变化。
steal 记录该 vcpu 未运行的时间，以纳秒为单位。
preempted 记录该 vcpu 抢占状态以及其他一些信息，包括 vcpu 未运行，tlb flush op。
其他字段为填充字段，将来用于拓展。

2 guest os 中 steal time 初始化流程

一些补充：
为什么 guest os （linux）需要去检测自己是否处于虚拟化环境？
linux 启动中可以检测自己处于虚拟化环境运行，这一步是有意义的，知道自己处于虚拟化环境，那么许多操作可以基于虚拟化进行优化，替换原有的默认操作，以此提高性能或者相关安全性，比如 steal time 机制可以替换 clock 相关接口，用于记录被管理机窃取的运行时间，从而重新计算任务的调度时间片，使其虚机上运行的任务得到真实的运行时间。
又比如 tlb flush 操作可以替换为 kvm 的特定 tlb flush 操作，以提高刷新 flush 的性能，这在实际应用中性能提高非常明显。

当 guestOS 启动时，有如下路径（注意：该流程为 guest OS （linux）的初始化流程）：

setup_arch
  -> init_hypervisor_platform
  -> x86_init.hyper.guest_late_init();
  

当系统启动时 guest_late_init 拥有如下默认值：
struct x86_init_ops x86_init __initdata = {
...
	.hyper = {
		.init_platform		= x86_init_noop,
		.guest_late_init	= x86_init_noop,
		.x2apic_available	= bool_x86_init_noop,
		.init_mem_mapping	= x86_init_noop,
		.init_after_bootmem	= x86_init_noop,
	},
...
void x86_init_noop(void) { }

即 x86_init.hyper.guest_late_init() 操作为空，但是在之前还有一个 init_hypervisor_platform。
它将检测到自己正处于虚机运行状态，此时在支持了半虚拟化下则会将许多默认接口替换为虚拟化接口。
/*
 * Detect which hypervisor (if any) the kernel is running on and install its
 * callbacks in place of the default x86_init / x86_platform hooks.
 */
void __init init_hypervisor_platform(void)
{
	const struct hypervisor_x86 *h;

	h = detect_hypervisor_vendor();

	/* No hypervisor matched: keep the default callbacks. */
	if (!h)
		return;
	/*
	 * h is set, so we are running as a guest: copy the hypervisor's
	 * callbacks over the default ones.
	 */
	copy_array(&h->init, &x86_init.hyper, sizeof(h->init));
	copy_array(&h->runtime, &x86_platform.hyper, sizeof(h->runtime));

	x86_hyper_type = h->type;
	/* Run the platform init hook that was just installed. */
	x86_init.hyper.init_platform();
}

/*
 * Probe every entry of hypervisors[] and return the one whose detect()
 * callback reports the highest non-zero value, or NULL if none does.
 */
static inline const struct hypervisor_x86 * __init
detect_hypervisor_vendor(void)
{
	const struct hypervisor_x86 *h = NULL, * const *p;
	uint32_t pri, max_pri = 0;

	/* Walk all candidate hypervisors and run each one's detect(). */
	for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
		pri = (*p)->detect();
		/* Strictly greater: on a tie the earlier entry wins. */
		if (pri > max_pri) {
			max_pri = pri;
			h = *p;
		}
	}

	if (h)
		pr_info("Hypervisor detected: %s\n", h->name);

	return h;
}

/*
 * Candidate hypervisors probed by detect_hypervisor_vendor(). Entries guarded
 * by #ifdef are only present when the matching guest support is configured.
 */
static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
#ifdef CONFIG_XEN_PV
	&x86_hyper_xen_pv,
#endif
#ifdef CONFIG_XEN_PVHVM
	&x86_hyper_xen_hvm,
#endif
	&x86_hyper_vmware,
	&x86_hyper_ms_hyperv,
#ifdef CONFIG_KVM_GUEST
	&x86_hyper_kvm,
#endif
#ifdef CONFIG_JAILHOUSE_GUEST
	&x86_hyper_jailhouse,
#endif
};

/*
 * Hypervisor descriptor for KVM: once kvm_detect matches, the .init callbacks
 * below replace the default x86_init.hyper hooks, so guest_late_init becomes
 * kvm_guest_init.
 */
const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name			= "KVM",
	.detect			= kvm_detect,
	.type			= X86_HYPER_KVM,
	.init.guest_late_init	= kvm_guest_init,
	.init.x2apic_available	= kvm_para_available,
	.init.init_platform	= kvm_init_platform,
};

kvm_detect
  -> kvm_cpuid_base
    -> __kvm_cpuid_base

/*
 * Look for the "KVMKVMKVM" signature via CPUID; finding it means the kernel
 * is running under KVM. Returns 0 when the CPUID level is negative or the
 * hypervisor feature bit is not set; otherwise returns whatever
 * hypervisor_cpuid_base() reports for the signature.
 */
static noinline uint32_t __kvm_cpuid_base(void)
{
	if (boot_cpu_data.cpuid_level < 0)
		return 0;	/* So we don't blow up on old processors */

	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);

	return 0;
}

那么到这里，一个 guestOS 则会最终调用 kvm_guest_init，做 kvm 虚拟化的初始化：
/*
 * KVM guest initialisation (excerpt: unrelated parts are elided with "...").
 * Installs the paravirt steal-clock and TLB-flush callbacks when the host
 * advertises the corresponding features.
 */
static void __init kvm_guest_init(void)
{
...
	/* Steal time available: read it through kvm_steal_clock from now on. */
	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_time_ops.steal_clock = kvm_steal_clock;
	}

	/*
	 * The PV TLB flush is only installed when steal time is available as
	 * well and the realtime hint is not set.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
		pv_mmu_ops.tlb_remove_table = tlb_remove_table;
	}
...
}
可以看到 pv_time_ops.steal_clock 为 kvm_steal_clock，
pv_mmu_ops.flush_tlb_others 也被替换为了 kvm_flush_tlb_others。

接着在每个cpu的路径上会调用到 kvm_guest_cpu_init：
/*
 * Per-CPU KVM guest setup (excerpt). Registers this CPU's steal-time area
 * when steal time was enabled by kvm_guest_init.
 */
static void kvm_guest_cpu_init(void)
{
...
	if (has_steal_clock)
		kvm_register_steal_time();
}

static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);

/*
 * Register the calling CPU's steal_time area with the host: each vCPU writes
 * the physical address of its own per-CPU steal_time variable, OR-ed with
 * KVM_MSR_ENABLED, into MSR_KVM_STEAL_TIME.
 */
static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	/* Nothing to do if steal time was not enabled at init. */
	if (!has_steal_clock)
		return;

	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
		cpu, (unsigned long long) slow_virt_to_phys(st));
}

注意这里是虚机运行，当写 msr 时会触发异常退出到 host，在 vcpu 退出时将检测退出原因：
如是 vmx 则是下面，如果是仿真，则是其他路径：
vmx_handle_exit
  -> kvm_vmx_exit_handlers[exit_reason](vcpu);
static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
...
	[EXIT_REASON_MSR_WRITE]               = handle_wrmsr,
...
}

handle_wrmsr
  -> kvm_set_msr
	case MSR_KVM_STEAL_TIME:
		if (unlikely(!sched_info_on()))
			return 1;
		if (data & KVM_STEAL_RESERVED_MASK)
			return 1;
		if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
						data & KVM_STEAL_VALID_BITS,
						sizeof(struct kvm_steal_time)))
			return 1;
		vcpu->arch.st.msr_val = data;
		if (!(data & KVM_MSR_ENABLED))
			break;
		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
		break;

可以看到，vcpu 通过 msr 写入的物理地址（gpa）将会被保存到 host 的 vcpu->arch.st.stime 中，
这样即可建立主机与 guest 对应内存的访问方式。

3 guest os 使用 steal time

首先是：

pv_time_ops.steal_clock = kvm_steal_clock;

/*
 * Return the accumulated steal time of @cpu as published by the host in the
 * per-CPU steal_time area.
 *
 * The read is retried until a consistent snapshot is seen: the version must
 * be even and unchanged across the read of ->steal.
 */
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		/* Order the version read before the data read. */
		virt_rmb();
		steal = src->steal;
		/* Order the data read before re-checking the version. */
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

/*
 * Paravirt entry point for reading the steal time of @cpu: dispatches to
 * whatever is installed in pv_time_ops.steal_clock (kvm_steal_clock on a KVM
 * guest with steal time enabled).
 */
static inline u64 paravirt_steal_clock(int cpu)
{
	return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu);
}

虚机通过读取 &per_cpu(steal_time)->steal 获取到主机窃取的 vcpu 时间。

主要在两个地方使用：

paravirt_steal_clock
  -> steal_account_process_time
  -> update_rq_clock_task
  
(1)steal_account_process_time
该接口主要看一下 account_process_tick
account_process_tick
  -> update_process_times
    -> tick_sched_handle
在某些定时器中将会更新该值：
/*
 * Account the steal time accrued on this CPU since the previous call, capped
 * at @maxtime, and return the amount accounted. Returns 0 when paravirt
 * steal accounting is compiled out or not enabled.
 */
static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
	if (static_key_false(&paravirt_steal_enabled)) {
		u64 steal;

		/*
		 * Read the current steal time of this CPU through
		 * paravirt_steal_clock() and subtract the previously recorded
		 * prev_steal_time: the difference is the time stolen during
		 * this interval. account_steal_time() adds it to the steal
		 * CPU statistic, which is what tools such as top report as st.
		 * prev_steal_time is then advanced by the accounted amount,
		 * and that amount is returned to the caller.
		 */
		steal = paravirt_steal_clock(smp_processor_id());
		steal -= this_rq()->prev_steal_time;
		steal = min(steal, maxtime);
		account_steal_time(steal);
		this_rq()->prev_steal_time += steal;

		return steal;
	}
#endif
	return 0;
}

/*
 * Tick-time CPU accounting (excerpt: setup of cputime/rq is elided).
 *
 * steal_account_process_time() returns the time stolen from this CPU; that
 * part is subtracted from the tick so only the time the guest really ran is
 * charged as user/system/idle. Where steal accounting is inactive (e.g. on
 * the host) the returned steal is 0.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
...
	steal = steal_account_process_time(ULONG_MAX);

	/* The whole tick was stolen: nothing left to charge. */
	if (steal >= cputime)
		return;

	cputime -= steal;

	if (user_tick)
		account_user_time(p, cputime);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime);
	else
		account_idle_time(cputime);
}

（2）update_rq_clock_task
update_rq_clock_task
  -> update_rq_clock
该处处于系统调度的重要使用函数，在任意需要更新调度时间的地方均会调用 update_rq_clock_task。
/*
 * Advance rq->clock_task by @delta with stolen time removed (excerpt: the
 * beginning of the body and the end of the function are elided).
 */
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
...
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		/*
		 * Read the steal time of this rq's CPU and subtract the value
		 * recorded last time, giving the time stolen in this interval.
		 * That amount (capped at delta) is removed from the elapsed
		 * time so only the time really run remains, and it is added to
		 * prev_steal_time_rq for the next computation.
		 */
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		update_irq_load_avg(rq, irq_delta + steal);
#endif

从上可以看到，通过在关键路径计算中减去 steal time的时间，得到虚机实际的运行时间，通过这种方式，虚机内部就可以感知到被窃取的时间，并通过计算将窃取时间从实际时间中移除，得到系统真实的运行时间。

4 host os steal time 时间记录与更新

前面看到的都是 guest os 如何使用 steal time 时间，那么该部分时间从哪里来的呢？
steal time 时间则是由 host os 来负责记录与更新。

通过启动流程可以知道，percpu 的 steal time 的物理地址通过 msr 已经传递给了 host 主机，由 host 在合适的时机去更新 steal time，具体位置在下面的 record_steal_time 中，每当 vcpu 即将开始运行时，都会更新 steal time 时间，让虚机得到真实的窃取时间。

/*
 * Host side: publish the latest steal time of @vcpu into the guest's
 * registered kvm_steal_time area. The update is bracketed by version bumps
 * (odd while updating, even when done) so the guest can detect and retry a
 * torn read.
 */
static void record_steal_time(struct kvm_vcpu *vcpu)
{
	/* The guest has not enabled steal time through the MSR. */
	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
		return;

	/*
	 * The guest's MSR write initialised vcpu->arch.st.stime with the guest
	 * physical address of its per-CPU steal_time area (see
	 * kvm_gfn_to_hva_cache_init in the MSR write handler), which lets the
	 * host reach that memory. Read the guest's current record into the
	 * host-side copy vcpu->arch.st.steal.
	 */
	if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
		return;

	/*
	 * Doing a TLB flush here, on the guest's behalf, can avoid
	 * expensive IPIs.
	 */
	if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
		kvm_vcpu_flush_tlb(vcpu, false);

	/* Make sure we start from an even version. */
	if (vcpu->arch.st.steal.version & 1)
		vcpu->arch.st.steal.version += 1;  /* first time write, random junk */

	/*
	 * Make the version odd: the host is updating. A guest that sees an odd
	 * version retries its read until the version is even again.
	 */
	vcpu->arch.st.steal.version += 1;

	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));

	smp_wmb();

	/*
	 * Add the growth of current's run_delay since the last update to
	 * steal. run_delay is accumulated by the scheduler's sched_info hooks
	 * from the last_queued stamp; NOTE(review): this appears to cover time
	 * spent runnable but waiting on a run queue, not time spent sleeping
	 * off the queue -- confirm against the sched_info code.
	 */
	vcpu->arch.st.steal.steal += current->sched_info.run_delay -
		vcpu->arch.st.last_steal;
	/* Remember the current run_delay for the next computation. */
	vcpu->arch.st.last_steal = current->sched_info.run_delay;

	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));

	smp_wmb();

	/* Update finished: make the version even so the guest may read. */
	vcpu->arch.st.steal.version += 1;

	kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
		&vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
}

通过 record_steal_time，虚机内部的 steal time 会被更新为当前 vcpu 最新的被窃取时间。

run_delay 如何计算的呢？如下：

sched_info_dequeued
	-> dequeue_task（设置 cpumask，迁移task，设置优先级）

sched_info_arrive
	-> __sched_info_switch
		-> sched_info_switch
			-> prepare_task_switch

sched_info_queued
	-> enqueue_task
	-> sched_info_depart
	-> __sched_info_switch (prev)

所有的计算是在 schedule 中计算的：

schedule
  -> __schedule
    -> context_switch
      -> prepare_task_switch(rq, prev, next);
        -> sched_info_switch(rq, prev, next);
          -> __sched_info_switch
          
/*
 * Scheduling-statistics hook for a context switch from @prev to @next on
 * @rq: record prev's departure and next's arrival, skipping the idle task
 * in both cases.
 */
static inline void
__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	/*
	 * prev now departs the CPU.  It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)
		sched_info_depart(rq, prev);

	if (next != rq->idle)
		sched_info_arrive(rq, next);
}

(1)首先看看 sched_info_depart
/*
 * Called when task @t leaves the CPU of @rq. Reports the time since its last
 * arrival to the rq statistics and, if the task is still TASK_RUNNING (i.e.
 * it was switched out while runnable), stamps it as queued so that its wait
 * can later be added to run_delay.
 */
static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
{
	unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;

	rq_sched_info_depart(rq, delta);

	if (t->state == TASK_RUNNING)
		sched_info_queued(rq, t);
}

/*
 * Stamp the moment task @t starts waiting to run: store the current rq clock
 * in sched_info.last_queued, unless a stamp is already pending. Does nothing
 * when sched_info_on() is false.
 */
static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
{
	/*
	 * When reached from sched_info_depart(), this records the time the
	 * outgoing task was switched out into sched_info.last_queued.
	 */
	if (unlikely(sched_info_on())) {
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = rq_clock(rq);
	}
}

/*
 * Called when task @t is about to run on the CPU of @rq. If a last_queued
 * stamp is pending, the time elapsed since then is added to run_delay;
 * last_arrival is set to now and the run count is incremented.
 */
static void sched_info_arrive(struct rq *rq, struct task_struct *t)
{
	unsigned long long now = rq_clock(rq), delta = 0;

	/* No pending stamp means no wait is charged (delta stays 0). */
	if (t->sched_info.last_queued)
		delta = now - t->sched_info.last_queued;
	/* Presumably clears the last_queued stamp -- helper not shown here. */
	sched_info_reset_dequeued(t);
	t->sched_info.run_delay += delta;
	t->sched_info.last_arrival = now;
	t->sched_info.pcount++;

	rq_sched_info_arrive(rq, delta);
}

当任务进行切换时，在 __sched_info_switch 中将会处理 run_delay 的计算。

sched_info_depart 计算的是 prev task 的时间，上一个任务即将被调度出去，将该点时间记录到 last_queued 中。

sched_info_arrive 计算的是 next task 的时间，该任务即将开始运行，将现在时间减去 sched_info.last_queued 时间，得到 run_delay 时间，因为 sched_info.last_queued 记录的是上一次调度出去的时间，所以这一次相减即可得到 run_delay，接着将其累加到 t->sched_info.run_delay 中，并更新 sched_info.last_arrival 为 now，方便下一次计算。

上述代码可以总结出：每当任务被调度出去（且仍处于可运行状态 TASK_RUNNING）或被加入运行队列时，记录当前时间 last_queued；每当任务开始运行时，将当前时间减去上一次的 last_queued 得到 run_delay，也就是任务在运行队列上等待、没有运行的时间。