1.每个物理CPU下面都有一个虚拟CPU的运行队列

2.每个运行队列中的VCPU都有一个credit

3. credit表示VCPU的priority的价值

4.CPU调度的是最先入队的处于under状态下的VCPU

5.每10msec 为一个时间片,响应一次中断。如果被调度的VCPU的credit处于OVER状态,那么它将不再被继续调度,重新计算credit值,调度后面的处于under状态下的第一个VCPU

6.如果进行了3个时间片也就是30msec时,原先的VCPU还是没有处于OVER状态,那么这个VCPU也将不被继续调度,credit值的重计算,同5后面的步骤

7.代码中的计算公式VCPU的credit = credit –CSCHED_CREDITS_PER_TICK (100)+30msec/n(VCPU的数)

8.处于OVER状态的VCPU credit的值不被增加

 

 

概念:Credit, Weight, Cap

想彻底搞清楚这三个词的概念,我想最重要的是把这个函数读懂:sched_credit.c/csched_acct()

Credit: 这是针对Scheduler而言的,而不是针对Domain.

csched_priv.credit = CSCHED_CREDITS_PER_ACCT * #_of_PCPU. (for example: 300 * 4 = 1200)

Weight: 这个是针对Domain而言的,Scheduler根据各个domain的Weight,来分配credit。是一个“相对”的概念

比如说:256:256和512:512是一样的,彼此各占一半。但有什么区别呢?

       512:512相对于256:256,控制的精度更高。

/*

* A domain's fair share is computed using its weight in competition

* with that of all other active domains.

*

* At most, a domain can use credits to run all its active VCPUs

* for one full accounting period. We allow a domain to earn more

* only when the system-wide credit balance is negative.

*/

Cap: 这也是针对Domain而言的,是一个“绝对”概念。100代表一整颗PCPU的Cycles。50代表最多可以运行半个PCPU的Cycles.

在csched_acct这个函数中:

(1) 根据各个domain的weight情况,把total_credit分配到各个domain中

    credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)) / weight_total;

(2) 再把domain的Credit平均分配到domain的各个VCPU中

    credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )) / sdom->active_vcpu_count;

 

本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/snailhit/archive/2010/12/30/6107279.aspx

 

虚拟机(xen)中credit调度算法分析

   宋伟 联想研究院

调度简介

    在虚拟机xen中主要有两种调度算法,一种是credit算法,另一种是sedf算法。Credit算法就是让每一个vcpu(虚拟cpu)都可以公平地使用物理cpu的资源。Sedf算法可以根据每个vcpu负载的大小动态地调整vcpu的优先级。

在虚拟机xen中关于调度的代码是这样的:

/*
 * Initialise the generic scheduler layer.
 *
 * Registers schedule() as the SCHEDULE_SOFTIRQ handler, sets up a lock and a
 * timer for every CPU, selects the entry of schedulers[] whose opt_name
 * matches opt_sched, and finally invokes that scheduler's init hook.
 */
void __init scheduler_init(void)
{
    int i;

    /* Open/register the schedule softirq. */
    open_softirq(SCHEDULE_SOFTIRQ, schedule);

    for_each_cpu ( i )
    {
        /*
         * One lock and one timer per CPU; s_timer_fn is the timer callback.
         * NOTE(review): according to the article, the callback raises the
         * schedule softirq (sets its bit), and the pending-softirq mask is
         * checked after the CPU takes a VM exit, at which point the
         * registered handler runs. None of that is visible in this
         * function -- confirm against s_timer_fn and the softirq code.
         */
        spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
        init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
    }

    /* Copy each candidate into ops; stop at the one named by opt_sched. */
    for ( i = 0; schedulers[i] != NULL; i++ )
    {
        ops = *schedulers[i];
        if ( strcmp(ops.opt_name, opt_sched) == 0 )
            break;
    }

    /*
     * No match: only a message is printed. ops keeps whatever the loop
     * above last copied into it, and initialisation carries on with that.
     */
    if ( schedulers[i] == NULL )
        printk("Could not find scheduler: %s\n", opt_sched);

    printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
    SCHED_OP(init);
}

    schedule.c其实是调度的抽象层,具体的实现在sched_credit.c或sched_sedf.c中。至于使用sedf还是credit算法,由opt_sched指定(见上面scheduler_init中对ops.opt_name与opt_sched的比较)。

Credit算法

Credit算法:

    每个物理cpu都有一个runq,这个runq按照每个vcpu的priority的大小来排序。Priority分为over(above fair share)和under(below fair share)

图(一)调度队列整体结构

图(二)Credit scheduler调度队列具体实现

图(三)内核启动调度器流程

图(四)调度器初始化流程

图(五)Credit 调度的优先级计算方法

我们可以看到bsp对其他的ap的runq队列按照计算的优先级进行排序。

图(六) Credit算法偷取任务流程图

 

任务的优先级次序:
1.CSCHED_PRI_TS_UNDER
2.CSCHED_PRI_TS_OVER
3.CSCHED_PRI_IDLE
 
static void csched_acct(void)计算任务优先级
{
    unsigned long flags;
    struct list_head *iter_vcpu, *next_vcpu;
    struct list_head *iter_sdom, *next_sdom;
    struct csched_vcpu *svc;
    struct csched_dom *sdom;
    uint32_t credit_total;
    uint32_t weight_total;
    uint32_t weight_left;
    uint32_t credit_fair;
    uint32_t credit_peak;
    int credit_balance;
    int credit_xtra;
    int credit;
    spin_lock_irqsave(&csched_priv.lock, flags);
    weight_total = csched_priv.weight;这里的weight为所有active domain的权重总和
    credit_total = csched_priv.credit; credit为当前系统的分值,为物理cpu个数×30
 
    /* Converge balance towards 0 when it drops negative */
    if ( csched_priv.credit_balance < 0 )
    {
        credit_total -= csched_priv.credit_balance;
        CSCHED_STAT_CRANK(acct_balance);
    }
 
    if ( unlikely(weight_total == 0) )没有active domain无需进行调度
    {
        csched_priv.credit_balance = 0;
        spin_unlock_irqrestore(&csched_priv.lock, flags);
        CSCHED_STAT_CRANK(acct_no_work);
        return;
    }
 
    CSCHED_STAT_CRANK(acct_run);
    weight_left = weight_total;
    credit_balance = 0;
    credit_xtra = 0;
    以active domain为循环过程,先计算每个domain的credit值,然后平分到这个domain的每个vcpu中,最后计算每个vcpu的优先级。
    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
    {
        sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
        BUG_ON( is_idle_domain(sdom->dom) );
        BUG_ON( sdom->active_vcpu_count == 0 );
        BUG_ON( sdom->weight == 0 );
        BUG_ON( sdom->weight > weight_left );
        weight_left -= sdom->weight;
        /*
         * A domain's fair share is computed using its weight in competition
         * with that of all other active domains.
         *
         * At most, a domain can use credits to run all its active VCPUs
         * for one full accounting period. We allow a domain to earn more
         * only when the system-wide credit balance is negative.
         */
    credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
    if ( csched_priv.credit_balance < 0 )
    {
        credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) + (weight_total - 1) ) / weight_total;
    }
    if ( sdom->cap != 0U )
    {
        uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
        if ( credit_cap < credit_peak )
            credit_peak = credit_cap;
    }
    credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1) ) / weight_total;
 
    计算当前domain 的credit值
    if ( credit_fair < credit_peak )若实际分配的credit值比它该获得的credit低,表明分配给该domain的credit应该多给一些credit值。需要将该domain在csched_priv.active_sdom向后排,(越往后排可能获得的credit值就比它排在前面时多)
    {
        credit_xtra = 1;表明当前domain需要向后排
    }
    else实际分配的credit值比它应该获得的credit高
    {
        if ( weight_left != 0U )将该domain多出的credit值分配给其他domain
        {
            /* Give other domains a chance at unused credits */
            credit_total += ( ( ( credit_fair - credit_peak ) * weight_total ) + ( weight_left - 1 ) ) /  weight_left;
        }
        if ( credit_xtra )
        {需要将该domain往队列前面放
            /*
             * Lazily keep domains with extra credits at the head of
             * the queue to give others a chance at them in future
             * accounting periods.
             */
            CSCHED_STAT_CRANK(acct_reorder);
            list_del(&sdom->active_sdom_elem);
            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
        }
        credit_fair = credit_peak;用应该获得的credit值(非实际credit值)进行分配vcpu,即需要降低他们的优先级
    }
    /* Compute fair share per VCPU */
    credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 ) ) / sdom->active_vcpu_count;
    list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
    {分配当前domain的credit值给每个vcpu
        svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
        BUG_ON( sdom != svc->sdom );
        /* Increment credit */
        atomic_add(credit_fair, &svc->credit);
        credit = atomic_read(&svc->credit);
        /*
         * Recompute priority or, if VCPU is idling, remove it from
         * the active list.
         */
         if ( credit < 0 )计算vcpu优先级
         {
             if ( sdom->cap == 0U )
                 svc->pri = CSCHED_PRI_TS_OVER;
             else
                 svc->pri = CSCHED_PRI_TS_PARKED;
             if ( credit < -CSCHED_TSLICE )
             {
                 CSCHED_STAT_CRANK(acct_min_credit);
                 credit = -CSCHED_TSLICE;
                 atomic_set(&svc->credit, credit);
              }
         }
        else
        {
            svc->pri = CSCHED_PRI_TS_UNDER;
            if ( credit > CSCHED_TSLICE )检测是否当前的vpu为空闲状态
                __csched_vcpu_acct_idle_locked(svc);
         }
        svc->credit_last = credit;
        svc->credit_incr = credit_fair;
        credit_balance += credit;
        }
    }
    csched_priv.credit_balance = credit_balance;
    spin_unlock_irqrestore(&csched_priv.lock, flags);
    /* Inform each CPU that its runq needs to be sorted */
    csched_priv.runq_sort++;
}
 
static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
    struct list_head *iter;
    struct csched_vcpu *speer;
    struct vcpu *vc;
    list_for_each( iter, &spc->runq )遍历spc(vcpu)对应物理cpu的调度队列
    {
        speer = __runq_elem(iter);找到每个队列元素的调度结构
        /*
         * If next available VCPU here is not of higher priority than ours,
         * this PCPU is useless to us.
         */
        if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
        {如果所偷的任务的优先级没有原cpu的任务低,则无需偷此cpu的任务
            CSCHED_STAT_CRANK(steal_peer_idle);
            break;
        }
        /* Is this VCPU is runnable on our PCPU? */
        vc = speer->vcpu;找到了比原cpu优先级高的任务
        BUG_ON( is_idle_vcpu(vc) );如果当前此任务对应的vcpu是空闲的,就停止偷取此任务
        if ( __csched_vcpu_is_stealable(cpu, vc) )
        {
            /* We got a candidate. Grab it! */
            __runq_remove(speer);
            vc->processor = cpu;
            return speer;
         }
    }
    return NULL;
}
Logo

华为开发者空间,是为全球开发者打造的专属开发空间,汇聚了华为优质开发资源及工具,致力于让每一位开发者拥有一台云主机,基于华为根生态开发、创新。

更多推荐