Index: linux-2.6.8.1-ck9/kernel/sched.c
===================================================================
--- linux-2.6.8.1-ck9.orig/kernel/sched.c	2004-10-02 11:38:40.595124856 +1000
+++ linux-2.6.8.1-ck9/kernel/sched.c	2004-10-02 11:38:44.998436700 +1000
@@ -352,10 +352,12 @@ static void recalc_task_prio(task_t *p,
 {
 	unsigned long sleep_time = now - p->timestamp;
 	unsigned int rr = rr_interval(p);
-	if (p->flags & PF_FORKED ||
-		(NS_TO_JIFFIES(p->runtime + sleep_time) < rr / 2 ||
-		((!sched_interactive || sched_compute || rr < 3 ) &&
-		NS_TO_JIFFIES(p->runtime + sleep_time) < rr))) {
+	unsigned int best_burst = burst(p);
+	unsigned int minrun = rr * (p->burst + 1) / (best_burst + 1) ? : 1;
+	if (p->flags & PF_FORKED || (p->mm &&
+		(NS_TO_JIFFIES(p->runtime + sleep_time) < minrun ||
+		((!sched_interactive || sched_compute) &&
+		NS_TO_JIFFIES(p->runtime + sleep_time) < rr)))) {
 		unsigned long ns_totalrun = p->totalrun + p->runtime;
 		unsigned long total_run = NS_TO_JIFFIES(ns_totalrun);
 		p->flags &= ~PF_FORKED;
@@ -364,13 +366,22 @@ static void recalc_task_prio(task_t *p,
 			dec_burst(p);
 		} else {
 			unsigned int intervals = total_run / rr;
+			unsigned int remainder;
 			p->totalrun = ns_totalrun;
 			p->slice -= intervals * rr;
-			if (p->slice <= rr)
+			if (p->slice <= rr) {
 				p->totalrun = 0;
+				dec_burst(p);
+			} else {
+				remainder = p->slice % rr;
+				if (remainder)
+					p->time_slice = remainder;
+			}
 		}
 	} else {
-		if (!(p->flags & PF_UISLEEP))
+		if (NS_TO_JIFFIES(p->totalrun) > (best_burst - p->burst) * rr)
+			dec_burst(p);
+		else if (!(p->flags & PF_UISLEEP || p->totalrun))
 			inc_burst(p);
 		p->runtime = 0;
 		p->totalrun = 0;
@@ -392,10 +403,10 @@ static void activate_task(task_t *p, run
 	}
 #endif
 	p->slice = slice(p);
+	p->time_slice = rr_interval(p);
 	recalc_task_prio(p, now);
 	p->prio = effective_prio(p);
 	p->flags &= ~PF_UISLEEP;
-	p->time_slice = rr_interval(p);
 	if (batch_task(p))
 		p->time_slice = p->slice;
 	p->timestamp = now;
@@ -427,7 +438,8 @@ static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
-	preempt_disable();
+	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
+
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
@@ -435,7 +447,6 @@ static void resched_task(task_t *p)
 
 	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 		smp_send_reschedule(task_cpu(p));
-	preempt_enable();
 }
 #else
 static inline void resched_task(task_t *p)
@@ -1789,8 +1800,10 @@ void scheduler_tick(int user_ticks, int
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
+		spin_lock(&rq->lock);
 		if (wake_priority_sleeper(rq))
-			goto out;
+			goto out_unlock;
+		spin_unlock(&rq->lock);
 		rebalance_tick(cpu, rq, IDLE);
 		return;
 	}
@@ -1841,23 +1854,33 @@ out:
 }
 
 #ifdef CONFIG_SCHED_SMT
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	int i;
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
+	int i;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return;
 
-	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
+	/*
+	 * Unlock the current runqueue because we have to lock in
+	 * CPU order to avoid deadlocks. Caller knows that we might
+	 * unlock. We keep IRQs disabled.
+	 */
+	spin_unlock(&this_rq->lock);
 
-		if (i == cpu)
-			continue;
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	/*
+	 * We clear this CPU from the mask. This both simplifies the
+	 * inner loop and keeps this_rq locked when we exit:
+	 */
+	cpu_clear(this_cpu, sibling_map);
 
-		smt_rq = cpu_rq(i);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
 
 		/*
 		 * If an SMT sibling task is sleeping due to priority
@@ -1866,27 +1889,48 @@ static inline void wake_sleeping_depende
 		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
 			resched_task(smt_rq->idle);
 	}
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	/*
+	 * We exit with this_cpu's rq still held and IRQs
+	 * still disabled:
+	 */
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
 	int ret = 0, i;
+	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 
+	/*
+	 * The same locking rules and details apply as for
+	 * wake_sleeping_dependent():
+	 */
+	spin_unlock(&this_rq->lock);
 	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
-		task_t *smt_curr;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	cpu_clear(this_cpu, sibling_map);
 
-		if (i == cpu)
-			continue;
+	/*
+	 * Establish next task to be run - it might have gone away because
+	 * we released the runqueue lock above:
+	 */
+	if (!this_rq->nr_running)
+		goto out_unlock;
 
-		smt_rq = cpu_rq(i);
-		smt_curr = smt_rq->curr;
+	p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
+		task_t, run_list);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+		task_t *smt_curr = smt_rq->curr;
 
 		/*
 		 * If a user task with lower static priority than the
@@ -1914,14 +1958,17 @@ static inline int dependent_sleeper(int
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
+out_unlock:
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	return 0;
 }
@@ -1978,20 +2025,37 @@ need_resched:
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
+go_idle:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
 			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (likely(!rq->nr_running))
+				goto switch_tasks;
+		}
+	} else {
+		if (dependent_sleeper(cpu, rq)) {
+			next = rq->idle;
 			goto switch_tasks;
 		}
+		/*
+		 * dependent_sleeper() releases and reacquires the runqueue
+		 * lock, hence go into the idle loop if the rq went
+		 * empty meanwhile:
+		 */
+		if (unlikely(!rq->nr_running))
+			goto go_idle;
 	}
 
 	idx = sched_find_first_bit(rq->bitmap);
 	queue = rq->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
-	if (dependent_sleeper(cpu, rq, next))
-		next = rq->idle;
 
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
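
As a side note, here is a small standalone sketch (not part of the patch, and using made-up example values) of the burst-scaled minimum-run threshold that the first hunk introduces at the top of recalc_task_prio(); in the scheduler the inputs come from rr_interval(p), p->burst and burst(p).

/* Standalone illustration only -- example values, not scheduler code. */
#include <stdio.h>

static unsigned int minrun(unsigned int rr, unsigned int burst,
			   unsigned int best_burst)
{
	/*
	 * Same arithmetic as the patch: scale the rr interval by the
	 * task's current burst relative to its best possible burst,
	 * never dropping below one jiffy (the patch spells the
	 * fallback with gcc's "? : 1" shorthand).
	 */
	unsigned int m = rr * (burst + 1) / (best_burst + 1);

	return m ? m : 1;
}

int main(void)
{
	unsigned int rr = 6, best_burst = 9;	/* assumed example values */
	unsigned int b;

	for (b = 0; b <= best_burst; b++)
		printf("burst %u -> minrun %u jiffies\n",
		       b, minrun(rr, b, best_burst));

	return 0;
}

As I read the first hunk, the effect is that a task with little accumulated burst only has to cover a fraction of the rr interval in combined run plus sleep time before its wakeup is treated as a genuine sleep, while a task at full burst must cover the whole interval.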