Index: linux-2.6.8.1-ck6/kernel/sched.c
===================================================================
--- linux-2.6.8.1-ck6.orig/kernel/sched.c	2004-09-06 22:46:49.464951759 +1000
+++ linux-2.6.8.1-ck6/kernel/sched.c	2004-09-06 22:47:25.367309364 +1000
@@ -438,7 +438,8 @@ static void resched_task(task_t *p)
 {
 	int need_resched, nrpolling;
 
-	preempt_disable();
+	BUG_ON(!spin_is_locked(&task_rq(p)->lock));
+
 	/* minimise the chance of sending an interrupt to poll_idle() */
 	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
 	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
@@ -446,7 +447,6 @@ static void resched_task(task_t *p)
 
 	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
 		smp_send_reschedule(task_cpu(p));
-	preempt_enable();
 }
 #else
 static inline void resched_task(task_t *p)
@@ -1800,8 +1800,10 @@ void scheduler_tick(int user_ticks, int
 			cpustat->iowait += sys_ticks;
 		else
 			cpustat->idle += sys_ticks;
+		spin_lock(&rq->lock);
 		if (wake_priority_sleeper(rq))
-			goto out;
+			goto out_unlock;
+		spin_unlock(&rq->lock);
 		rebalance_tick(cpu, rq, IDLE);
 		return;
 	}
@@ -1852,23 +1854,33 @@ out:
 }
 
 #ifdef CONFIG_SCHED_SMT
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
-	int i;
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
+	int i;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return;
 
-	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
+	/*
+	 * Unlock the current runqueue because we have to lock in
+	 * CPU order to avoid deadlocks. Caller knows that we might
+	 * unlock. We keep IRQs disabled.
+	 */
+	spin_unlock(&this_rq->lock);
 
-		if (i == cpu)
-			continue;
+	cpus_and(sibling_map, sd->span, cpu_online_map);
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	/*
+	 * We clear this CPU from the mask. This both simplifies the
+	 * inner loop and keeps this_rq locked when we exit:
+	 */
+	cpu_clear(this_cpu, sibling_map);
 
-		smt_rq = cpu_rq(i);
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
 
 		/*
 		 * If an SMT sibling task is sleeping due to priority
@@ -1877,27 +1889,48 @@ static inline void wake_sleeping_depende
 		if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
 			resched_task(smt_rq->idle);
 	}
+
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
+	/*
+	 * We exit with this_cpu's rq still held and IRQs
+	 * still disabled:
+	 */
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
-	struct sched_domain *sd = rq->sd;
+	struct sched_domain *sd = this_rq->sd;
 	cpumask_t sibling_map;
 	int ret = 0, i;
+	task_t *p;
 
 	if (!(sd->flags & SD_SHARE_CPUPOWER))
 		return 0;
 
+	/*
+	 * The same locking rules and details apply as for
+	 * wake_sleeping_dependent():
+	 */
+	spin_unlock(&this_rq->lock);
 	cpus_and(sibling_map, sd->span, cpu_online_map);
-	for_each_cpu_mask(i, sibling_map) {
-		runqueue_t *smt_rq;
-		task_t *smt_curr;
+	for_each_cpu_mask(i, sibling_map)
+		spin_lock(&cpu_rq(i)->lock);
+	cpu_clear(this_cpu, sibling_map);
 
-		if (i == cpu)
-			continue;
+	/*
+	 * Establish next task to be run - it might have gone away because
+	 * we released the runqueue lock above:
+	 */
+	if (!this_rq->nr_running)
+		goto out_unlock;
 
-		smt_rq = cpu_rq(i);
-		smt_curr = smt_rq->curr;
+	p = list_entry(this_rq->queue[sched_find_first_bit(this_rq->bitmap)].next,
+		task_t, run_list);
+
+	for_each_cpu_mask(i, sibling_map) {
+		runqueue_t *smt_rq = cpu_rq(i);
+		task_t *smt_curr = smt_rq->curr;
 
 		/*
 		 * If a user task with lower static priority than the
@@ -1925,14 +1958,17 @@ static inline int dependent_sleeper(int
 			(smt_curr == smt_rq->idle && smt_rq->nr_running))
 				resched_task(smt_curr);
 	}
+out_unlock:
+	for_each_cpu_mask(i, sibling_map)
+		spin_unlock(&cpu_rq(i)->lock);
 	return ret;
 }
 #else
-static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
+static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
 {
 }
 
-static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
+static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	return 0;
 }
@@ -1989,20 +2025,37 @@ need_resched:
 
 	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
+go_idle:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
 			wake_sleeping_dependent(cpu, rq);
+			/*
+			 * wake_sleeping_dependent() might have released
+			 * the runqueue, so break out if we got new
+			 * tasks meanwhile:
+			 */
+			if (likely(!rq->nr_running))
+				goto switch_tasks;
+		}
+	} else {
+		if (dependent_sleeper(cpu, rq)) {
+			next = rq->idle;
 			goto switch_tasks;
 		}
+		/*
+		 * dependent_sleeper() releases and reacquires the runqueue
+		 * lock, hence go into the idle loop if the rq went
+		 * empty meanwhile:
+		 */
+		if (unlikely(!rq->nr_running))
+			goto go_idle;
 	}
 
 	idx = sched_find_first_bit(rq->bitmap);
 	queue = rq->queue + idx;
 	next = list_entry(queue->next, task_t, run_list);
 
-	if (dependent_sleeper(cpu, rq, next))
-		next = rq->idle;
 switch_tasks:
 	prefetch(next);
 	clear_tsk_need_resched(prev);
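For readers following the locking change: the two SMT helpers above no longer hide behind preempt_disable(); they drop the caller's runqueue lock and then take every sibling runqueue lock in a single global order (ascending CPU number), which is what makes concurrent callers deadlock-free. The fragment below is a minimal user-space sketch of that discipline using pthreads; struct fake_rq, poke_siblings() and the NR_CPUS value here are illustrative stand-ins, not the kernel's actual types or API.

/* Standalone sketch (not kernel code): "drop your own lock, then take
 * every lock in ascending CPU order", the same discipline used by
 * wake_sleeping_dependent()/dependent_sleeper() in the patch above.
 * Build with: gcc -pthread -o lockorder lockorder.c
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4			/* illustrative constant */

struct fake_rq {			/* illustrative stand-in for runqueue_t */
	pthread_mutex_t lock;
	int nr_running;
};

static struct fake_rq rq[NR_CPUS];

/* Called with rq[this_cpu].lock held, exactly like the patched helpers. */
static void poke_siblings(int this_cpu)
{
	int i;

	/*
	 * Release our own lock first: all locks must be acquired in one
	 * global order (ascending CPU number), otherwise two CPUs doing
	 * this concurrently could deadlock.
	 */
	pthread_mutex_unlock(&rq[this_cpu].lock);

	for (i = 0; i < NR_CPUS; i++)
		pthread_mutex_lock(&rq[i].lock);

	/* All runqueues (including our own) are now locked. */
	for (i = 0; i < NR_CPUS; i++)
		if (i != this_cpu && rq[i].nr_running)
			printf("cpu %d: sibling %d has work\n", this_cpu, i);

	/* Drop the siblings, but keep our own lock, as the caller expects. */
	for (i = 0; i < NR_CPUS; i++)
		if (i != this_cpu)
			pthread_mutex_unlock(&rq[i].lock);
}

int main(void)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		pthread_mutex_init(&rq[i].lock, NULL);
		rq[i].nr_running = i & 1;
	}

	pthread_mutex_lock(&rq[0].lock);	/* caller's runqueue lock */
	poke_siblings(0);
	pthread_mutex_unlock(&rq[0].lock);
	return 0;
}

Because every path takes the locks in the same ascending order, concurrent callers cannot deadlock; that is also why the patched schedule() has to re-check rq->nr_running after calling the helpers, since the runqueue lock was momentarily released.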