From: Peter Williams

-ENOCHANGELOG

Signed-off-by: Peter Williams
Cc: "Martin J. Bligh"
Acked-by: Con Kolivas
Signed-off-by: Andrew Morton
---

 include/linux/sched.h |    2 -
 kernel/sched.c        |   47 ++++++++++++++++++++++++++++++----------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff -puN include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing-fix include/linux/sched.h
--- devel/include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing-fix	2006-01-15 22:53:41.000000000 -0800
+++ devel-akpm/include/linux/sched.h	2006-01-15 22:53:41.000000000 -0800
@@ -712,7 +712,7 @@ struct task_struct {
 #endif
 	int prio, static_prio;
 #ifdef CONFIG_SMP
-	int bias_prio;
+	int bias_prio;	/* load "weight" factor for load balancing purposes */
 #endif
 	struct list_head run_list;
 	prio_array_t *array;
diff -puN kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing-fix kernel/sched.c
--- devel/kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing-fix	2006-01-15 22:53:41.000000000 -0800
+++ devel-akpm/kernel/sched.c	2006-01-15 22:53:41.000000000 -0800
@@ -680,6 +680,13 @@ static int effective_prio(task_t *p)
 }
 
 #ifdef CONFIG_SMP
+/*
+ * To aid in avoiding the subversion of "niceness" due to uneven distribution
+ * of tasks with abnormal "nice" values across CPUs the contribution that
+ * each task makes to its run queue's load is weighted according to its
+ * scheduling class and "nice" value. The bias_prio field holds the value
+ * used to calculate the weight for each task.
+ */
 static inline void set_bias_prio(task_t *p)
 {
 	if (rt_task(p)) {
@@ -717,6 +724,18 @@ static inline void dec_nr_running(task_t
 	rq->nr_running--;
 	dec_prio_bias(rq, p);
 }
+
+/* convert biased priority to scaled weighted load */
+static inline unsigned long weighted_load(unsigned long bias)
+{
+	return (bias * SCHED_LOAD_SCALE) / NICE_TO_BIAS_PRIO(0);
+}
+
+/* convert scaled weighted load to unscaled biased load */
+static inline unsigned long biased_load(unsigned long wload)
+{
+	return (wload * NICE_TO_BIAS_PRIO(0)) / SCHED_LOAD_SCALE;
+}
 #else
 static inline void set_bias_prio(task_t *p)
 {
@@ -1010,7 +1029,8 @@ void kick_process(task_t *p)
 }
 
 /*
- * Return a low guess at the load of a migration-source cpu.
+ * Return a low guess at the load of a migration-source cpu weighted
+ * according to the scheduling class and "nice" value.
  *
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
@@ -1018,7 +1038,7 @@ void kick_process(task_t *p)
 static unsigned long source_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+	unsigned long load_now = weighted_load(rq->prio_bias);
 
 	if (type == 0)
 		return load_now;
@@ -1027,12 +1047,13 @@ static unsigned long source_load(int cpu
 }
 
 /*
- * Return a high guess at the load of a migration-target cpu
+ * Return a high guess at the load of a migration-target cpu weighted
+ * according to the scheduling class and "nice" value.
  */
 static inline unsigned long target_load(int cpu, int type)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE;
+	unsigned long load_now = weighted_load(rq->prio_bias);
 
 	if (type == 0)
 		return load_now;
@@ -1298,7 +1319,7 @@ static int try_to_wake_up(task_t *p, uns
 		 * of the current CPU:
 		 */
 		if (sync)
-			tl -= p->bias_prio * SCHED_LOAD_SCALE;
+			tl -= weighted_load(p->bias_prio);
 
 		if ((tl <= load &&
 			tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
@@ -1902,9 +1923,9 @@ int can_migrate_task(task_t *p, runqueue
 }
 
 /*
- * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
- * as part of a balancing operation within "domain". Returns the number of
- * tasks moved.
+ * move_tasks tries to move up to max_nr_move tasks and max_bias_move biased
+ * load from busiest to this_rq, as part of a balancing operation within
+ * "domain". Returns the number of tasks moved.
  *
  * Called with both runqueues locked.
  */
@@ -2133,8 +2154,11 @@ find_busiest_group(struct sched_domain *
 			return busiest;
 	}
 
-	/* Get rid of the scaling factor, rounding down as we divide */
-	*imbalance = *imbalance / SCHED_LOAD_SCALE;
+	/*
+	 * Get rid of the scaling factor, rounding down as we divide and
+	 * converting to biased load for use by move_tasks()
+	 */
+	*imbalance = biased_load(*imbalance);
 	return busiest;
 
 out_balanced:
@@ -2447,7 +2471,8 @@ static void rebalance_tick(int this_cpu,
 	struct sched_domain *sd;
 	int i;
 
-	this_load = this_rq->prio_bias * SCHED_LOAD_SCALE;
+	/* weight load according to scheduling class and "nice" value */
+	this_load = weighted_load(this_rq->prio_bias);
 	/* Update our load */
 	for (i = 0; i < 3; i++) {
 		unsigned long new_load = this_load;
_
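
As a quick illustration of the weighted_load()/biased_load() helpers added
above, the stand-alone sketch below shows the bias_prio <-> weighted-load
arithmetic in user space.  It is not part of the patch: SCHED_LOAD_SCALE and
NICE_TO_BIAS_PRIO(0) are replaced by assumed stand-in constants (128 and 20)
purely for illustration; the real definitions come from the kernel headers
and the earlier patches in this series.

	/*
	 * Stand-alone sketch of the conversions above.  The constants are
	 * assumptions for illustration only; NICE_ZERO_BIAS stands in for
	 * NICE_TO_BIAS_PRIO(0).
	 */
	#include <stdio.h>

	#define SCHED_LOAD_SCALE	128UL	/* assumed value */
	#define NICE_ZERO_BIAS		20UL	/* assumed stand-in for NICE_TO_BIAS_PRIO(0) */

	/* convert biased priority to scaled weighted load (mirrors weighted_load()) */
	static unsigned long weighted_load(unsigned long bias)
	{
		return (bias * SCHED_LOAD_SCALE) / NICE_ZERO_BIAS;
	}

	/* convert scaled weighted load back to biased load (mirrors biased_load()) */
	static unsigned long biased_load(unsigned long wload)
	{
		return (wload * NICE_ZERO_BIAS) / SCHED_LOAD_SCALE;
	}

	int main(void)
	{
		/* a nice-0 task's bias contributes exactly SCHED_LOAD_SCALE */
		printf("nice 0 bias %lu -> weighted %lu\n",
		       NICE_ZERO_BIAS, weighted_load(NICE_ZERO_BIAS));
		/* a larger bias contributes proportionally more weighted load */
		printf("bias 40     -> weighted %lu\n", weighted_load(40));
		/* biased_load() undoes weighted_load(), rounding down */
		printf("round trip  -> bias %lu\n", biased_load(weighted_load(40)));
		return 0;
	}

Note that, by construction, weighted_load(NICE_TO_BIAS_PRIO(0)) equals
SCHED_LOAD_SCALE, so a runqueue full of nice-0 tasks reports the same load
it did before this change and the existing SCHED_LOAD_SCALE-based balancing
heuristics keep their meaning.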