From: Peter Williams This is a modified version of Con Kolivas's patch to add "nice" support to load balancing across physical CPUs on SMP systems. The principal modifications to Con's mechanism are: changing move_tasks() so that it endeavours to move a specified amount of biased load rather than a specified number of tasks; changing find_busiest_group() so that the value returned in "imbalance" is in terms of biased load rather than number of tasks; and changing rebalance_tick() to calculate "cpu_load" for each run queue as biased load rather than plain load. To be more precise, because of the special case of active_load_balance() wanting to move exactly 1 task, move_tasks() actually moves up to a given number of tasks or up to a given amount of biased load, whichever limit is reached first. Because these changes mean that a task's biased prio is evaluated much more often than in the original implementation, a "bias_prio" field has been added to the task structure to hold its value, meaning that it only needs to be calculated when the task's nice value or scheduling class is changed. This change facilitates considerable simplification of much of the code. 
Signed-off-by: Peter Williams Acked-by: Ingo Molnar Acked-by: Con Kolivas Signed-off-by: Andrew Morton --- include/linux/sched.h | 3 kernel/sched.c | 161 ++++++++++++++++++++++++-------------------------- 2 files changed, 82 insertions(+), 82 deletions(-) diff -puN include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing include/linux/sched.h --- devel/include/linux/sched.h~sched-modified-nice-support-for-smp-load-balancing 2005-10-11 00:35:00.000000000 -0700 +++ devel-akpm/include/linux/sched.h 2005-10-11 00:35:00.000000000 -0700 @@ -655,6 +655,9 @@ struct task_struct { int oncpu; #endif int prio, static_prio; +#ifdef CONFIG_SMP + int bias_prio; +#endif struct list_head run_list; prio_array_t *array; diff -puN kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing kernel/sched.c --- devel/kernel/sched.c~sched-modified-nice-support-for-smp-load-balancing 2005-10-11 00:35:00.000000000 -0700 +++ devel-akpm/kernel/sched.c 2005-10-11 00:35:00.000000000 -0700 @@ -60,6 +60,16 @@ #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) +#ifdef CONFIG_SMP +/* + * Priority bias for load balancing ranges from 1 (nice==19) to 139 (RT + * priority of 100). + */ +#define NICE_TO_BIAS_PRIO(nice) (20 - (nice)) +#define PRIO_TO_BIAS_PRIO(prio) NICE_TO_BIAS_PRIO(PRIO_TO_NICE(prio)) +#define RTPRIO_TO_BIAS_PRIO(rp) (40 + (rp)) +#endif + /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, @@ -661,46 +671,53 @@ static int effective_prio(task_t *p) } #ifdef CONFIG_SMP -static inline void inc_prio_bias(runqueue_t *rq, int prio) +static inline void set_bias_prio(task_t *p) { - rq->prio_bias += MAX_PRIO - prio; + if (rt_task(p)) { + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. Do + * not bias by its priority as the ultra high priority + * will skew balancing adversely. 
+ */ + p->bias_prio = 0; + else + p->bias_prio = RTPRIO_TO_BIAS_PRIO(p->rt_priority); + } else + p->bias_prio = PRIO_TO_BIAS_PRIO(p->static_prio); } -static inline void dec_prio_bias(runqueue_t *rq, int prio) +static inline void inc_prio_bias(runqueue_t *rq, const task_t *p) { - rq->prio_bias -= MAX_PRIO - prio; + rq->prio_bias += p->bias_prio; +} + +static inline void dec_prio_bias(runqueue_t *rq, const task_t *p) +{ + rq->prio_bias -= p->bias_prio; } static inline void inc_nr_running(task_t *p, runqueue_t *rq) { rq->nr_running++; - if (rt_task(p)) { - if (p != rq->migration_thread) - /* - * The migration thread does the actual balancing. Do - * not bias by its priority as the ultra high priority - * will skew balancing adversely. - */ - inc_prio_bias(rq, p->prio); - } else - inc_prio_bias(rq, p->static_prio); + inc_prio_bias(rq, p); } static inline void dec_nr_running(task_t *p, runqueue_t *rq) { rq->nr_running--; - if (rt_task(p)) { - if (p != rq->migration_thread) - dec_prio_bias(rq, p->prio); - } else - dec_prio_bias(rq, p->static_prio); + dec_prio_bias(rq, p); } #else -static inline void inc_prio_bias(runqueue_t *rq, int prio) +static inline void set_bias_prio(task_t *p) { } -static inline void dec_prio_bias(runqueue_t *rq, int prio) +static inline void inc_prio_bias(runqueue_t *rq, const task_t *p) +{ +} + +static inline void dec_prio_bias(runqueue_t *rq, const task_t *p) { } @@ -986,61 +1003,29 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. 
*/ -static inline unsigned long __source_load(int cpu, int type, enum idle_type idle) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long source_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; + unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE; if (type == 0) - source_load = load_now; - else - source_load = min(cpu_load, load_now); - - if (running > 1 || (idle == NOT_IDLE && running)) - /* - * If we are busy rebalancing the load is biased by - * priority to create 'nice' support across cpus. When - * idle rebalancing we should only bias the source_load if - * there is more than one task running on that queue to - * prevent idle rebalance from trying to pull tasks from a - * queue with only one running task. - */ - source_load = source_load * rq->prio_bias / running; - - return source_load; -} + return load_now; -static inline unsigned long source_load(int cpu, int type) -{ - return __source_load(cpu, type, NOT_IDLE); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long __target_load(int cpu, int type, enum idle_type idle) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); - unsigned long running = rq->nr_running; - unsigned long target_load, cpu_load = rq->cpu_load[type-1], - load_now = running * SCHED_LOAD_SCALE; + unsigned long load_now = rq->prio_bias * SCHED_LOAD_SCALE; if (type == 0) - target_load = load_now; - else - target_load = max(cpu_load, load_now); - - if (running > 1 || (idle == NOT_IDLE && running)) - target_load = target_load * rq->prio_bias / running; + return load_now; - return target_load; -} - -static inline unsigned long target_load(int cpu, int type) -{ - return __target_load(cpu, type, NOT_IDLE); + return max(rq->cpu_load[type-1], load_now); } /* @@ -1304,7 +1289,7 @@ static 
int try_to_wake_up(task_t *p, uns * of the current CPU: */ if (sync) - tl -= SCHED_LOAD_SCALE; + tl -= p->bias_prio * SCHED_LOAD_SCALE; if ((tl <= load && tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || @@ -1909,15 +1894,16 @@ int can_migrate_task(task_t *p, runqueue * Called with both runqueues locked. */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, - unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle, int *all_pinned) + unsigned long max_nr_move, long max_bias_move, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move == 0) + if (max_nr_move == 0 || max_bias_move == 0) goto out; pinned = 1; @@ -1960,7 +1946,8 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { + if (tmp->bias_prio > max_bias_move || + !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1974,9 +1961,13 @@ skip_queue: pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; + max_bias_move -= tmp->bias_prio; - /* We only want to steal up to the prescribed number of tasks. */ - if (pulled < max_nr_move) { + /* + * We only want to steal up to the prescribed number of tasks + * and the prescribed amount of biased load. + */ + if (pulled < max_nr_move && max_bias_move > 0) { if (curr != head) goto skip_queue; idx++; @@ -1997,7 +1988,7 @@ out: /* * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the number of tasks which should be + * domain. It calculates and returns the amount of biased load which should be * moved to restore balance via the imbalance parameter. 
*/ static struct sched_group * @@ -2033,9 +2024,9 @@ find_busiest_group(struct sched_domain * /* Bias balancing toward cpus of our domain */ if (local_group) - load = __target_load(i, load_idx, idle); + load = target_load(i, load_idx); else - load = __source_load(i, load_idx, idle); + load = source_load(i, load_idx); avg_load += load; } @@ -2090,7 +2081,7 @@ find_busiest_group(struct sched_domain * unsigned long tmp; if (max_load - this_load >= SCHED_LOAD_SCALE*2) { - *imbalance = 1; + *imbalance = NICE_TO_BIAS_PRIO(0); return busiest; } @@ -2123,7 +2114,7 @@ find_busiest_group(struct sched_domain * if (pwr_move <= pwr_now) goto out_balanced; - *imbalance = 1; + *imbalance = NICE_TO_BIAS_PRIO(0); return busiest; } @@ -2140,15 +2131,14 @@ out_balanced: /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ -static runqueue_t *find_busiest_queue(struct sched_group *group, - enum idle_type idle) +static runqueue_t *find_busiest_queue(struct sched_group *group) { unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; for_each_cpu_mask(i, group->cpumask) { - load = __source_load(i, 0, idle); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -2165,6 +2155,7 @@ static runqueue_t *find_busiest_queue(st */ #define MAX_PINNED_INTERVAL 512 +#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. 
@@ -2192,7 +2183,7 @@ static int load_balance(int this_cpu, ru goto out_balanced; } - busiest = find_busiest_queue(group, idle); + busiest = find_busiest_queue(group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[idle]); goto out_balanced; @@ -2212,6 +2203,7 @@ static int load_balance(int this_cpu, ru */ double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); @@ -2315,7 +2307,7 @@ static int load_balance_newidle(int this goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE); + busiest = find_busiest_queue(group); if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); goto out_balanced; @@ -2330,6 +2322,7 @@ static int load_balance_newidle(int this /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, + minus_1_or_zero(busiest->nr_running), imbalance, sd, NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); } @@ -2410,7 +2403,8 @@ static void active_load_balance(runqueue schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, + RTPRIO_TO_BIAS_PRIO(100), sd, SCHED_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2438,7 +2432,7 @@ static void rebalance_tick(int this_cpu, struct sched_domain *sd; int i; - this_load = this_rq->nr_running * SCHED_LOAD_SCALE; + this_load = this_rq->prio_bias * SCHED_LOAD_SCALE; /* Update our load */ for (i = 0; i < 3; i++) { unsigned long new_load = this_load; @@ -3552,18 +3546,19 @@ void set_user_nice(task_t *p, long nice) array = p->array; if (array) { dequeue_task(p, array); - dec_prio_bias(rq, p->static_prio); + dec_prio_bias(rq, p); } old_prio = p->prio; new_prio = NICE_TO_PRIO(nice); delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); + set_bias_prio(p); p->prio += 
delta; if (array) { enqueue_task(p, array); - inc_prio_bias(rq, p->static_prio); + inc_prio_bias(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -3702,6 +3697,7 @@ static void __setscheduler(struct task_s p->prio = MAX_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; + set_bias_prio(p); } /** @@ -5642,6 +5638,7 @@ void __init sched_init(void) } } + set_bias_prio(&init_task); /* * The boot idle thread does lazy MMU switching as well: */ _