Cache load state of a sched domain

On large systems the repeated scans of the processors in a sched group to
determine the load state of the group can become very expensive. In a system
with a few hundred cpus each processor has to scan the run queues of those
few hundred processors to establish the load situation of a sched group. It
would be best if one processor's result could be reused by the others,
saving the additional scanning effort.

This patch adds load state to sched_group together with a timestamp of when
that state was last determined. A new function tally_group() is provided to
update that sched_group state. All processors can then determine the state
of a sched_group without scanning all the runqueues.

A sched group is rescanned when more than 1/10th of a second times the
distance to the sched group has passed. I.e. a processor rescans the
sched_group that it is directly a part of every 1/10th of a second, the
sched_group containing that lowest sched_group at most every 2/10ths of a
second, and so on (a sketch of this check follows the description below).
This makes it likely that processors at the lowest level will provide the
results for their local sched_group. Since a sched_group is allocated near
the processors that are part of it, the scanning of the lowest level of
sched_groups will likely involve only NUMA-local accesses.

The caching of the sched_group data only works if there is no special idle
processing for power consumption and if we have not excluded any processors
because all of their tasks were pinned. If either of these conditions does
not hold we always fall back to the full scan.

The updates to the sched_group state are done without locking, relying on
the atomic update of the unsigned long values and on the fact that a partial
update of the load state is not harmful and may in the worst case lead to a
wrong reschedule attempt (a sketch of this scheme follows the patch below).

Signed-off-by: Christoph Lameter
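
For illustration only, here is a minimal userspace sketch (not part of the
patch) of the distance-scaled refresh check described above. The names
group_state and needs_tally() are made up for the example, HZ is mocked,
and the 1/10th-of-a-second base interval follows the changelog figure.

#include <stdio.h>

#define HZ 1000				/* assumed tick rate for the sketch */

struct group_state {
	unsigned long last;		/* jiffies of the last tally */
	unsigned long sum_nr_running;	/* cached task count */
	unsigned long cpu_load[2][4];	/* cached source/target loads */
};

/* Rescan when the cached state is older than distance * interval. */
static int needs_tally(const struct group_state *g, unsigned long jiffies,
		       int distance, unsigned long interval)
{
	return jiffies > g->last + (unsigned long)distance * interval;
}

int main(void)
{
	struct group_state g = { 0 };	/* last tally at jiffies == 0 */
	unsigned long now = HZ / 4;	/* cached state is 250ms old */
	int distance;

	/* Nearby groups are refreshed often, distant ones less often. */
	for (distance = 1; distance <= 4; distance++)
		printf("distance %d: %s\n", distance,
		       needs_tally(&g, now, distance, HZ / 10) ?
		       "rescan" : "use cached state");
	return 0;
}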

Index: linux-2.6.19-rc1-mm1/include/linux/sched.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/sched.h	2006-10-11 17:54:04.289054012 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/sched.h	2006-10-14 12:05:30.903745803 -0700
@@ -655,6 +655,14 @@ struct sched_group {
 	 * single CPU. This is read only (except for setup, hotplug CPU).
 	 */
 	unsigned long cpu_power;
+
+	/*
+	 * State of the processes in this sched group at the last time we
+	 * scanned it via tally_group.
+	 */
+	unsigned long last;		/* Last load calculation */
+	unsigned long sum_nr_running;
+	unsigned long cpu_load[2][4];	/* Cpu load for source / targets */
 };
 
 struct sched_domain {
Index: linux-2.6.19-rc1-mm1/kernel/sched.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/kernel/sched.c	2006-10-14 11:58:31.038038869 -0700
+++ linux-2.6.19-rc1-mm1/kernel/sched.c	2006-10-14 12:23:39.479212499 -0700
@@ -1166,6 +1166,11 @@ static inline unsigned long target_load(
 	return max(rq->cpu_load[type-1], rq->raw_weighted_load);
 }
 
+static inline unsigned long group_load(struct sched_group *group, int type, int local)
+{
+	return group->cpu_load[local][type];
+}
+
 /*
  * Return the average load per task on the cpu's run queue
  */
@@ -2223,6 +2228,34 @@ out:
 	return pulled;
 }
 
+static void tally_group(struct sched_group *group)
+{
+	unsigned long sum_nr_running = 0;
+	unsigned long cpu_load[2][4] = { {0, } };
+	int cpu;
+
+	group->last = jiffies + HZ;	/* Avoid scans by other processors */
+	for_each_cpu_mask(cpu, group->cpumask) {
+		struct rq *rq;
+		int i;
+
+		rq = cpu_rq(cpu);
+
+		for (i = 1; i <= 3; i++) {
+			cpu_load[0][i] += target_load(cpu, i);
+			cpu_load[1][i] += source_load(cpu, i);
+		}
+
+		sum_nr_running += rq->nr_running;
+		cpu_load[0][0] += rq->raw_weighted_load;
+	}
+	/* Transfer calculated data into group */
+	cpu_load[1][0] = cpu_load[0][0];
+	group->sum_nr_running = sum_nr_running;
+	memcpy(group->cpu_load, cpu_load, sizeof(cpu_load));
+	group->last = jiffies;
+}
+
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
  * domain. It calculates and returns the amount of weighted load which
@@ -2239,6 +2272,7 @@ find_busiest_group(struct sched_domain *
 	unsigned long busiest_load_per_task, busiest_nr_running;
 	unsigned long this_load_per_task, this_nr_running;
 	int load_idx;
+	int distance = 0;
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 	int power_savings_balance = 1;
 	unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2263,30 +2297,43 @@ find_busiest_group(struct sched_domain *
 		unsigned long sum_nr_running, sum_weighted_load;
 
 		local_group = cpu_isset(this_cpu, group->cpumask);
+		distance++;
 
-		/* Tally up the load of all CPUs in the group */
-		sum_weighted_load = sum_nr_running = avg_load = 0;
+		if (*sd_idle || cpus_full(*cpus)) {
+			/*
+			 * Non standard scan.
+			 * Tally up the load of all CPUs in the group
+			 */
+			sum_weighted_load = sum_nr_running = avg_load = 0;
 
-		for_each_cpu_mask(i, group->cpumask) {
-			struct rq *rq;
+			for_each_cpu_mask(i, group->cpumask) {
+				struct rq *rq;
 
-			if (!cpu_isset(i, *cpus))
-				continue;
+				if (!cpu_isset(i, *cpus))
+					continue;
 
-			rq = cpu_rq(i);
+				rq = cpu_rq(i);
 
-			if (*sd_idle && !idle_cpu(i))
-				*sd_idle = 0;
+				if (*sd_idle && !idle_cpu(i))
+					*sd_idle = 0;
 
-			/* Bias balancing toward cpus of our domain */
-			if (local_group)
-				load = target_load(i, load_idx);
-			else
-				load = source_load(i, load_idx);
+				/* Bias balancing toward cpus of our domain */
+				if (local_group)
+					load = target_load(i, load_idx);
+				else
+					load = source_load(i, load_idx);
 
-			avg_load += load;
-			sum_nr_running += rq->nr_running;
-			sum_weighted_load += rq->raw_weighted_load;
+				avg_load += load;
+				sum_nr_running += rq->nr_running;
+				sum_weighted_load += rq->raw_weighted_load;
+			}
+		} else {
+			/* Usual standard scan that may be optimized */
+			if (jiffies > group->last + distance * HZ / 2)
+				tally_group(group);
+			avg_load = group_load(group, load_idx, local_group);
+			sum_weighted_load = group_load(group, 0, local_group);
+			sum_nr_running = group->sum_nr_running;
 		}
 
 		total_load += avg_load;
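
To make the lockless-update argument above concrete, here is a small
userspace sketch (again not the patch itself; the struct, helpers and
values are illustrative stand-ins). The writer parks the timestamp in the
future so other processors skip re-tallying while it scans, fills in the
counters with plain word-sized stores, and writes the real timestamp last;
a reader that races with this sees at worst a mix of old and new words,
which the changelog argues is not harmful beyond a possible wrong
reschedule attempt.

#include <stdio.h>

#define HZ 1000				/* assumed tick rate for the sketch */

struct group_state {
	unsigned long last;
	unsigned long sum_nr_running;
	unsigned long cpu_load[2][4];
};

static void tally(struct group_state *g, unsigned long jiffies)
{
	/* Park "last" in the future to discourage concurrent tallies. */
	g->last = jiffies + HZ;

	/* Plain stores of individual unsigned longs (stand-in values). */
	g->sum_nr_running = 7;
	g->cpu_load[0][1] = 2048;	/* summed target load, type 1 */
	g->cpu_load[1][1] = 1024;	/* summed source load, type 1 */

	/* Publish the real timestamp last. */
	g->last = jiffies;
}

static void report(const struct group_state *g)
{
	/* A racing reader simply uses whatever snapshot it finds. */
	printf("nr_running=%lu target=%lu source=%lu (tallied at %lu)\n",
	       g->sum_nr_running, g->cpu_load[0][1],
	       g->cpu_load[1][1], g->last);
}

int main(void)
{
	struct group_state g = { 0 };

	tally(&g, 12345);
	report(&g);
	return 0;
}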