Use next_balance instead of last_balance ... The cpu offset calculation in the sched_domains code makes it difficult to figure out when the next event is supposed to happen since we only keep track of the last_balancing. We want to know when the next load balance is supposed to occur. Move the cpu offset calculation into build_sched_domains(). Do the setup of the staggered load balance scheduling when the sched domains are initialized. That way we don't have to worry about it anymore later. (V2) Move the idle calculation into rebalance_tick() and make sure that we use the idle state before load balancing to calculate the interval. Signed-off-by: Christoph Lameter Index: linux-2.6.19-rc3/include/asm-ia64/topology.h =================================================================== --- linux-2.6.19-rc3.orig/include/asm-ia64/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/asm-ia64/topology.h 2006-10-27 13:41:09.487242698 -0500 @@ -76,7 +76,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -102,7 +101,6 @@ void build_cpu_to_node_map(void); | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc3/include/linux/sched.h =================================================================== --- linux-2.6.19-rc3.orig/include/linux/sched.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/linux/sched.h 2006-10-27 13:41:09.504822422 -0500 @@ -670,7 +670,7 @@ struct sched_domain { int flags; /* See SD_* */ /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned long next_balance; /* init to jiffies. units in jiffies */ unsigned int balance_interval; /* initialise to 1. units in ms. 
*/ unsigned int nr_balance_failed; /* initialise to 0 */ Index: linux-2.6.19-rc3/include/linux/topology.h =================================================================== --- linux-2.6.19-rc3.orig/include/linux/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/linux/topology.h 2006-10-27 13:41:09.515565587 -0500 @@ -108,7 +108,6 @@ | SD_WAKE_AFFINE \ | SD_WAKE_IDLE \ | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -140,7 +139,6 @@ | SD_WAKE_AFFINE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -170,7 +168,6 @@ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ | BALANCE_FOR_PKG_POWER,\ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -195,7 +192,6 @@ .forkexec_idx = 0, /* unused */ \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc3/kernel/sched.c =================================================================== --- linux-2.6.19-rc3.orig/kernel/sched.c 2006-10-27 13:40:39.000000000 -0500 +++ linux-2.6.19-rc3/kernel/sched.c 2006-10-27 13:41:09.544865127 -0500 @@ -2848,42 +2848,43 @@ static void update_load(struct rq *this_ * * Balancing parameters are set up in arch_init_sched_domains. */ - -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +rebalance_tick(int this_cpu, struct rq *this_rq) { - unsigned long interval, j = cpu_offset(this_cpu); struct sched_domain *sd; + /* + * A task is idle if this is the idle queue + * and we have no runnable task + */ + enum idle_type idle = (this_rq->idle && !this_rq->nr_running) ? 
+ SCHED_IDLE : NOT_IDLE; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; - interval = sd->balance_interval; - if (idle != SCHED_IDLE) - interval *= sd->busy_factor; - - /* scale ms to jiffies */ - interval = msecs_to_jiffies(interval); - if (unlikely(!interval)) - interval = 1; + if (jiffies >= sd->next_balance) { + unsigned long interval; + enum idle_type was_idle = idle; - if (j - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (load_balance(this_cpu, this_rq, sd, idle)) /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is * not idle. */ idle = NOT_IDLE; - } - sd->last_balance += interval; + + interval = sd->balance_interval; + if (was_idle != SCHED_IDLE) + interval *= sd->busy_factor; + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + if (unlikely(!interval)) + interval = 1; + + sd->next_balance += interval; } } } @@ -3137,20 +3138,18 @@ void scheduler_tick(void) struct task_struct *p = current; int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); - enum idle_type idle = NOT_IDLE; update_cpu_clock(p, rq, now); rq->timestamp_last_tick = now; - if (p == rq->idle) { + if (p == rq->idle) /* Task on the idle queue */ - if (!wake_priority_sleeper(rq)) - idle = SCHED_IDLE; - } else + wake_priority_sleeper(rq); + else task_running_tick(rq, p); update_load(rq); - rebalance_tick(cpu, rq, idle); + rebalance_tick(cpu, rq); } #ifdef CONFIG_SCHED_SMT @@ -6327,6 +6326,16 @@ static void init_sched_groups_power(int } /* + * Calculate jiffies start to use for each cpu. On sched domain + * initialization this jiffy value is used to stagger the load balancing + * of the cpus so that they do not load balance all at at the same time. 
+ */ +static inline unsigned long cpu_offset(int cpu) +{ + return jiffies + cpu * HZ / NR_CPUS; +} + +/* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ @@ -6382,6 +6391,7 @@ static int build_sched_domains(const cpu sd->span = *cpu_map; group = cpu_to_allnodes_group(i, cpu_map); sd->groups = &sched_group_allnodes[group]; + sd->next_balance = cpu_offset(i); p = sd; } else p = NULL; @@ -6390,6 +6400,7 @@ static int build_sched_domains(const cpu *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + sd->next_balance = cpu_offset(i); if (p) p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); @@ -6401,6 +6412,7 @@ static int build_sched_domains(const cpu *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; + sd->next_balance = cpu_offset(i); if (p) p->child = sd; sd->groups = &sched_group_phys[group]; @@ -6413,6 +6425,7 @@ static int build_sched_domains(const cpu sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + sd->next_balance = cpu_offset(i); p->child = sd; sd->groups = &sched_group_core[group]; #endif @@ -6425,6 +6438,7 @@ static int build_sched_domains(const cpu sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + sd->next_balance = cpu_offset(i); p->child = sd; sd->groups = &sched_group_cpus[group]; #endif Index: linux-2.6.19-rc3/include/asm-i386/topology.h =================================================================== --- linux-2.6.19-rc3.orig/include/asm-i386/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/asm-i386/topology.h 2006-10-27 13:41:09.583931181 -0500 @@ -90,7 +90,6 @@ static inline int node_to_first_cpu(int | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc3/include/asm-powerpc/topology.h 
=================================================================== --- linux-2.6.19-rc3.orig/include/asm-powerpc/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/asm-powerpc/topology.h 2006-10-27 13:41:09.603464208 -0500 @@ -60,7 +60,6 @@ extern int pcibus_to_node(struct pci_bus | SD_BALANCE_NEWIDLE \ | SD_WAKE_IDLE \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc3/include/asm-x86_64/topology.h =================================================================== --- linux-2.6.19-rc3.orig/include/asm-x86_64/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/asm-x86_64/topology.h 2006-10-27 13:41:09.615184024 -0500 @@ -48,7 +48,6 @@ extern int __node_distance(int, int); | SD_BALANCE_FORK \ | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc3/include/asm-mips/mach-ip27/topology.h =================================================================== --- linux-2.6.19-rc3.orig/include/asm-mips/mach-ip27/topology.h 2006-10-27 13:40:06.000000000 -0500 +++ linux-2.6.19-rc3/include/asm-mips/mach-ip27/topology.h 2006-10-27 13:41:09.630810446 -0500 @@ -33,7 +33,6 @@ extern unsigned char __node_distances[MA .flags = SD_LOAD_BALANCE \ | SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ }