[PATCH] rebalance from migration threads instead of from timer tick load_balancing has the potential of running for some time if e.g. sched_domains for a system with 1024 processors have to be balanced. We currently do all of that with interrupts disabled and this may result in long interrupt holdoffs. Most of that time is potentially spent in rebalance_tick. This patch splits off rebalance_tick from scheduler_tick and uses the migration threads to run sched domain balancing as needed. If we run the balancing from the migration threads then we run with interrupts enabled so we need to change the run queue locking to disable interrupts since the timer interrupt may require run queue locks during time slice processing (which we leave in the timer tick). Successfully completed an AIM7 run with it. Signed-off-by: Christoph Lameter Index: linux-2.6.19-rc2-mm2/kernel/sched.c =================================================================== --- linux-2.6.19-rc2-mm2.orig/kernel/sched.c 2006-10-23 16:56:58.275153361 -0500 +++ linux-2.6.19-rc2-mm2/kernel/sched.c 2006-10-23 19:06:06.201241750 -0500 @@ -16,6 +16,8 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin + * 2006-10-23 Sched domain balancing with interrupts enabled by + * Christoph Lameter */ #include @@ -2530,8 +2532,6 @@ static inline unsigned long minus_1_or_z /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. - * - * Called with this_rq unlocked. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum idle_type idle) @@ -2541,6 +2541,7 @@ static int load_balance(int this_cpu, st unsigned long imbalance; struct rq *busiest; cpumask_t cpus = CPU_MASK_ALL; + unsigned long flags; /* * When power savings policy is enabled for the parent domain, idle @@ -2580,11 +2581,13 @@ redo: * still unbalanced. 
nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ + local_irq_save(flags); double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { @@ -2601,13 +2604,13 @@ redo: if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - spin_lock(&busiest->lock); + spin_lock_irqsave(&busiest->lock, flags); /* don't kick the migration_thread, if the curr * task on busiest cpu can't be moved to this_cpu */ if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) { - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); all_pinned = 1; goto out_one_pinned; } @@ -2617,7 +2620,7 @@ redo: busiest->push_cpu = this_cpu; active_balance = 1; } - spin_unlock(&busiest->lock); + spin_unlock_irqrestore(&busiest->lock, flags); if (active_balance) wake_up_process(busiest->migration_thread); @@ -2817,7 +2820,7 @@ static void active_load_balance(struct r } /* - * rebalance_tick will get called every timer tick, on every CPU. + * rebalance_domains is called when needed from the migration thread. * * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. @@ -2825,19 +2828,28 @@ static void active_load_balance(struct r * Balancing parameters are set up in arch_init_sched_domains. */ -/* Don't have all balancing operations going off at once: */ -static inline unsigned long cpu_offset(int cpu) -{ - return jiffies + cpu * HZ / NR_CPUS; -} - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +/* + * Called from the migration threads. + * + * Returns a time in jiffies that specifies when we need to call this + * function next. 
+ */ +static unsigned long rebalance_domains(void) { - unsigned long this_load, interval, j = cpu_offset(this_cpu); + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum idle_type idle; + unsigned long this_load, interval; struct sched_domain *sd; int i, scale; + /* Maximum interval between calls to rebalance_domains */ + unsigned long next_balance = jiffies + HZ; + /* + * A queue is only truly idle if it is the idle queue and no task is runnable + */ + idle = (current == this_rq->idle && !this_rq->nr_running) ? + SCHED_IDLE : NOT_IDLE; this_load = this_rq->raw_weighted_load; /* Update our load: */ @@ -2869,7 +2881,7 @@ rebalance_tick(int this_cpu, struct rq * if (unlikely(!interval)) interval = 1; - if (j - sd->last_balance >= interval) { + if (jiffies >= sd->next_balance) { if (load_balance(this_cpu, this_rq, sd, idle)) { /* * We've pulled tasks over so either we're no @@ -2878,39 +2890,33 @@ rebalance_tick(int this_cpu, struct rq * */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->next_balance += interval; } + next_balance = min(next_balance, sd->next_balance); } + return next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. 
*/ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -3057,16 +3063,14 @@ void scheduler_tick(void) rq->timestamp_last_tick = now; if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); + wake_priority_sleeper(rq); return; } /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3134,8 +3138,6 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); } #ifdef CONFIG_SCHED_SMT @@ -5001,6 +5003,7 @@ static int migration_thread(void *data) { int cpu = (long)data; struct rq *rq; + unsigned long next_domain_balance = jiffies + HZ / 2; rq = cpu_rq(cpu); BUG_ON(rq->migration_thread != current); @@ -5019,6 +5022,9 @@ static int migration_thread(void *data) goto wait_to_die; } + if (jiffies >= next_domain_balance) + next_domain_balance = rebalance_domains(); + if (rq->active_balance) { active_load_balance(rq, cpu); rq->active_balance = 0; @@ -5027,8 +5033,11 @@ static int migration_thread(void *data) head = &rq->migration_queue; if (list_empty(head)) { + int interval; + spin_unlock_irq(&rq->lock); - schedule(); + interval = next_domain_balance - jiffies; + schedule_timeout(max(interval, 1)); set_current_state(TASK_INTERRUPTIBLE); continue; } @@ -6430,6 +6439,12 @@ static void init_sched_groups_power(int } while (group != child->groups); } +/* Don't have all balancing operations going off at once: */ +static inline unsigned long cpu_offset(int cpu) +{ + return cpu * HZ / NR_CPUS; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -6486,6 +6501,7 @@ static int build_sched_domains(const cpu sd->span = *cpu_map; group = cpu_to_allnodes_group(i, cpu_map); sd->groups = 
&sched_group_allnodes[group]; + sd->next_balance = jiffies + cpu_offset(i); p = sd; } else p = NULL; @@ -6494,6 +6510,7 @@ static int build_sched_domains(const cpu *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; + sd->next_balance = jiffies + cpu_offset(i); if (p) p->child = sd; cpus_and(sd->span, sd->span, *cpu_map); @@ -6505,6 +6522,7 @@ static int build_sched_domains(const cpu *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; + sd->next_balance = jiffies + cpu_offset(i); if (p) p->child = sd; sd->groups = &sched_group_phys[group]; @@ -6517,6 +6535,7 @@ static int build_sched_domains(const cpu sd->span = cpu_coregroup_map(i); cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + sd->next_balance = jiffies + cpu_offset(i); p->child = sd; sd->groups = &sched_group_core[group]; #endif @@ -6529,6 +6548,7 @@ static int build_sched_domains(const cpu sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; + sd->next_balance = jiffies + cpu_offset(i); p->child = sd; sd->groups = &sched_group_cpus[group]; #endif Index: linux-2.6.19-rc2-mm2/include/asm-ia64/topology.h =================================================================== --- linux-2.6.19-rc2-mm2.orig/include/asm-ia64/topology.h 2006-10-13 11:25:04.000000000 -0500 +++ linux-2.6.19-rc2-mm2/include/asm-ia64/topology.h 2006-10-23 18:57:14.125640962 -0500 @@ -76,7 +76,7 @@ void build_cpu_to_node_map(void); | SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE, \ - .last_balance = jiffies, \ + .next_balance = jiffies + 1, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -102,7 +102,7 @@ void build_cpu_to_node_map(void); | SD_BALANCE_EXEC \ | SD_BALANCE_FORK \ | SD_WAKE_BALANCE, \ - .last_balance = jiffies, \ + .next_balance = jiffies + 8, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ } Index: linux-2.6.19-rc2-mm2/include/linux/sched.h =================================================================== --- 
linux-2.6.19-rc2-mm2.orig/include/linux/sched.h 2006-10-23 15:18:20.000000000 -0500 +++ linux-2.6.19-rc2-mm2/include/linux/sched.h 2006-10-23 18:55:16.389056295 -0500 @@ -692,7 +692,7 @@ struct sched_domain { int flags; /* See SD_* */ /* Runtime fields. */ - unsigned long last_balance; /* init to jiffies. units in jiffies */ + unsigned long next_balance; /* init to jiffies. units in jiffies */ unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ Index: linux-2.6.19-rc2-mm2/include/linux/topology.h =================================================================== --- linux-2.6.19-rc2-mm2.orig/include/linux/topology.h 2006-10-13 11:25:04.000000000 -0500 +++ linux-2.6.19-rc2-mm2/include/linux/topology.h 2006-10-23 18:56:38.783765036 -0500 @@ -108,7 +108,7 @@ | SD_WAKE_AFFINE \ | SD_WAKE_IDLE \ | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ + .next_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -140,7 +140,7 @@ | SD_WAKE_AFFINE \ | SD_SHARE_PKG_RESOURCES\ | BALANCE_FOR_MC_POWER, \ - .last_balance = jiffies, \ + .next_balance = jiffies + 1, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -170,7 +170,7 @@ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ | BALANCE_FOR_PKG_POWER,\ - .last_balance = jiffies, \ + .next_balance = jiffies + 1, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ } @@ -195,7 +195,7 @@ .forkexec_idx = 0, /* unused */ \ .per_cpu_gain = 100, \ .flags = SD_LOAD_BALANCE, \ - .last_balance = jiffies, \ + .next_balance = jiffies + 64, \ .balance_interval = 64, \ .nr_balance_failed = 0, \ }