[PATCH] rebalance from migration threads instead of from the timer tick

Load balancing has the potential of running for some time if, for example, the sched_domains of a system with 1024 processors have to be balanced. We currently do all of that with interrupts disabled, and this may result in long interrupt holdoffs. Most of that time is potentially spent in rebalance_tick.

This patch splits rebalance_tick off from scheduler_tick and uses the migration threads to run sched domain balancing as needed. If we run the balancing from the migration threads then we run with interrupts enabled, so we need to change the run queue locking to disable interrupts, since the timer interrupt may require run queue locks during time slice processing (which we leave in the timer tick).

Successfully completed an AIM7 run with it.

Signed-off-by: Christoph Lameter

Index: linux-2.6.19-rc2-mm1/kernel/sched.c
===================================================================
--- linux-2.6.19-rc2-mm1.orig/kernel/sched.c	2006-10-20 15:45:16.015151170 -0500
+++ linux-2.6.19-rc2-mm1/kernel/sched.c	2006-10-20 16:23:24.697412453 -0500
@@ -2530,8 +2530,6 @@ static inline unsigned long minus_1_or_z
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
- *
- * Called with this_rq unlocked.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum idle_type idle)
@@ -2541,6 +2539,7 @@ static int load_balance(int this_cpu, st
         unsigned long imbalance;
         struct rq *busiest;
         cpumask_t cpus = CPU_MASK_ALL;
+        unsigned long flags;

         /*
          * When power savings policy is enabled for the parent domain, idle
@@ -2580,11 +2579,13 @@ redo:
          * still unbalanced. nr_moved simply stays zero, so it is
          * correctly treated as an imbalance.
          */
+        local_irq_save(flags);
         double_rq_lock(this_rq, busiest);
         nr_moved = move_tasks(this_rq, this_cpu, busiest,
                         minus_1_or_zero(busiest->nr_running),
                         imbalance, sd, idle, &all_pinned);
         double_rq_unlock(this_rq, busiest);
+        local_irq_restore(flags);

         /* All tasks on this runqueue were pinned by CPU affinity */
         if (unlikely(all_pinned)) {
@@ -2601,13 +2602,13 @@ redo:

         if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

-                spin_lock(&busiest->lock);
+                spin_lock_irqsave(&busiest->lock, flags);

                 /* don't kick the migration_thread, if the curr
                  * task on busiest cpu can't be moved to this_cpu
                  */
                 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
-                        spin_unlock(&busiest->lock);
+                        spin_unlock_irqrestore(&busiest->lock, flags);
                         all_pinned = 1;
                         goto out_one_pinned;
                 }
@@ -2617,7 +2618,7 @@ redo:
                         busiest->push_cpu = this_cpu;
                         active_balance = 1;
                 }
-                spin_unlock(&busiest->lock);
+                spin_unlock_irqrestore(&busiest->lock, flags);
                 if (active_balance)
                         wake_up_process(busiest->migration_thread);

@@ -2817,7 +2818,7 @@ static void active_load_balance(struct r
 }

 /*
- * rebalance_tick will get called every timer tick, on every CPU.
+ * rebalance_domains is called when needed from the migration thread.
  *
  * It checks each scheduling domain to see if it is due to be balanced,
  * and initiates a balancing operation if so.
@@ -2831,13 +2832,28 @@ static inline unsigned long cpu_offset(i
         return jiffies + cpu * HZ / NR_CPUS;
 }

-static void
-rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
+/*
+ * Called from the migration threads.
+ *
+ * Returns a time in jiffies that specifies when we need to call this
+ * function next.
+ */
+static unsigned long rebalance_domains(void)
 {
+        int this_cpu = smp_processor_id();
+        struct rq *this_rq = cpu_rq(this_cpu);
+        enum idle_type idle;
         unsigned long this_load, interval, j = cpu_offset(this_cpu);
         struct sched_domain *sd;
         int i, scale;
+        /* Maximum interval between calls to rebalance_domains */
+        unsigned long next_balance = j + HZ;
+        /*
+         * A queue is only truly idle if it is the idle queue and no task is runnable
+         */
+        idle = (current == this_rq->idle && !this_rq->nr_running) ?
+                        SCHED_IDLE : NOT_IDLE;

         this_load = this_rq->raw_weighted_load;

         /* Update our load: */
@@ -2880,37 +2896,31 @@ rebalance_tick(int this_cpu, struct rq *
                         }
                         sd->last_balance += interval;
                 }
+                next_balance = min(next_balance, sd->last_balance + j + interval);
         }
+        return next_balance;
 }
 #else
 /*
  * on UP we do not need to balance between CPUs:
  */
-static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
-{
-}
 static inline void idle_balance(int cpu, struct rq *rq)
 {
 }
 #endif

-static inline int wake_priority_sleeper(struct rq *rq)
+static inline void wake_priority_sleeper(struct rq *rq)
 {
-        int ret = 0;
-
 #ifdef CONFIG_SCHED_SMT
         spin_lock(&rq->lock);
         /*
          * If an SMT sibling task has been put to sleep for priority
          * reasons reschedule the idle task to see if it can now run.
          */
-        if (rq->nr_running) {
+        if (rq->nr_running)
                 resched_task(rq->idle);
-                ret = 1;
-        }
         spin_unlock(&rq->lock);
 #endif
-        return ret;
 }

 DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -3057,16 +3067,14 @@ void scheduler_tick(void)
         rq->timestamp_last_tick = now;

         if (p == rq->idle) {
-                if (wake_priority_sleeper(rq))
-                        goto out;
-                rebalance_tick(cpu, rq, SCHED_IDLE);
+                wake_priority_sleeper(rq);
                 return;
         }

         /* Task might have expired already, but not scheduled off yet */
         if (p->array != rq->active) {
                 set_tsk_need_resched(p);
-                goto out;
+                return;
         }
         spin_lock(&rq->lock);
         /*
@@ -3134,8 +3142,6 @@ void scheduler_tick(void)
         }
 out_unlock:
         spin_unlock(&rq->lock);
-out:
-        rebalance_tick(cpu, rq, NOT_IDLE);
 }

 #ifdef CONFIG_SCHED_SMT
@@ -5001,6 +5007,7 @@ static int migration_thread(void *data)
 {
         int cpu = (long)data;
         struct rq *rq;
+        unsigned long next_domain_balance = jiffies + HZ / 2;

         rq = cpu_rq(cpu);
         BUG_ON(rq->migration_thread != current);
@@ -5019,6 +5026,9 @@ static int migration_thread(void *data)
                         goto wait_to_die;
                 }

+                if (jiffies >= next_domain_balance)
+                        next_domain_balance = rebalance_domains();
+
                 if (rq->active_balance) {
                         active_load_balance(rq, cpu);
                         rq->active_balance = 0;
@@ -5027,8 +5037,11 @@ static int migration_thread(void *data)
                 head = &rq->migration_queue;

                 if (list_empty(head)) {
+                        int interval;
+
                         spin_unlock_irq(&rq->lock);
-                        schedule();
+                        interval = next_domain_balance - jiffies;
+                        schedule_timeout(max(interval, 1));
                         set_current_state(TASK_INTERRUPTIBLE);
                         continue;
                 }
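
For readers unfamiliar with the new control flow, here is a minimal, standalone userspace sketch of the deadline-driven loop that this patch moves into migration_thread(). It is illustrative only: names such as fake_jiffies(), rebalance_domains_stub() and the fixed rebalance interval are made up for this example and are not kernel APIs. The shape is the same, though: the thread calls the balancer only once its deadline has passed, takes the returned value as the new deadline, and then sleeps just long enough to wake up at that deadline, at least one tick, analogous to schedule_timeout(max(interval, 1)) in the patch.

/*
 * Standalone sketch of the deadline-driven rebalance loop.
 * Illustrative only; names are hypothetical, not kernel APIs.
 */
#include <stdio.h>
#include <time.h>
#include <unistd.h>

#define HZ 100			/* pretend timer ticks per second */

/* Pretend jiffies counter derived from the monotonic clock. */
static unsigned long fake_jiffies(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * HZ + ts.tv_nsec / (1000000000L / HZ);
}

/*
 * Stand-in for rebalance_domains(): do the balancing work and return the
 * tick at which it should run next.  The real function returns the earliest
 * sd->last_balance + interval over all domains, capped at one second ahead.
 */
static unsigned long rebalance_domains_stub(void)
{
	unsigned long now = fake_jiffies();

	printf("rebalancing at tick %lu\n", now);
	return now + HZ / 4;	/* pretend the next balance is due in 250ms */
}

int main(void)
{
	unsigned long next_domain_balance = fake_jiffies() + HZ / 2;
	int rounds = 5;

	while (rounds-- > 0) {
		long interval;

		/* Only balance once the deadline has actually passed. */
		if (fake_jiffies() >= next_domain_balance)
			next_domain_balance = rebalance_domains_stub();

		/* Sleep until the next deadline, but at least one tick. */
		interval = (long)(next_domain_balance - fake_jiffies());
		if (interval < 1)
			interval = 1;
		usleep((useconds_t)interval * (1000000 / HZ));
	}
	return 0;
}

Compared with calling rebalance_tick() on every timer tick with interrupts off, the thread above only wakes when balancing is actually due. That is also why the run queue locking in load_balance() in this patch has to disable interrupts itself (local_irq_save() around double_rq_lock(), and spin_lock_irqsave() on the busiest queue): the balancer no longer runs from interrupt context, yet the timer interrupt can still take the same run queue locks.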