---
 Documentation/sched-design.txt  |  234 ++++++
 Documentation/sysctl/kernel.txt |   30 
 fs/pipe.c                       |    7 
 fs/proc/array.c                 |    2 
 include/linux/init_task.h       |    4 
 include/linux/sched.h           |   32 
 kernel/kthread.c                |    1 
 kernel/sched.c                  | 1345 +++++++++++++++++++---------------------
 kernel/softirq.c                |    2 
 kernel/sysctl.c                 |   35 -
 kernel/workqueue.c              |    2 
 11 files changed, 978 insertions(+), 716 deletions(-)

Index: linux-2.6.22-rc2-ck1/include/linux/sched.h
===================================================================
--- linux-2.6.22-rc2-ck1.orig/include/linux/sched.h	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/include/linux/sched.h	2007-05-19 20:11:57.000000000 +1000
@@ -150,8 +150,7 @@ extern unsigned long weighted_cpuload(co
 #define EXIT_ZOMBIE		16
 #define EXIT_DEAD		32
 /* in tsk->state again */
-#define TASK_NONINTERACTIVE	64
-#define TASK_DEAD		128
+#define TASK_DEAD		64
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -537,8 +536,9 @@ struct signal_struct {
 
 #define MAX_USER_RT_PRIO	100
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define PRIO_RANGE		(40)
 
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define MAX_PRIO		(MAX_RT_PRIO + PRIO_RANGE)
 
 #define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
 #define rt_task(p)		rt_prio((p)->prio)
@@ -809,13 +809,6 @@ struct mempolicy;
 struct pipe_inode_info;
 struct uts_namespace;
 
-enum sleep_type {
-	SLEEP_NORMAL,
-	SLEEP_NONINTERACTIVE,
-	SLEEP_INTERACTIVE,
-	SLEEP_INTERRUPTED,
-};
-
 struct prio_array;
 
 struct task_struct {
@@ -835,20 +828,33 @@ struct task_struct {
 	int load_weight;	/* for niceness load balancing purposes */
 	int prio, static_prio, normal_prio;
 	struct list_head run_list;
+	/*
+	 * This bitmap shows what priorities this task has received quota
+	 * from for this major priority rotation on its current runqueue.
+	 */
+	DECLARE_BITMAP(bitmap, PRIO_RANGE + 1);
 	struct prio_array *array;
+	/* Which major runqueue rotation did this task run */
+	unsigned long rotation;
 
 	unsigned short ioprio;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	unsigned int btrace_seq;
 #endif
-	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
 	unsigned long long sched_time; /* sched_clock time spent running */
-	enum sleep_type sleep_type;
 
 	unsigned int policy;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice, first_time_slice;
+	/*
+	 * How much this task is entitled to run at the current priority
+	 * before being requeued at a lower priority.
+	 */
+	int time_slice;
+	/* Is this the very first time_slice this task has ever run. */
+	unsigned int first_time_slice;
+	/* How much this task receives at each priority level */
+	int quota;
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
Index: linux-2.6.22-rc2-ck1/kernel/sched.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/kernel/sched.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/kernel/sched.c	2007-05-19 20:12:08.000000000 +1000
@@ -16,6 +16,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-03-02	Staircase deadline scheduling policy by Con Kolivas
  */
 
 #include <linux/mm.h>
@@ -53,8 +54,9 @@
 #include <linux/kprobes.h>
 #include <linux/delayacct.h>
 #include <linux/reciprocal_div.h>
-
+#include <linux/log2.h>
 #include <asm/tlb.h>
+
 #include <asm/unistd.h>
 
 /*
@@ -84,147 +86,73 @@ unsigned long long __attribute__((weak))
 #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+#define SCHED_PRIO(p)		((p)+MAX_RT_PRIO)
 
-/*
- * Some helpers for converting nanosecond timing to jiffy resolution
- */
-#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
+/* Some helpers for converting to/from various scales. */
 #define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
- * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
- * Timeslices get refilled after they expire.
- */
-#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
-#define DEF_TIMESLICE		(100 * HZ / 1000)
-#define ON_RUNQUEUE_WEIGHT	 30
-#define CHILD_PENALTY		 95
-#define PARENT_PENALTY		100
-#define EXIT_WEIGHT		  3
-#define PRIO_BONUS_RATIO	 25
-#define MAX_BONUS		(MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
-#define INTERACTIVE_DELTA	  2
-#define MAX_SLEEP_AVG		(DEF_TIMESLICE * MAX_BONUS)
-#define STARVATION_LIMIT	(MAX_SLEEP_AVG)
-#define NS_MAX_SLEEP_AVG	(JIFFIES_TO_NS(MAX_SLEEP_AVG))
-
-/*
- * If a task is 'interactive' then we reinsert it in the active
- * array after it has expired its current timeslice. (it will not
- * continue to run immediately, it will still roundrobin with
- * other interactive tasks.)
- *
- * This part scales the interactivity limit depending on niceness.
- *
- * We scale it linearly, offset by the INTERACTIVE_DELTA delta.
- * Here are a few examples of different nice levels:
- *
- *  TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
- *  TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
- *  TASK_INTERACTIVE(  0): [1,1,1,1,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
- *  TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
- *
- * (the X axis represents the possible -5 ... 0 ... +5 dynamic
- *  priority range a task can explore, a value of '1' means the
- *  task is rated interactive.)
- *
- * Ie. nice +19 tasks can never get 'interactive' enough to be
- * reinserted into the active array. And only heavily CPU-hog nice -20
- * tasks will be expired. Default nice 0 tasks are somewhere between,
- * it takes some effort for them to get interactive, but it's not
- * too hard.
- */
-
-#define CURRENT_BONUS(p) \
-	(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
-		MAX_SLEEP_AVG)
-
-#define GRANULARITY	(10 * HZ / 1000 ? : 1)
-
-#ifdef CONFIG_SMP
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
-			num_online_cpus())
-#else
-#define TIMESLICE_GRANULARITY(p)	(GRANULARITY * \
-		(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
-#endif
-
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
-		INTERACTIVE_DELTA)
-
-#define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
-
-#define INTERACTIVE_SLEEP(p) \
-	(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
-		(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
-
-#define TASK_PREEMPTS_CURR(p, rq) \
-	((p)->prio < (rq)->curr->prio)
-
-#define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
-
-static unsigned int static_prio_timeslice(int static_prio)
-{
-	if (static_prio < NICE_TO_PRIO(0))
-		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
-	else
-		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
-}
-
-#ifdef CONFIG_SMP
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
+#define MS_TO_NS(TIME)		((TIME) * 1000000)
+#define MS_TO_US(TIME)		((TIME) * 1000)
+#define US_TO_MS(TIME)		((TIME) / 1000)
+
+#define TASK_PREEMPTS_CURR(p, curr)	((p)->prio < (curr)->prio)
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 10ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+int rr_interval __read_mostly = 10;
+int sched_interactive __read_mostly = 1;
+
+/*
+ * This contains a bitmap for each dynamic priority level with empty slots
+ * for the valid priorities each different nice level can have. It allows
+ * us to stagger the slots where differing priorities run in a way that
+ * keeps latency differences between different nice levels at a minimum.
+ * The purpose of a pre-generated matrix is for rapid lookup of next slot in
+ * O(1) time without having to recalculate every time priority gets demoted.
+ * All nice levels use priority slot 39 as this allows less niced tasks to
+ * get all priority slots better than that before expiration is forced.
+ * ie, where 0 means a slot for that priority, priority running from left to
+ * right is from prio 0 to prio 39:
+ * nice -20 0000000000000000000000000000000000000000
+ * nice -10 1000100010001000100010001000100010010000
+ * nice   0 1010101010101010101010101010101010101010
+ * nice   5 1011010110110101101101011011010110110110
+ * nice  10 1110111011101110111011101110111011101110
+ * nice  15 1111111011111110111111101111111011111110
+ * nice  19 1111111111111111111111111111111111111110
  */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
-	return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
+static unsigned long prio_matrix[PRIO_RANGE][BITS_TO_LONGS(PRIO_RANGE)]
+				 __read_mostly;
 
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
-	sg->__cpu_power += val;
-	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
-#endif
-
-/*
- * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
- * to time slice values: [800ms ... 100ms ... 5ms]
- *
- * The higher a thread's priority, the bigger timeslices
- * it gets during one round of execution. But even the lowest
- * priority thread gets MIN_TIMESLICE worth of execution time.
- */
-
-static inline unsigned int task_timeslice(struct task_struct *p)
-{
-	return static_prio_timeslice(p->static_prio);
-}
+struct rq;
 
 /*
  * These are the runqueue data structures:
  */
-
 struct prio_array {
-	unsigned int nr_active;
-	DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
+	/* Tasks queued at each priority */
 	struct list_head queue[MAX_PRIO];
+
+	/*
+	 * The bitmap of priorities queued for this array. While the expired
+	 * array will never have realtime tasks on it, it is simpler to have
+	 * equal sized bitmaps for a cheap array swap. Include 1 bit for
+	 * delimiter.
+	 */
+	DECLARE_BITMAP(prio_bitmap, MAX_PRIO + 1);
+
+	/*
+	 * The best static priority (of the dynamic priority tasks) queued
+	 * on this array.
+	 */
+	int best_static_prio;
+
+#ifdef CONFIG_SMP
+	/* For convenience looks back at rq */
+	struct rq *rq;
+#endif
 };
 
 /*
@@ -260,14 +188,24 @@ struct rq {
 	 */
 	unsigned long nr_uninterruptible;
 
-	unsigned long expired_timestamp;
 	/* Cached timestamp set by update_cpu_clock() */
 	unsigned long long most_recent_timestamp;
 	struct task_struct *curr, *idle;
 	unsigned long next_balance;
 	struct mm_struct *prev_mm;
+
 	struct prio_array *active, *expired, arrays[2];
-	int best_expired_prio;
+	unsigned long *dyn_bitmap, *exp_bitmap;
+
+	/*
+	 * The current dynamic priority level this runqueue is at per static
+	 * priority level.
+	 */
+	int prio_level[PRIO_RANGE];
+
+	/* How many times we have rotated the priority queue */
+	unsigned long prio_rotation;
+
 	atomic_t nr_iowait;
 
 #ifdef CONFIG_SMP
@@ -606,12 +544,9 @@ static inline struct rq *this_rq_lock(vo
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
- * the cpu.  We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue.  (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
+ * the cpu.  We should note that the expired queue will become the active
+ * queue after the active queue is empty, without explicitly dequeuing and
+ * requeuing tasks in the expired queue.
  *
  * This function is only called from sched_info_arrive(), rather than
  * dequeue_task(). Even though a task may be queued and dequeued multiple
@@ -709,71 +644,239 @@ sched_info_switch(struct task_struct *pr
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
+static inline int task_queued(struct task_struct *task)
+{
+	return !list_empty(&task->run_list);
+}
+
+static inline void set_dynamic_bit(struct task_struct *p, struct rq *rq)
+{
+	__set_bit(p->prio, p->array->prio_bitmap);
+}
+
 /*
- * Adding/removing a task to/from a priority array:
+ * Removing from a runqueue.
  */
-static void dequeue_task(struct task_struct *p, struct prio_array *array)
+static void dequeue_task(struct task_struct *p, struct rq *rq)
 {
-	array->nr_active--;
-	list_del(&p->run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
+	list_del_init(&p->run_list);
+	if (list_empty(p->array->queue + p->prio))
+		__clear_bit(p->prio, p->array->prio_bitmap);
 }
 
-static void enqueue_task(struct task_struct *p, struct prio_array *array)
+static void reset_first_time_slice(struct task_struct *p)
 {
-	sched_info_queued(p);
-	list_add_tail(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
+	if (unlikely(p->first_time_slice))
+		p->first_time_slice = 0;
+}
+
+/*
+ * The task is being queued on a fresh array so it has its entitlement
+ * bitmap cleared.
+ */
+static void task_new_array(struct task_struct *p, struct rq *rq,
+			   struct prio_array *array)
+{
+	bitmap_zero(p->bitmap, PRIO_RANGE);
+	p->rotation = rq->prio_rotation;
+	p->time_slice = p->quota;
 	p->array = array;
+	reset_first_time_slice(p);
+}
+
+/* Find the first slot from the relevant prio_matrix entry */
+static int first_prio_slot(struct task_struct *p)
+{
+	if (unlikely(p->policy == SCHED_BATCH))
+		return p->static_prio;
+	return SCHED_PRIO(find_first_zero_bit(
+		prio_matrix[USER_PRIO(p->static_prio)], PRIO_RANGE));
 }
 
 /*
- * Put task to the end of the run list without the overhead of dequeue
- * followed by enqueue.
+ * In sched_interactive mode priority allocation occurs per process per rq
+ * array swap. In !sched_interactive mode all waking tasks must obey the
+ * current prio level of all other tasks running per array swap.
  */
-static void requeue_task(struct task_struct *p, struct prio_array *array)
+static int minprio(struct rq *rq, int uprio)
 {
-	list_move_tail(&p->run_list, array->queue + p->prio);
+	if (sched_interactive)
+		return MAX_RT_PRIO;
+	return rq->prio_level[uprio];
 }
 
-static inline void
-enqueue_task_head(struct task_struct *p, struct prio_array *array)
+/*
+ * Find the first unused slot by this task that is also in its prio_matrix
+ * level. SCHED_BATCH tasks do not use the priority matrix. They only take
+ * priority slots from their static_prio and above.
+ */
+static int next_entitled_slot(struct task_struct *p, struct rq *rq)
 {
-	list_add(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
-	array->nr_active++;
-	p->array = array;
+	int search_prio = MAX_RT_PRIO, uprio = USER_PRIO(p->static_prio);
+	struct prio_array *array = rq->active;
+	DECLARE_BITMAP(tmp, PRIO_RANGE);
+
+	/*
+	 * Go straight to expiration if there are higher priority tasks
+	 * already expired.
+	 */
+	if (p->static_prio > rq->expired->best_static_prio)
+		return MAX_PRIO;
+	if (!rq->prio_level[uprio])
+		rq->prio_level[uprio] = MAX_RT_PRIO;
+	/*
+	 * Only priorities equal to the prio_level and above for their
+	 * static_prio are acceptable, and only if it's not better than
+	 * a queued better static_prio's prio_level.
+	 */
+	if (p->static_prio < array->best_static_prio) {
+		if (likely(p->policy != SCHED_BATCH))
+			array->best_static_prio = p->static_prio;
+	} else if (p->static_prio == array->best_static_prio) {
+		search_prio = minprio(rq, uprio);
+	} else {
+		int i;
+
+		search_prio = minprio(rq, uprio);
+		/* A bounded O(n) loop, worst case n is 40 */
+		for (i = array->best_static_prio; i <= p->static_prio ; i++) {
+			if (!rq->prio_level[USER_PRIO(i)])
+				rq->prio_level[USER_PRIO(i)] = MAX_RT_PRIO;
+			search_prio = max(search_prio,
+				      rq->prio_level[USER_PRIO(i)]);
+		}
+	}
+	if (unlikely(p->policy == SCHED_BATCH)) {
+		search_prio = max(search_prio, p->static_prio);
+		return SCHED_PRIO(find_next_zero_bit(p->bitmap, PRIO_RANGE,
+				  USER_PRIO(search_prio)));
+	}
+	bitmap_or(tmp, p->bitmap, prio_matrix[uprio], PRIO_RANGE);
+	return SCHED_PRIO(find_next_zero_bit(tmp, PRIO_RANGE,
+		USER_PRIO(search_prio)));
+}
+
+/* Prepare p for the expired array: fresh quota, prio at its first slot. */
+static void queue_expired(struct task_struct *p, struct rq *rq)
+{
+	task_new_array(p, rq, rq->expired);	/* clears first_time_slice */
+	p->prio = p->normal_prio = first_prio_slot(p);
+	if (p->static_prio < rq->expired->best_static_prio)
+		rq->expired->best_static_prio = p->static_prio;
 }
 
+#ifdef CONFIG_SMP
 /*
- * __normal_prio - return the priority that is based on the static
- * priority but is modified by bonuses/penalties.
- *
- * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
- *
- * We use 25% of the full 0...39 priority range so that:
- *
- * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
- *
- * Both properties are important to certain workloads.
+ * If we're waking up a task that was previously on a different runqueue,
+ * update its data appropriately. Note we may be reading data from src_rq->
+ * outside of lock, but the occasional inaccurate result should be harmless.
+ */
+static void update_if_moved(struct task_struct *p, struct rq *rq)
+{
+	struct rq *src_rq = p->array->rq;
+
+	if (src_rq == rq)
+		return;
+	/*
+	 * Only need to set p->array when p->rotation == rq->prio_rotation as
+	 * they will be set in recalc_task_prio when != rq->prio_rotation.
+	 */
+	if (p->rotation == src_rq->prio_rotation) {
+		p->rotation = rq->prio_rotation;
+		if (p->array == src_rq->expired)
+			p->array = rq->expired;
+		else
+			p->array = rq->active;
+	} else
+		p->rotation = 0;
+}
+#else
+static inline void update_if_moved(struct task_struct *p, struct rq *rq)
+{
+}
+#endif
+
+/*
+ * recalc_task_prio determines what priority a non rt_task will be
+ * queued at. If the task has already been running during this runqueue's
+ * major rotation (rq->prio_rotation) then it continues at the same
+ * priority if it has tick entitlement left. If it does not have entitlement
+ * left, it finds the next priority slot according to its nice value that it
+ * has not extracted quota from. If it has not run during this major
+ * rotation, it starts at the next_entitled_slot and has its bitmap quota
+ * cleared. If it does not have any slots left it has all its slots reset and
+ * is queued on the expired array at its first_prio_slot.
  */
+static void recalc_task_prio(struct task_struct *p, struct rq *rq)
+{
+	struct prio_array *array = rq->active;
+	int queue_prio;
+
+	update_if_moved(p, rq);
+	if (p->rotation == rq->prio_rotation) {
+		if (p->array == array) {
+			if (p->time_slice > 0)
+				return;
+			p->time_slice = p->quota;
+		} else if (p->array == rq->expired) {
+			queue_expired(p, rq);
+			return;
+		} else
+			task_new_array(p, rq, array);
+	} else
+		task_new_array(p, rq, array);
+
+	queue_prio = next_entitled_slot(p, rq);
+	if (queue_prio >= MAX_PRIO) {
+		queue_expired(p, rq);
+		return;
+	}
+	p->prio = p->normal_prio = queue_prio;
+	__set_bit(USER_PRIO(p->prio), p->bitmap);
+}
 
-static inline int __normal_prio(struct task_struct *p)
+/*
+ * Adding to a runqueue. The dynamic priority queue that it is added to is
+ * determined by recalc_task_prio() above.
+ */
+static inline void __enqueue_task(struct task_struct *p, struct rq *rq)
 {
-	int bonus, prio;
+	if (rt_task(p))
+		p->array = rq->active;
+	else
+		recalc_task_prio(p, rq);
 
-	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
+	sched_info_queued(p);
+	set_dynamic_bit(p, rq);
+}
 
-	prio = p->static_prio - bonus;
-	if (prio < MAX_RT_PRIO)
-		prio = MAX_RT_PRIO;
-	if (prio > MAX_PRIO-1)
-		prio = MAX_PRIO-1;
-	return prio;
+static void enqueue_task(struct task_struct *p, struct rq *rq)
+{
+	__enqueue_task(p, rq);
+	list_add_tail(&p->run_list, p->array->queue + p->prio);
+}
+
+static inline void enqueue_task_head(struct task_struct *p, struct rq *rq)
+{
+	__enqueue_task(p, rq);
+	list_add(&p->run_list, p->array->queue + p->prio);
+}
+
+/*
+ * requeue_task is only called when p->static_prio does not change. p->prio
+ * can change with dynamic tasks.
+ */
+static void requeue_task(struct task_struct *p, struct rq *rq,
+			 struct prio_array *old_array, int old_prio)
+{
+	if (p->array == rq->expired)
+		queue_expired(p, rq);
+	list_move_tail(&p->run_list, p->array->queue + p->prio);
+	if (!rt_task(p)) {
+		if (list_empty(old_array->queue + old_prio))
+			__clear_bit(old_prio, old_array->prio_bitmap);
+		set_dynamic_bit(p, rq);
+	}
 }
 
 /*
@@ -786,17 +889,24 @@ static inline int __normal_prio(struct t
  */
 
 /*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define LOAD_WEIGHT(lp) \
-	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
-	LOAD_WEIGHT(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
-	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
+ * task_timeslice - the total duration a task can run during one major
+ * rotation. Returns value in milliseconds as the smallest value can be 1.
+ */
+static int task_timeslice(struct task_struct *p)
+{
+	int quota_us = p->quota;	/* per-level quota, in us */
+
+	if (rt_task(p))
+		return US_TO_MS(quota_us);
+	return US_TO_MS(quota_us * (PRIO_RANGE - TASK_USER_PRIO(p)));
+}
+
+/*
+ * The load weight is basically the task_timeslice in ms. Realtime tasks are
+ * special cased to be proportionately larger than nice -20 by their
+ * rt_priority. The weight for rt tasks can only be arbitrary at best.
+ */
+#define RTPRIO_TO_LOAD_WEIGHT(rp)	(rr_interval * 20 * (40 + rp))
 
 static void set_load_weight(struct task_struct *p)
 {
@@ -813,7 +923,7 @@ static void set_load_weight(struct task_
 #endif
 			p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
 	} else
-		p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
+		p->load_weight = task_timeslice(p);
 }
 
 static inline void
@@ -841,28 +951,38 @@ static inline void dec_nr_running(struct
 }
 
 /*
- * Calculate the expected normal priority: i.e. priority
- * without taking RT-inheritance into account. Might be
- * boosted by interactivity modifiers. Changes upon fork,
- * setprio syscalls, and whenever the interactivity
- * estimator recalculates.
+ * __activate_task - move a task to the runqueue.
  */
-static inline int normal_prio(struct task_struct *p)
+static inline void __activate_task(struct task_struct *p, struct rq *rq)
+{
+	enqueue_task(p, rq);
+	inc_nr_running(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
 {
-	int prio;
+	enqueue_task_head(p, rq);
+	inc_nr_running(p, rq);
+}
 
+static inline int normal_prio(struct task_struct *p)
+{
 	if (has_rt_policy(p))
-		prio = MAX_RT_PRIO-1 - p->rt_priority;
+		return MAX_RT_PRIO-1 - p->rt_priority;
+	/* Other tasks all have normal_prio set in recalc_task_prio */
+	if (likely(p->prio >= MAX_RT_PRIO && p->prio < MAX_PRIO))
+		return p->prio;
 	else
-		prio = __normal_prio(p);
-	return prio;
+		return p->static_prio;
 }
 
 /*
  * Calculate the current priority, i.e. the priority
  * taken into account by the scheduler. This value might
- * be boosted by RT tasks, or might be boosted by
- * interactivity modifiers. Will be RT if the task got
+ * be boosted by RT tasks; it will be RT if the task got
  * RT-boosted. If not then it returns p->normal_prio.
  */
 static int effective_prio(struct task_struct *p)
@@ -879,111 +999,41 @@ static int effective_prio(struct task_st
 }
 
 /*
- * __activate_task - move a task to the runqueue.
+ * All tasks have quotas based on rr_interval. RT tasks all get rr_interval.
+ * Tasks from nice 1 to 19 get half of rr_interval, but never less than 1ms.
+ * Below nice -6 quotas get progressively larger.
+ * ie nice -6..0 = rr_interval. nice -10 = 2.5 * rr_interval
+ * nice -20 = 10 * rr_interval. nice 1-19 = rr_interval / 2.
+ * Value returned is in microseconds.
  */
-static void __activate_task(struct task_struct *p, struct rq *rq)
+static inline unsigned int rr_quota(struct task_struct *p)
 {
-	struct prio_array *target = rq->active;
-
-	if (batch_task(p))
-		target = rq->expired;
-	enqueue_task(p, target);
-	inc_nr_running(p, rq);
-}
+	int nice = TASK_NICE(p), rr = rr_interval;
 
-/*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
-{
-	enqueue_task_head(p, rq->active);
-	inc_nr_running(p, rq);
+	if (!rt_task(p)) {
+		if (nice < -6) {
+			rr *= nice * nice;
+			rr /= 40;
+		} else if (nice > 0)
+			rr = rr / 2 ? : 1;
+	}
+	return MS_TO_US(rr);
 }
 
-/*
- * Recalculate p->normal_prio and p->prio after having slept,
- * updating the sleep-average too:
- */
-static int recalc_task_prio(struct task_struct *p, unsigned long long now)
+/* Every time we set the quota we need to set the load weight */
+static void set_quota(struct task_struct *p)
 {
-	/* Caller must always ensure 'now >= p->timestamp' */
-	unsigned long sleep_time = now - p->timestamp;
-
-	if (batch_task(p))
-		sleep_time = 0;
-
-	if (likely(sleep_time > 0)) {
-		/*
-		 * This ceiling is set to the lowest priority that would allow
-		 * a task to be reinserted into the active array on timeslice
-		 * completion.
-		 */
-		unsigned long ceiling = INTERACTIVE_SLEEP(p);
-
-		if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
-			/*
-			 * Prevents user tasks from achieving best priority
-			 * with one single large enough sleep.
-			 */
-			p->sleep_avg = ceiling;
-			/*
-			 * Using INTERACTIVE_SLEEP() as a ceiling places a
-			 * nice(0) task 1ms sleep away from promotion, and
-			 * gives it 700ms to round-robin with no chance of
-			 * being demoted.  This is more than generous, so
-			 * mark this sleep as non-interactive to prevent the
-			 * on-runqueue bonus logic from intervening should
-			 * this task not receive cpu immediately.
-			 */
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-		} else {
-			/*
-			 * Tasks waking from uninterruptible sleep are
-			 * limited in their sleep_avg rise as they
-			 * are likely to be waiting on I/O
-			 */
-			if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
-				if (p->sleep_avg >= ceiling)
-					sleep_time = 0;
-				else if (p->sleep_avg + sleep_time >=
-					 ceiling) {
-						p->sleep_avg = ceiling;
-						sleep_time = 0;
-				}
-			}
-
-			/*
-			 * This code gives a bonus to interactive tasks.
-			 *
-			 * The boost works by updating the 'average sleep time'
-			 * value here, based on ->timestamp. The more time a
-			 * task spends sleeping, the higher the average gets -
-			 * and the higher the priority boost gets as well.
-			 */
-			p->sleep_avg += sleep_time;
-
-		}
-		if (p->sleep_avg > NS_MAX_SLEEP_AVG)
-			p->sleep_avg = NS_MAX_SLEEP_AVG;
-	}
-
-	return effective_prio(p);
+	p->quota = rr_quota(p);
+	set_load_weight(p);
 }
 
 /*
  * activate_task - move a task to the runqueue and do priority recalculation
- *
- * Update all the scheduling statistics stuff. (sleep average
- * calculation, priority modifiers, etc.)
  */
 static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
-	unsigned long long now;
-
-	if (rt_task(p))
-		goto out;
+	unsigned long long now = sched_clock();
 
-	now = sched_clock();
 #ifdef CONFIG_SMP
 	if (!local) {
 		/* Compensate for drifting sched_clock */
@@ -1004,32 +1054,9 @@ static void activate_task(struct task_st
 				     (now - p->timestamp) >> 20);
 	}
 
-	p->prio = recalc_task_prio(p, now);
-
-	/*
-	 * This checks to make sure it's not an uninterruptible task
-	 * that is now waking up.
-	 */
-	if (p->sleep_type == SLEEP_NORMAL) {
-		/*
-		 * Tasks which were woken up by interrupts (ie. hw events)
-		 * are most likely of interactive nature. So we give them
-		 * the credit of extending their sleep time to the period
-		 * of time they spend on the runqueue, waiting for execution
-		 * on a CPU, first time around:
-		 */
-		if (in_interrupt())
-			p->sleep_type = SLEEP_INTERRUPTED;
-		else {
-			/*
-			 * Normal first-time wakeups get a credit too for
-			 * on-runqueue time, but it will be weighted down:
-			 */
-			p->sleep_type = SLEEP_INTERACTIVE;
-		}
-	}
+	set_quota(p);
+	p->prio = effective_prio(p);
 	p->timestamp = now;
-out:
 	__activate_task(p, rq);
 }
 
@@ -1039,8 +1066,7 @@ out:
 static void deactivate_task(struct task_struct *p, struct rq *rq)
 {
 	dec_nr_running(p, rq);
-	dequeue_task(p, p->array);
-	p->array = NULL;
+	dequeue_task(p, rq);
 }
 
 /*
@@ -1133,7 +1159,7 @@ migrate_task(struct task_struct *p, int 
 	 * If the task is not on a runqueue (and not running), then
 	 * it is sufficient to simply update the task's cpu field.
 	 */
-	if (!p->array && !task_running(rq, p)) {
+	if (!task_queued(p) && !task_running(rq, p)) {
 		set_task_cpu(p, dest_cpu);
 		return 0;
 	}
@@ -1164,7 +1190,7 @@ void wait_task_inactive(struct task_stru
 repeat:
 	rq = task_rq_lock(p, &flags);
 	/* Must be off runqueue entirely, not preempted. */
-	if (unlikely(p->array || task_running(rq, p))) {
+	if (unlikely(task_queued(p) || task_running(rq, p))) {
 		/* If it's preempted, we yield.  It could be a while. */
 		preempted = !task_running(rq, p);
 		task_rq_unlock(rq, &flags);
@@ -1243,6 +1269,25 @@ static inline unsigned long cpu_avg_load
 }
 
 /*
+ * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
+ * Since cpu_power is a 'constant', we can use a reciprocal divide.
+ */
+static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
+{
+	return reciprocal_divide(load, sg->reciprocal_cpu_power);
+}
+
+/*
+ * Each time a sched group cpu_power is changed,
+ * we must compute its reciprocal value
+ */
+static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
+{
+	sg->__cpu_power += val;
+	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
+}
+
+/*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
  */
@@ -1439,6 +1484,31 @@ static inline int wake_idle(int cpu, str
 }
 #endif
 
+/*
+ * We need to have a special definition for an idle runqueue when testing
+ * for preemption on CONFIG_HOTPLUG_CPU as the idle task may be scheduled as
+ * a realtime task in sched_idle_next.
+ */
+#ifdef CONFIG_HOTPLUG_CPU
+#define rq_idle(rq)	((rq)->curr == (rq)->idle && !rt_task((rq)->curr))
+#else
+#define rq_idle(rq)	((rq)->curr == (rq)->idle)
+#endif
+
+static inline int task_preempts_curr(struct task_struct *p, struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+
+	return ((p->array == task_rq(p)->active &&
+		TASK_PREEMPTS_CURR(p, curr)) || rq_idle(rq));
+}
+
+static inline void try_preempt(struct task_struct *p, struct rq *rq)
+{
+	if (task_preempts_curr(p, rq))
+		resched_task(rq->curr);
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -1470,7 +1540,7 @@ static int try_to_wake_up(struct task_st
 	if (!(old_state & state))
 		goto out;
 
-	if (p->array)
+	if (task_queued(p))
 		goto out_running;
 
 	cpu = task_cpu(p);
@@ -1563,7 +1633,7 @@ out_set_cpu:
 		old_state = p->state;
 		if (!(old_state & state))
 			goto out;
-		if (p->array)
+		if (task_queued(p))
 			goto out_running;
 
 		this_cpu = smp_processor_id();
@@ -1572,25 +1642,9 @@ out_set_cpu:
 
 out_activate:
 #endif /* CONFIG_SMP */
-	if (old_state == TASK_UNINTERRUPTIBLE) {
+	if (old_state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible--;
-		/*
-		 * Tasks on involuntary sleep don't earn
-		 * sleep_avg beyond just interactive state.
-		 */
-		p->sleep_type = SLEEP_NONINTERACTIVE;
-	} else
-
-	/*
-	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up with their sleep average not weighted in an
-	 * interactive way.
-	 */
-		if (old_state & TASK_NONINTERACTIVE)
-			p->sleep_type = SLEEP_NONINTERACTIVE;
-
 
-	activate_task(p, rq, cpu == this_cpu);
 	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
@@ -1599,10 +1653,9 @@ out_activate:
 	 * the waker guarantees that the freshly woken up task is going
 	 * to be considered on this CPU.)
 	 */
-	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
-	}
+	activate_task(p, rq, cpu == this_cpu);
+	if (!sync || cpu != this_cpu)
+		try_preempt(p, rq);
 	success = 1;
 
 out_running:
@@ -1625,7 +1678,6 @@ int fastcall wake_up_state(struct task_s
 	return try_to_wake_up(p, state, 0);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p);
 /*
  * Perform scheduler related setup for a newly forked process p.
  * p is forked by current.
@@ -1653,7 +1705,6 @@ void fastcall sched_fork(struct task_str
 	p->prio = current->normal_prio;
 
 	INIT_LIST_HEAD(&p->run_list);
-	p->array = NULL;
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (unlikely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -1665,30 +1716,31 @@ void fastcall sched_fork(struct task_str
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	if (unlikely(p->policy == SCHED_FIFO))
+		goto out;
 	/*
 	 * Share the timeslice between parent and child, thus the
 	 * total amount of pending timeslices in the system doesn't change,
 	 * resulting in more scheduling fairness.
 	 */
 	local_irq_disable();
-	p->time_slice = (current->time_slice + 1) >> 1;
-	/*
-	 * The remainder of the first timeslice might be recovered by
-	 * the parent if the child exits early enough.
-	 */
-	p->first_time_slice = 1;
-	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
-	if (unlikely(!current->time_slice)) {
+	if (current->time_slice > 0) {
+		current->time_slice /= 2;
+		if (current->time_slice)
+			p->time_slice = current->time_slice;
+		else
+			p->time_slice = 1;
 		/*
-		 * This case is rare, it happens when the parent has only
-		 * a single jiffy left from its timeslice. Taking the
-		 * runqueue lock is not a problem.
+		 * The remainder of the first timeslice might be recovered by
+		 * the parent if the child exits early enough.
 		 */
-		current->time_slice = 1;
-		task_running_tick(cpu_rq(cpu), current);
-	}
+		p->first_time_slice = 1;
+	} else
+		p->time_slice = 0;
+
+	p->timestamp = sched_clock();
 	local_irq_enable();
+out:
 	put_cpu();
 }
 
@@ -1710,38 +1762,16 @@ void fastcall wake_up_new_task(struct ta
 	this_cpu = smp_processor_id();
 	cpu = task_cpu(p);
 
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive. The parent
-	 * (current) is done further down, under its lock.
-	 */
-	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-
-	p->prio = effective_prio(p);
-
 	if (likely(cpu == this_cpu)) {
+		activate_task(p, rq, 1);
 		if (!(clone_flags & CLONE_VM)) {
 			/*
 			 * The VM isn't cloned, so we're in a good position to
 			 * do child-runs-first in anticipation of an exec. This
 			 * usually avoids a lot of COW overhead.
 			 */
-			if (unlikely(!current->array))
-				__activate_task(p, rq);
-			else {
-				p->prio = current->prio;
-				p->normal_prio = current->normal_prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				inc_nr_running(p, rq);
-			}
 			set_need_resched();
-		} else
-			/* Run child last */
-			__activate_task(p, rq);
+		}
 		/*
 		 * We skip the following code due to cpu == this_cpu
 	 	 *
@@ -1758,19 +1788,16 @@ void fastcall wake_up_new_task(struct ta
 		 */
 		p->timestamp = (p->timestamp - this_rq->most_recent_timestamp)
 					+ rq->most_recent_timestamp;
-		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		activate_task(p, rq, 0);
+		try_preempt(p, rq);
 
 		/*
 		 * Parent and child are on different CPUs, now get the
-		 * parent runqueue to update the parent's ->sleep_avg:
+		 * parent runqueue to update the parent's ->flags:
 		 */
 		task_rq_unlock(rq, &flags);
 		this_rq = task_rq_lock(current, &flags);
 	}
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	task_rq_unlock(this_rq, &flags);
 }
 
@@ -1785,23 +1812,17 @@ void fastcall wake_up_new_task(struct ta
  */
 void fastcall sched_exit(struct task_struct *p)
 {
+	struct task_struct *parent;
 	unsigned long flags;
 	struct rq *rq;
 
-	/*
-	 * If the child was a (relative-) CPU hog then decrease
-	 * the sleep_avg of the parent as well.
-	 */
-	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
-		p->parent->time_slice += p->time_slice;
-		if (unlikely(p->parent->time_slice > task_timeslice(p)))
-			p->parent->time_slice = task_timeslice(p);
-	}
-	if (p->sleep_avg < p->parent->sleep_avg)
-		p->parent->sleep_avg = p->parent->sleep_avg /
-		(EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg /
-		(EXIT_WEIGHT + 1);
+	parent = p->parent;
+	rq = task_rq_lock(parent, &flags);
+	if (p->first_time_slice > 0 && task_cpu(p) == task_cpu(parent)) {
+		parent->time_slice += p->time_slice;
+		if (unlikely(parent->time_slice > parent->quota))
+			parent->time_slice = parent->quota;
+	}
 	task_rq_unlock(rq, &flags);
 }
 
@@ -2133,23 +2154,17 @@ void sched_exec(void)
  * pull_task - move a task from a remote runqueue to the local runqueue.
  * Both runqueues must be locked.
  */
-static void pull_task(struct rq *src_rq, struct prio_array *src_array,
-		      struct task_struct *p, struct rq *this_rq,
-		      struct prio_array *this_array, int this_cpu)
+static void pull_task(struct rq *src_rq, struct task_struct *p,
+		      struct rq *this_rq, int this_cpu)
 {
-	dequeue_task(p, src_array);
+	dequeue_task(p, src_rq);
 	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
-	enqueue_task(p, this_array);
+	enqueue_task(p, this_rq);
 	p->timestamp = (p->timestamp - src_rq->most_recent_timestamp)
 				+ this_rq->most_recent_timestamp;
-	/*
-	 * Note that idle threads have a prio of MAX_PRIO, for this test
-	 * to be always true for them.
-	 */
-	if (TASK_PREEMPTS_CURR(p, this_rq))
-		resched_task(this_rq->curr);
+	try_preempt(p, this_rq);
 }
 
 /*
@@ -2192,7 +2207,16 @@ int can_migrate_task(struct task_struct 
 	return 1;
 }
 
-#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
+static inline int rq_best_prio(struct rq *rq)
+{
+	int best_prio, exp_prio;
+
+	best_prio = sched_find_first_bit(rq->dyn_bitmap);
+	exp_prio = find_next_bit(rq->exp_bitmap, MAX_PRIO, MAX_RT_PRIO);
+	if (unlikely(best_prio > exp_prio))
+		best_prio = exp_prio;
+	return best_prio;
+}
 
 /*
  * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
@@ -2208,7 +2232,7 @@ static int move_tasks(struct rq *this_rq
 {
 	int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
 	    best_prio_seen, skip_for_load;
-	struct prio_array *array, *dst_array;
+	struct prio_array *array;
 	struct list_head *head, *curr;
 	struct task_struct *tmp;
 	long rem_load_move;
@@ -2235,26 +2259,21 @@ static int move_tasks(struct rq *this_rq
 	 * be cache-cold, thus switching CPUs has the least effect
 	 * on them.
 	 */
-	if (busiest->expired->nr_active) {
-		array = busiest->expired;
-		dst_array = this_rq->expired;
-	} else {
-		array = busiest->active;
-		dst_array = this_rq->active;
-	}
-
+	array = busiest->expired;
 new_array:
-	/* Start searching at priority 0: */
-	idx = 0;
+	/* Expired arrays don't have RT tasks so they're always MAX_RT_PRIO+ */
+	if (array == busiest->expired)
+		idx = MAX_RT_PRIO;
+	else
+		idx = 0;
 skip_bitmap:
 	if (!idx)
-		idx = sched_find_first_bit(array->bitmap);
+		idx = sched_find_first_bit(array->prio_bitmap);
 	else
-		idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
+		idx = find_next_bit(array->prio_bitmap, MAX_PRIO, idx);
 	if (idx >= MAX_PRIO) {
-		if (array == busiest->expired && busiest->active->nr_active) {
+		if (array == busiest->expired) {
 			array = busiest->active;
-			dst_array = this_rq->active;
 			goto new_array;
 		}
 		goto out;
@@ -2285,7 +2304,7 @@ skip_queue:
 		goto skip_bitmap;
 	}
 
-	pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
+	pull_task(busiest, tmp, this_rq, this_cpu);
 	pulled++;
 	rem_load_move -= tmp->load_weight;
 
@@ -3242,11 +3261,36 @@ EXPORT_PER_CPU_SYMBOL(kstat);
 /*
  * This is called on clock ticks and on context switches.
  * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ * The value returned from sched_clock() occasionally gives bogus values so
+ * some sanity checking is required.
  */
-static inline void
-update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
+static void
+update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now,
+		 int tick)
 {
-	p->sched_time += now - p->last_ran;
+	long time_diff = now - p->last_ran;
+
+	if (tick) {
+		/*
+		 * Called from scheduler_tick() there should be less than two
+		 * jiffies worth, and not negative/overflow.
+		 */
+		if (time_diff > JIFFIES_TO_NS(2) || time_diff < 0)
+			time_diff = JIFFIES_TO_NS(1);
+	} else {
+		/*
+		 * Called from context_switch there should be less than one
+		 * jiffy worth, and not negative/overflow. There should be
+		 * some time banked here so use a nominal 1us.
+		 */
+		if (time_diff > JIFFIES_TO_NS(1) || time_diff < 1)
+			time_diff = 1000;
+	}
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (p != rq->idle && p->policy != SCHED_FIFO)
+		p->time_slice -= time_diff / 1000;
+	p->sched_time += time_diff;
 	p->last_ran = rq->most_recent_timestamp = now;
 }
 
@@ -3267,27 +3311,6 @@ unsigned long long current_sched_time(co
 }
 
 /*
- * We place interactive tasks back into the active array, if possible.
- *
- * To guarantee that this does not starve expired tasks we ignore the
- * interactivity of a task if the first expired task had to wait more
- * than a 'reasonable' amount of time. This deadline timeout is
- * load-dependent, as the frequency of array switched decreases with
- * increasing number of running tasks. We also ignore the interactivity
- * if a better static_prio task has expired:
- */
-static inline int expired_starving(struct rq *rq)
-{
-	if (rq->curr->static_prio > rq->best_expired_prio)
-		return 1;
-	if (!STARVATION_LIMIT || !rq->expired_timestamp)
-		return 0;
-	if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
-		return 1;
-	return 0;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3360,87 +3383,47 @@ void account_steal_time(struct task_stru
 		cpustat->steal = cputime64_add(cpustat->steal, tmp);
 }
 
-static void task_running_tick(struct rq *rq, struct task_struct *p)
+/*
+ * The task has used up its quota of running in this prio_level so it must be
+ * dropped a priority level, all managed by recalc_task_prio().
+ */
+static void task_expired_entitlement(struct rq *rq, struct task_struct *p)
 {
-	if (p->array != rq->active) {
-		/* Task has expired but was not scheduled yet */
-		set_tsk_need_resched(p);
+	int overrun;
+
+	reset_first_time_slice(p);
+	if (rt_task(p)) {
+		p->time_slice += p->quota;
+		list_move_tail(&p->run_list, p->array->queue + p->prio);
 		return;
 	}
-	spin_lock(&rq->lock);
+	overrun = p->time_slice;
+	dequeue_task(p, rq);
+	enqueue_task(p, rq);
 	/*
-	 * The task was running during this tick - update the
-	 * time slice counter. Note: we do not update a thread's
-	 * priority until it either goes to sleep or uses up its
-	 * timeslice. This makes it possible for interactive tasks
-	 * to use up their timeslices at their highest priority levels.
+	 * Subtract any extra time this task ran over its time_slice; i.e.
+	 * overrun will either be 0 or negative.
 	 */
-	if (rt_task(p)) {
-		/*
-		 * RR tasks need a special form of timeslice management.
-		 * FIFO tasks have no timeslices.
-		 */
-		if ((p->policy == SCHED_RR) && !--p->time_slice) {
-			p->time_slice = task_timeslice(p);
-			p->first_time_slice = 0;
-			set_tsk_need_resched(p);
-
-			/* put it at the end of the queue: */
-			requeue_task(p, rq->active);
-		}
-		goto out_unlock;
-	}
-	if (!--p->time_slice) {
-		dequeue_task(p, rq->active);
-		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
-		p->time_slice = task_timeslice(p);
-		p->first_time_slice = 0;
-
-		if (!rq->expired_timestamp)
-			rq->expired_timestamp = jiffies;
-		if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
-			enqueue_task(p, rq->expired);
-			if (p->static_prio < rq->best_expired_prio)
-				rq->best_expired_prio = p->static_prio;
-		} else
-			enqueue_task(p, rq->active);
-	} else {
-		/*
-		 * Prevent a too long timeslice allowing a task to monopolize
-		 * the CPU. We do this by splitting up the timeslice into
-		 * smaller pieces.
-		 *
-		 * Note: this does not mean the task's timeslices expire or
-		 * get lost in any way, they just might be preempted by
-		 * another task of equal priority. (one with higher
-		 * priority would have preempted this task already.) We
-		 * requeue this task to the end of the list on this priority
-		 * level, which is in essence a round-robin of tasks with
-		 * equal priority.
-		 *
-		 * This only applies to tasks in the interactive
-		 * delta range with at least TIMESLICE_GRANULARITY to requeue.
-		 */
-		if (TASK_INTERACTIVE(p) && !((task_timeslice(p) -
-			p->time_slice) % TIMESLICE_GRANULARITY(p)) &&
-			(p->time_slice >= TIMESLICE_GRANULARITY(p)) &&
-			(p->array == rq->active)) {
+	p->time_slice += overrun;
+}
 
-			requeue_task(p, rq->active);
-			set_tsk_need_resched(p);
-		}
-	}
-out_unlock:
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq, struct task_struct *p)
+{
+	/* SCHED_FIFO tasks never run out of timeslice. */
+	if (p->time_slice > 0 || p->policy == SCHED_FIFO)
+		return;
+	/* p->time_slice <= 0 */
+	spin_lock(&rq->lock);
+	if (likely(task_queued(p)))
+		task_expired_entitlement(rq, p);
+	set_tsk_need_resched(p);
 	spin_unlock(&rq->lock);
 }
 
 /*
  * This function gets called by the timer code, with HZ frequency.
  * We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
  */
 void scheduler_tick(void)
 {
@@ -3450,7 +3433,7 @@ void scheduler_tick(void)
 	int idle_at_tick = idle_cpu(cpu);
 	struct rq *rq = cpu_rq(cpu);
 
-	update_cpu_clock(p, rq, now);
+	update_cpu_clock(p, rq, now, 1);
 
 	if (!idle_at_tick)
 		task_running_tick(rq, p);
@@ -3499,10 +3482,55 @@ EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
-static inline int interactive_sleep(enum sleep_type sleep_type)
+static void reset_prio_levels(struct rq *rq)
 {
-	return (sleep_type == SLEEP_INTERACTIVE ||
-		sleep_type == SLEEP_INTERRUPTED);
+	rq->active->best_static_prio = MAX_PRIO - 1;
+	rq->expired->best_static_prio = MAX_PRIO - 1;
+	memset(rq->prio_level, 0, sizeof(int) * PRIO_RANGE);
+}
+
+/*
+ * next_dynamic_task finds the next suitable dynamic task.
+ */
+static inline struct task_struct *next_dynamic_task(struct rq *rq, int idx)
+{
+	struct prio_array *array = rq->active;
+	struct task_struct *next;
+	struct list_head *queue;
+	int nstatic;
+
+retry:
+	if (idx >= MAX_PRIO) {
+		/* There are no more tasks in the active array. Swap arrays */
+		array = rq->expired;
+		rq->expired = rq->active;
+		rq->active = array;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->prio_rotation++;
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
+		reset_prio_levels(rq);
+	}
+	queue = array->queue + idx;
+	next = list_entry(queue->next, struct task_struct, run_list);
+	if (unlikely(next->time_slice <= 0)) {
+		/*
+		 * Unlucky enough that this task ran out of time_slice
+		 * before it hit a scheduler_tick so it should have its
+		 * priority reassessed and choose another task (possibly
+		 * the same one)
+		 */
+		task_expired_entitlement(rq, next);
+		idx = find_next_bit(rq->dyn_bitmap, MAX_PRIO, MAX_RT_PRIO);
+		goto retry;
+	}
+	next->rotation = rq->prio_rotation;
+	nstatic = next->static_prio;
+	if (nstatic < array->best_static_prio)
+		array->best_static_prio = nstatic;
+	if (idx > rq->prio_level[USER_PRIO(nstatic)])
+		rq->prio_level[USER_PRIO(nstatic)] = idx;
+	return next;
 }
 
 /*
@@ -3511,13 +3539,11 @@ static inline int interactive_sleep(enum
 asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
-	struct prio_array *array;
 	struct list_head *queue;
 	unsigned long long now;
-	unsigned long run_time;
-	int cpu, idx, new_prio;
 	long *switch_count;
 	struct rq *rq;
+	int cpu, idx;
 
 	/*
 	 * Test if we are atomic.  Since do_exit() needs to call into
@@ -3553,18 +3579,6 @@ need_resched_nonpreemptible:
 
 	schedstat_inc(rq, sched_cnt);
 	now = sched_clock();
-	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
-		run_time = now - prev->timestamp;
-		if (unlikely((long long)(now - prev->timestamp) < 0))
-			run_time = 0;
-	} else
-		run_time = NS_MAX_SLEEP_AVG;
-
-	/*
-	 * Tasks charged proportionately less run_time at high sleep_avg to
-	 * delay them losing their interactive status
-	 */
-	run_time /= (CURRENT_BONUS(prev) ? : 1);
 
 	spin_lock_irq(&rq->lock);
 
@@ -3586,59 +3600,29 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 		if (!rq->nr_running) {
 			next = rq->idle;
-			rq->expired_timestamp = 0;
 			goto switch_tasks;
 		}
 	}
 
-	array = rq->active;
-	if (unlikely(!array->nr_active)) {
-		/*
-		 * Switch the active and expired arrays.
-		 */
-		schedstat_inc(rq, sched_switch);
-		rq->active = rq->expired;
-		rq->expired = array;
-		array = rq->active;
-		rq->expired_timestamp = 0;
-		rq->best_expired_prio = MAX_PRIO;
+	idx = sched_find_first_bit(rq->dyn_bitmap);
+	if (!rt_prio(idx))
+		next = next_dynamic_task(rq, idx);
+	else {
+		queue = rq->active->queue + idx;
+		next = list_entry(queue->next, struct task_struct, run_list);
 	}
-
-	idx = sched_find_first_bit(array->bitmap);
-	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
-
-	if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
-		unsigned long long delta = now - next->timestamp;
-		if (unlikely((long long)(now - next->timestamp) < 0))
-			delta = 0;
-
-		if (next->sleep_type == SLEEP_INTERACTIVE)
-			delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
-
-		array = next->array;
-		new_prio = recalc_task_prio(next, next->timestamp + delta);
-
-		if (unlikely(next->prio != new_prio)) {
-			dequeue_task(next, array);
-			next->prio = new_prio;
-			enqueue_task(next, array);
-		}
-	}
-	next->sleep_type = SLEEP_NORMAL;
 switch_tasks:
-	if (next == rq->idle)
+	if (next == rq->idle) {
+		reset_prio_levels(rq);
+		rq->prio_rotation++;
 		schedstat_inc(rq, sched_goidle);
+	}
 	prefetch(next);
 	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
-	update_cpu_clock(prev, rq, now);
-
-	prev->sleep_avg -= run_time;
-	if ((long)prev->sleep_avg <= 0)
-		prev->sleep_avg = 0;
+	update_cpu_clock(prev, rq, now, 0);
 	prev->timestamp = prev->last_ran = now;
 
 	sched_info_switch(prev, next);
@@ -4074,29 +4058,22 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	struct prio_array *array;
 	unsigned long flags;
+	int queued, oldprio;
 	struct rq *rq;
-	int oldprio;
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
 	rq = task_rq_lock(p, &flags);
 
 	oldprio = p->prio;
-	array = p->array;
-	if (array)
-		dequeue_task(p, array);
+	queued = task_queued(p);
+	if (queued)
+		dequeue_task(p, rq);
 	p->prio = prio;
 
-	if (array) {
-		/*
-		 * If changing to an RT priority then queue it
-		 * in the active array!
-		 */
-		if (rt_task(p))
-			array = rq->active;
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
 		 * our priority decreased, or if we are not currently running on
@@ -4105,8 +4082,8 @@ void rt_mutex_setprio(struct task_struct
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else
+			try_preempt(p, rq);
 	}
 	task_rq_unlock(rq, &flags);
 }
@@ -4115,8 +4092,7 @@ void rt_mutex_setprio(struct task_struct
 
 void set_user_nice(struct task_struct *p, long nice)
 {
-	struct prio_array *array;
-	int old_prio, delta;
+	int queued, old_prio,delta;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -4137,20 +4113,20 @@ void set_user_nice(struct task_struct *p
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
-	array = p->array;
-	if (array) {
-		dequeue_task(p, array);
+	queued = task_queued(p);
+	if (queued) {
+		dequeue_task(p, rq);
 		dec_raw_weighted_load(rq, p);
 	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
-	set_load_weight(p);
 	old_prio = p->prio;
 	p->prio = effective_prio(p);
+	set_quota(p);
 	delta = p->prio - old_prio;
 
-	if (array) {
-		enqueue_task(p, array);
+	if (queued) {
+		enqueue_task(p, rq);
 		inc_raw_weighted_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
@@ -4226,11 +4202,23 @@ asmlinkage long sys_nice(int increment)
  *
  * This is the priority value as seen by users in /proc.
  * RT tasks are offset by -200. Normal tasks are centered
- * around 0, value goes from -16 to +15.
+ * around 1, value goes from 0 to +79. Values higher than
+ * 39 indicate task is on the expired array. This is done
+ * lockless and may rarely return an active instead of
+ * expired value.
  */
 int task_prio(const struct task_struct *p)
 {
-	return p->prio - MAX_RT_PRIO;
+	int prio = p->prio - MAX_RT_PRIO;
+
+	if (task_queued(p)) {
+		struct rq *rq = task_rq(p);
+		struct prio_array *array = p->array;
+
+		if (rq && rq->expired == array)
+			prio += PRIO_RANGE;
+	}
+	return prio;
 }
 
 /**
@@ -4273,19 +4261,14 @@ static inline struct task_struct *find_p
 /* Actually do priority change: must hold rq lock. */
 static void __setscheduler(struct task_struct *p, int policy, int prio)
 {
-	BUG_ON(p->array);
+	BUG_ON(task_queued(p));
 
 	p->policy = policy;
 	p->rt_priority = prio;
 	p->normal_prio = normal_prio(p);
 	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	/*
-	 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
-	 */
-	if (policy == SCHED_BATCH)
-		p->sleep_avg = 0;
-	set_load_weight(p);
+	set_quota(p);
 }
 
 /**
@@ -4299,8 +4282,7 @@ static void __setscheduler(struct task_s
 int sched_setscheduler(struct task_struct *p, int policy,
 		       struct sched_param *param)
 {
-	int retval, oldprio, oldpolicy = -1;
-	struct prio_array *array;
+	int queued, retval, oldprio, oldpolicy = -1;
 	unsigned long flags;
 	struct rq *rq;
 
@@ -4374,12 +4356,12 @@ recheck:
 		spin_unlock_irqrestore(&p->pi_lock, flags);
 		goto recheck;
 	}
-	array = p->array;
-	if (array)
+	queued = task_queued(p);
+	if (queued)
 		deactivate_task(p, rq);
 	oldprio = p->prio;
 	__setscheduler(p, policy, param->sched_priority);
-	if (array) {
+	if (queued) {
 		__activate_task(p, rq);
 		/*
 		 * Reschedule if we are currently running on this runqueue and
@@ -4389,8 +4371,8 @@ recheck:
 		if (task_running(rq, p)) {
 			if (p->prio > oldprio)
 				resched_task(rq->curr);
-		} else if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
+		} else
+			try_preempt(p, rq);
 	}
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4663,40 +4645,27 @@ asmlinkage long sys_sched_getaffinity(pi
  * sys_sched_yield - yield the current processor to other threads.
  *
  * This function yields the current CPU by moving the calling thread
- * to the expired array. If there are no other threads running on this
- * CPU then this function will return.
+ * to the expired array if SCHED_NORMAL or the end of its current priority
+ * queue if a realtime task. If there are no other threads running on this
+ * cpu this function will return.
  */
 asmlinkage long sys_sched_yield(void)
 {
 	struct rq *rq = this_rq_lock();
-	struct prio_array *array = current->array, *target = rq->expired;
+	struct task_struct *p = current;
 
 	schedstat_inc(rq, yld_cnt);
-	/*
-	 * We implement yielding by moving the task into the expired
-	 * queue.
-	 *
-	 * (special rule: RT tasks will just roundrobin in the active
-	 *  array.)
-	 */
-	if (rt_task(current))
-		target = rq->active;
+	if (rq->nr_running == 1)
+		schedstat_inc(rq, yld_both_empty);
+	else {
+		struct prio_array *old_array = p->array;
+		int old_prio = p->prio;
 
-	if (array->nr_active == 1) {
-		schedstat_inc(rq, yld_act_empty);
-		if (!rq->expired->nr_active)
-			schedstat_inc(rq, yld_both_empty);
-	} else if (!rq->expired->nr_active)
-		schedstat_inc(rq, yld_exp_empty);
-
-	if (array != target) {
-		dequeue_task(current, array);
-		enqueue_task(current, target);
-	} else
-		/*
-		 * requeue_task is cheaper so perform that if possible.
-		 */
-		requeue_task(current, array);
+		/* p->prio will be updated in requeue_task via queue_expired */
+		if (!rt_task(p))
+			p->array = rq->expired;
+		requeue_task(p, rq, old_array, old_prio);
+	}
 
 	/*
 	 * Since we are going to call schedule() anyway, there's
@@ -4906,8 +4875,8 @@ long sys_sched_rr_get_interval(pid_t pid
 	if (retval)
 		goto out_unlock;
 
-	jiffies_to_timespec(p->policy == SCHED_FIFO ?
-				0 : task_timeslice(p), &t);
+	t = ns_to_timespec(p->policy == SCHED_FIFO ? 0 :
+			   MS_TO_NS(task_timeslice(p)));
 	read_unlock(&tasklist_lock);
 	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
 out_nounlock:
@@ -5003,10 +4972,10 @@ void __cpuinit init_idle(struct task_str
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
-	idle->timestamp = sched_clock();
-	idle->sleep_avg = 0;
-	idle->array = NULL;
-	idle->prio = idle->normal_prio = MAX_PRIO;
+	bitmap_zero(idle->bitmap, PRIO_RANGE);
+	idle->timestamp = idle->last_ran = sched_clock();
+	idle->array = rq->active;
+	idle->prio = idle->normal_prio = NICE_TO_PRIO(0);
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -5125,7 +5094,7 @@ static int __migrate_task(struct task_st
 		goto out;
 
 	set_task_cpu(p, dest_cpu);
-	if (p->array) {
+	if (task_queued(p)) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
 		 * The same thing could be achieved by doing this step
@@ -5136,8 +5105,7 @@ static int __migrate_task(struct task_st
 				+ rq_dest->most_recent_timestamp;
 		deactivate_task(p, rq_src);
 		__activate_task(p, rq_dest);
-		if (TASK_PREEMPTS_CURR(p, rq_dest))
-			resched_task(rq_dest->curr);
+		try_preempt(p, rq_dest);
 	}
 	ret = 1;
 out:
@@ -5434,7 +5402,7 @@ migration_call(struct notifier_block *nf
 		/* Idle task back to normal (off runqueue, low prio) */
 		rq = task_rq_lock(rq->idle, &flags);
 		deactivate_task(rq->idle, rq);
-		rq->idle->static_prio = MAX_PRIO;
+		rq->idle->static_prio = NICE_TO_PRIO(0);
 		__setscheduler(rq->idle, SCHED_NORMAL, 0);
 		migrate_dead_tasks(cpu);
 		task_rq_unlock(rq, &flags);
@@ -6960,6 +6928,13 @@ void __init sched_init_smp(void)
 	/* Move init over to a non-isolated CPU */
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
+
+	/*
+	 * Assume that every added cpu gives us slightly less overall latency
+	 * allowing us to increase the base rr_interval, but in a non-linear
+	 * fashion.
+	 */
+	rr_interval *= 1 + ilog2(num_online_cpus());
 }
 #else
 void __init sched_init_smp(void)
@@ -6982,6 +6957,16 @@ void __init sched_init(void)
 	int i, j, k;
 	int highest_cpu = 0;
 
+	/* Generate the priority matrix */
+	for (i = 0; i < PRIO_RANGE; i++) {
+		bitmap_fill(prio_matrix[i], PRIO_RANGE);
+		j = PRIO_RANGE * PRIO_RANGE / (PRIO_RANGE - i);
+		for (k = 0; k <= PRIO_RANGE * (PRIO_RANGE - 1); k += j) {
+			__clear_bit(PRIO_RANGE - 1 - (k / PRIO_RANGE),
+				    prio_matrix[i]);
+		}
+	}
+
 	for_each_possible_cpu(i) {
 		struct prio_array *array;
 		struct rq *rq;
@@ -6990,11 +6975,16 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		lockdep_set_class(&rq->lock, &rq->rq_lock_key);
 		rq->nr_running = 0;
+		rq->prio_rotation = 0;
 		rq->active = rq->arrays;
 		rq->expired = rq->arrays + 1;
-		rq->best_expired_prio = MAX_PRIO;
+		reset_prio_levels(rq);
+		rq->dyn_bitmap = rq->active->prio_bitmap;
+		rq->exp_bitmap = rq->expired->prio_bitmap;
 
 #ifdef CONFIG_SMP
+		rq->active->rq = rq;
+		rq->expired->rq = rq;
 		rq->sd = NULL;
 		for (j = 1; j < 3; j++)
 			rq->cpu_load[j] = 0;
@@ -7007,17 +6997,16 @@ void __init sched_init(void)
 		atomic_set(&rq->nr_iowait, 0);
 
 		for (j = 0; j < 2; j++) {
+
 			array = rq->arrays + j;
-			for (k = 0; k < MAX_PRIO; k++) {
+			for (k = 0; k < MAX_PRIO; k++)
 				INIT_LIST_HEAD(array->queue + k);
-				__clear_bit(k, array->bitmap);
-			}
-			// delimiter for bitsearch
-			__set_bit(MAX_PRIO, array->bitmap);
+			bitmap_zero(array->prio_bitmap, MAX_PRIO);
+			/* delimiter for bitsearch */
+			__set_bit(MAX_PRIO, array->prio_bitmap);
 		}
 		highest_cpu = i;
 	}
-
 	set_load_weight(&init_task);
 
 #ifdef CONFIG_SMP
@@ -7072,10 +7061,10 @@ EXPORT_SYMBOL(__might_sleep);
 #ifdef CONFIG_MAGIC_SYSRQ
 void normalize_rt_tasks(void)
 {
-	struct prio_array *array;
 	struct task_struct *p;
 	unsigned long flags;
 	struct rq *rq;
+	int queued;
 
 	read_lock_irq(&tasklist_lock);
 	for_each_process(p) {
@@ -7085,11 +7074,11 @@ void normalize_rt_tasks(void)
 		spin_lock_irqsave(&p->pi_lock, flags);
 		rq = __task_rq_lock(p);
 
-		array = p->array;
-		if (array)
+		queued = task_queued(p);
+		if (queued)
 			deactivate_task(p, task_rq(p));
 		__setscheduler(p, SCHED_NORMAL, 0);
-		if (array) {
+		if (queued) {
 			__activate_task(p, task_rq(p));
 			resched_task(rq->curr);
 		}
Index: linux-2.6.22-rc2-ck1/kernel/sysctl.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/kernel/sysctl.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/kernel/sysctl.c	2007-05-19 20:12:08.000000000 +1000
@@ -78,6 +78,8 @@ extern int percpu_pagelist_fraction;
 extern int compat_log;
 extern int maps_protect;
 extern int sysctl_stat_interval;
+extern int rr_interval;
+extern int sched_interactive;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -161,6 +163,14 @@ int sysctl_legacy_va_layout;
 #endif
 
 
+/* Constants for minimum and maximum testing.
+   We use these as one-element integer vectors. */
+static int __read_mostly zero;
+static int __read_mostly one = 1;
+static int __read_mostly one_hundred = 100;
+static int __read_mostly five_thousand = 5000;
+
+
 /* The default sysctl tables: */
 
 static ctl_table root_table[] = {
@@ -501,6 +511,25 @@ static ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &five_thousand,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
 	{
 		.ctl_name       = KERN_UNKNOWN_NMI_PANIC,
@@ -619,12 +648,6 @@ static ctl_table kern_table[] = {
 	{ .ctl_name = 0 }
 };
 
-/* Constants for minimum and maximum testing in vm_table.
-   We use these as one-element integer vectors. */
-static int zero;
-static int one_hundred = 100;
-
-
 static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
Index: linux-2.6.22-rc2-ck1/Documentation/sched-design.txt
===================================================================
--- linux-2.6.22-rc2-ck1.orig/Documentation/sched-design.txt	2006-11-30 11:30:31.000000000 +1100
+++ linux-2.6.22-rc2-ck1/Documentation/sched-design.txt	2007-05-19 20:11:57.000000000 +1000
@@ -1,11 +1,14 @@
-		   Goals, Design and Implementation of the
-		      new ultra-scalable O(1) scheduler
+ Goals, Design and Implementation of the ultra-scalable O(1) scheduler by
+ Ingo Molnar and the Staircase Deadline cpu scheduler policy designed by
+ Con Kolivas.
 
 
-  This is an edited version of an email Ingo Molnar sent to
-  lkml on 4 Jan 2002.  It describes the goals, design, and
-  implementation of Ingo's new ultra-scalable O(1) scheduler.
-  Last Updated: 18 April 2002.
+  This was originally an edited version of an email Ingo Molnar sent to
+  lkml on 4 Jan 2002.  It describes the goals, design, and implementation
+  of Ingo's ultra-scalable O(1) scheduler. It now contains a description
+  of the Staircase Deadline priority scheduler that was built on this
+  design.
+  Last Updated: Fri, 4 May 2007
 
 
 Goal
@@ -163,3 +166,222 @@ certain code paths and data constructs. 
 code is smaller than the old one.
 
 	Ingo
+
+
+Staircase Deadline cpu scheduler policy
+================================================
+
+Design summary
+==============
+
+A novel design which incorporates a foreground-background descending priority
+system (the staircase) via a bandwidth allocation matrix according to nice
+level.
+
+
+Features
+========
+
+A starvation free, strict fairness O(1) scalable design with interactivity
+as good as the above restrictions can provide. There is no interactivity
+estimator, no sleep/run measurements and only simple fixed accounting.
+The design and accounting are strict enough that task behaviour
+can be modelled and maximum scheduling latencies can be predicted by
+the virtual deadline mechanism that manages runqueues. The prime concern
+in this design is to maintain fairness at all costs determined by nice level,
+yet to maintain as good interactivity as can be allowed within the
+constraints of strict fairness.
+
+
+Design description
+==================
+
+SD works off the principle of providing each task a quota of runtime that it is
+allowed to run at a number of priority levels determined by its static priority
+(ie. its nice level). If the task uses up its quota it has its priority
+decremented to the next level determined by a priority matrix. Once every
+runtime quota has been consumed of every priority level, a task is queued on the
+"expired" array. When no other tasks exist with quota, the expired array is
+activated and fresh quotas are handed out. This is all done in O(1).
+
+Design details
+==============
+
+Each task keeps a record of its own entitlement of cpu time. Most of the rest of
+these details apply to non-realtime tasks as rt task management is
+straightforward.
+
+Each runqueue keeps a record of what major epoch it is up to in the
+rq->prio_rotation field which is incremented on each major epoch. It also
+keeps a record of the current prio_level for each static priority task.
+
+Each task keeps a record of what major runqueue epoch it was last running
+on in p->rotation. It also keeps a record of what priority levels it has
+already been allocated quota from during this epoch in a bitmap p->bitmap.
+
+The only tunable that determines all other details is the RR_INTERVAL. This
+is set to 8ms, and is scaled gently upwards with more cpus. This value is
+tunable via a /proc interface.
+
+All tasks are initially given a quota based on RR_INTERVAL. This is equal to
+RR_INTERVAL between nice values of -6 and 0, half that size above nice 0, and
+progressively larger for nice values from -7 to -20. This is assigned to
+p->quota and only changes with changes in nice level.
+
+As a task is first queued, it checks in recalc_task_prio to see if it has run at
+this runqueue's current priority rotation. If it has not, it will have its
+p->prio level set according to the first slot in a "priority matrix" and will be
+given a p->time_slice equal to the p->quota, and has its allocation bitmap bit
+set in p->bitmap for this prio level. It is then queued on the current active
+priority array.
+
+If a task has already been running during this major epoch, and it has
+p->time_slice left and the rq->prio_quota for the task's p->prio still
+has quota, it will be placed back on the active array, but no more quota
+will be added.
+
+If a task has been running during this major epoch, but does not have
+p->time_slice left, it will find the next lowest priority in its bitmap that it
+has not been allocated quota from. It then gets a full quota in
+p->time_slice. It is then queued on the current active priority array at the
+newly determined lower priority.
+
+If a task has been running during this major epoch, and does not have
+any entitlement left in p->bitmap and no time_slice left, it will have its
+bitmap cleared, and be queued at its best prio again, but on the expired
+priority array.
+
+When a task is queued, it has its relevant bit set in the array->prio_bitmap.
+
+p->time_slice is stored in nanoseconds and is updated via update_cpu_clock on
+schedule() and scheduler_tick. If p->time_slice is below zero then the task's
+priority is recalculated via recalc_task_prio and the task rescheduled.
+
+
+Priority Matrix
+===============
+
+In order to minimise the latencies between tasks of different nice levels
+running concurrently, the dynamic priority slots where different nice levels
+are queued are dithered instead of being sequential. What this means is that
+there are 40 priority slots where a task may run during one major rotation,
+and the allocation of slots is dependent on nice level. In the
+following table, a zero represents a slot where the task may run.
+
+PRIORITY:0..................20.................39
+nice -20 0000000000000000000000000000000000000000
+nice -10 1000100010001000100010001000100010010000
+nice   0 1010101010101010101010101010101010101010
+nice   5 1011010110110101101101011011010110110110
+nice  10 1110111011101110111011101110111011101110
+nice  15 1111111011111110111111101111111011111110
+nice  19 1111111111111111111111111111111111111110
+
+As can be seen, a nice -20 task runs in every priority slot whereas a nice 19
+task only runs one slot per major rotation. This dithered table allows for the
+smallest possible maximum latencies between tasks of varying nice levels, thus
+allowing vastly different nice levels to be used.
+
+SCHED_BATCH tasks are managed slightly differently, receiving only the top
+slots from its priority bitmap giving it equal cpu as SCHED_NORMAL, but
+slightly higher latencies.
+
+
+Modelling deadline behaviour
+============================
+
+As the accounting in this design is hard and not modified by sleep average
+calculations or interactivity modifiers, it is possible to accurately
+predict the maximum latency that a task may experience under different
+conditions. This is a virtual deadline mechanism enforced by mandatory
+timeslice expiration and not outside bandwidth measurement.
+
+The maximum duration a task can run during one major epoch is determined by its
+nice value. Nice 0 tasks can run at 19 different priority levels for RR_INTERVAL
+duration during each epoch. Nice 10 tasks can run at 9 priority levels for each
+epoch, and so on. The table in the priority matrix above demonstrates how this
+is enforced.
+
+Therefore the maximum duration a runqueue epoch can take is determined by
+the number of tasks running, and their nice level. After that, the maximum
+duration a task can wait before it gets scheduled is
+determined by the position of its first slot on the matrix.
+
+In the following examples, these are _worst case scenarios_ and would rarely
+occur, but can be modelled nonetheless to determine the maximum possible
+latency.
+
+So for example, if two nice 0 tasks are running, and one has just expired as
+another is activated for the first time receiving a full quota for this
+runqueue rotation, the first task will wait:
+
+nr_tasks * max_duration + nice_difference * rr_interval
+1 * 19 * RR_INTERVAL + 0 = 152ms
+
+In the presence of a nice 10 task, a nice 0 task would wait a maximum of
+1 * 10 * RR_INTERVAL + 0 = 80ms
+
+In the presence of a nice 0 task, a nice 10 task would wait a maximum of
+1 * 19 * RR_INTERVAL + 1 * RR_INTERVAL = 160ms
+
+More useful than these values, though, are the average latencies which are
+a matter of determining the average distance between priority slots of
+different nice values and multiplying them by the tasks' quota. For example
+in the presence of a nice -10 task, a nice 0 task will wait either one or
+two slots. Given that nice -10 tasks have a quota 2.5 times the RR_INTERVAL,
+this means the latencies will alternate between 2.5 and 5 RR_INTERVALs or
+20 and 40ms respectively (on uniprocessor at 1000HZ).
+
+
+Achieving interactivity
+=======================
+
+A requirement of this scheduler design was to achieve good interactivity
+despite being a completely fair deadline based design. The disadvantage of
+designs that try to achieve interactivity is that they usually do so at
+the expense of maintaining fairness. As cpu speeds increase, the requirement
+for some sort of metered unfairness towards interactive tasks becomes a less
+desirable phenomenon, but low latency and fairness remain mandatory to
+good interactive performance.
+
+This design relies on the fact that interactive tasks, by their nature,
+sleep often. Most fair scheduling designs end up penalising such tasks
+indirectly giving them less than their fair possible share because of the
+sleep, and have to use a mechanism of bonusing their priority to offset
+this based on the duration they sleep. This becomes increasingly inaccurate
+as the number of running tasks rises and more tasks spend time waiting on
+runqueues rather than sleeping, and it is impossible to tell whether the
+task that's waiting on a runqueue only intends to run for a short period and
+then sleep again after that runqueue wait. Furthermore, all such designs rely
+on a period of time to pass to accumulate some form of statistic on the task
+before deciding on how much to give them preference. The shorter this period,
+the more rapidly bursts of cpu ruin the interactive tasks' behaviour. The
+longer this period, the longer it takes for interactive tasks to get low
+scheduling latencies and fair cpu.
+
+This design does not measure sleep time at all. Interactive tasks that sleep
+often will wake up having consumed very little if any of their quota for
+the current major priority rotation. The longer they have slept, the less
+likely they are to even be on the current major priority rotation. Once
+woken up, though, they get to use up their full quota for that epoch,
+whether part of a quota remains or a full quota. Overall, however, they
+can still only run as much cpu time for that epoch as any other task of the
+same nice level. This means that two tasks behaving completely differently
+from fully cpu bound to waking/sleeping extremely frequently will still
+get the same quota of cpu, but the latter will be using its quota for that
+epoch in bursts rather than continuously. This guarantees that interactive
+tasks get the same amount of cpu as cpu bound ones.
+
+The other requirement of interactive tasks is also to obtain low latencies
+for when they are scheduled. Unlike fully cpu bound tasks and the maximum
+latencies possible described in the modelling deadline behaviour section
+above, tasks that sleep will wake up with quota available usually at the
+current runqueue's priority_level or better. This means that the most latency
+they are likely to see is one RR_INTERVAL, and often they will preempt the
+current task if it is not of a sleeping nature. This then guarantees very
+low latency for interactive tasks, and the lowest latencies for the least
+cpu bound tasks.
+
+
+Fri, 4 May 2007
+Con Kolivas <kernel@kolivas.org>
Index: linux-2.6.22-rc2-ck1/Documentation/sysctl/kernel.txt
===================================================================
--- linux-2.6.22-rc2-ck1.orig/Documentation/sysctl/kernel.txt	2007-05-19 20:10:46.000000000 +1000
+++ linux-2.6.22-rc2-ck1/Documentation/sysctl/kernel.txt	2007-05-19 20:12:08.000000000 +1000
@@ -25,6 +25,7 @@ show up in /proc/sys/kernel:
 - domainname
 - hostname
 - hotplug
+- interactive
 - java-appletviewer           [ binfmt_java, obsolete ]
 - java-interpreter            [ binfmt_java, obsolete ]
 - kstack_depth_to_print       [ X86 only ]
@@ -43,6 +44,7 @@ show up in /proc/sys/kernel:
 - printk
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -164,6 +166,21 @@ Default value is "/sbin/hotplug".
 
 ==============================================================
 
+interactive:
+
+The staircase-deadline cpu scheduler can be set in either purely
+forward-looking mode for absolutely rigid fairness and cpu distribution
+according to nice level, or it can allow a small per-process history
+to smooth out cpu usage perturbations common in interactive tasks by
+enabling this sysctl. While small fairness issues can arise with this
+enabled, overall fairness is usually still strongly maintained and
+starvation is never possible. Enabling this can significantly smooth
+out 3d graphics and games.
+
+Default value is 1 (enabled).
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
@@ -288,6 +305,19 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval:
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. This value is in milliseconds and the default value chosen
+depends on the number of cpus available at scheduler initialisation
+with a minimum of 8.
+
+Valid values are from 1-5000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
Index: linux-2.6.22-rc2-ck1/fs/pipe.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/fs/pipe.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/fs/pipe.c	2007-05-19 20:11:57.000000000 +1000
@@ -41,12 +41,7 @@ void pipe_wait(struct pipe_inode_info *p
 {
 	DEFINE_WAIT(wait);
 
-	/*
-	 * Pipes are system-local resources, so sleeping on them
-	 * is considered a noninteractive wait:
-	 */
-	prepare_to_wait(&pipe->wait, &wait,
-			TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE);
+	prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
 	if (pipe->inode)
 		mutex_unlock(&pipe->inode->i_mutex);
 	schedule();
Index: linux-2.6.22-rc2-ck1/fs/proc/array.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/fs/proc/array.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/fs/proc/array.c	2007-05-19 20:11:57.000000000 +1000
@@ -165,7 +165,6 @@ static inline char * task_state(struct t
 	rcu_read_lock();
 	buffer += sprintf(buffer,
 		"State:\t%s\n"
-		"SleepAVG:\t%lu%%\n"
 		"Tgid:\t%d\n"
 		"Pid:\t%d\n"
 		"PPid:\t%d\n"
@@ -173,7 +172,6 @@ static inline char * task_state(struct t
 		"Uid:\t%d\t%d\t%d\t%d\n"
 		"Gid:\t%d\t%d\t%d\t%d\n",
 		get_task_state(p),
-		(p->sleep_avg/1024)*100/(1020000000/1024),
 	       	p->tgid, p->pid,
 	       	pid_alive(p) ? rcu_dereference(p->real_parent)->tgid : 0,
 		pid_alive(p) && p->ptrace ? rcu_dereference(p->parent)->pid : 0,
Index: linux-2.6.22-rc2-ck1/include/linux/init_task.h
===================================================================
--- linux-2.6.22-rc2-ck1.orig/include/linux/init_task.h	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/include/linux/init_task.h	2007-05-19 20:11:57.000000000 +1000
@@ -125,13 +125,15 @@ extern struct group_info init_groups;
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
 	.normal_prio	= MAX_PRIO-20,					\
+	.rotation	= 0,						\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
 	.ioprio		= 0,						\
-	.time_slice	= HZ,						\
+	.time_slice	= 1000000000,						\
+	.quota		= 1000000000,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
Index: linux-2.6.22-rc2-ck1/kernel/softirq.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/kernel/softirq.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/kernel/softirq.c	2007-05-19 20:11:57.000000000 +1000
@@ -488,7 +488,7 @@ void __init softirq_init(void)
 
 static int ksoftirqd(void * __bind_cpu)
 {
-	set_user_nice(current, 19);
+	set_user_nice(current, 15);
 	current->flags |= PF_NOFREEZE;
 
 	set_current_state(TASK_INTERRUPTIBLE);
Index: linux-2.6.22-rc2-ck1/kernel/workqueue.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/kernel/workqueue.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/kernel/workqueue.c	2007-05-19 20:11:57.000000000 +1000
@@ -298,8 +298,6 @@ static int worker_thread(void *__cwq)
 	if (!cwq->wq->freezeable)
 		current->flags |= PF_NOFREEZE;
 
-	set_user_nice(current, -5);
-
 	for (;;) {
 		prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE);
 		if (!freezing(current) && !cwq->should_stop
Index: linux-2.6.22-rc2-ck1/kernel/kthread.c
===================================================================
--- linux-2.6.22-rc2-ck1.orig/kernel/kthread.c	2007-05-19 20:10:51.000000000 +1000
+++ linux-2.6.22-rc2-ck1/kernel/kthread.c	2007-05-19 20:11:57.000000000 +1000
@@ -220,7 +220,6 @@ static __init void kthreadd_setup(void)
 
 	ignore_signals(tsk);
 
-	set_user_nice(tsk, -5);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 }