commit d1b37dc608d76d8c2480d506a0034a3d840ea6d2
Merge: 11bd513... 97c2363...
Author: Arnd Bergmann
Date:   Sat Jun 23 11:29:47 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit 11bd5132c185f9ede02111c1ab6a6cda97f0347a
Merge: 29926f8... 9c7e907...
Author: Arnd Bergmann
Date:   Fri Jun 22 15:39:22 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit 29926f83fe5694afb7ce19e48815dd24ec181c65
Merge: ca223a2... cd25352...
Author: Arnd Bergmann
Date:   Thu Jun 21 02:32:31 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit ca223a26a0a057ff27b64a4bdd549a128b49f1b6
Merge: 2895de4... d59a251...
Author: Arnd Bergmann
Date:   Thu Jun 21 01:52:18 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit 2895de4bd69bbb91ffacc6323f6a3285c2f604eb
Merge: 5b6ee6d... 0dcc8e8...
Author: Arnd Bergmann
Date:   Wed Jun 20 17:11:46 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit 5b6ee6da98e114e3daea030bdc1cbca4ce805e0d
Merge: d4301e4... 3cec3a4...
Author: Arnd Bergmann
Date:   Tue Jun 19 18:58:06 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit d4301e48317e542cdfa415b080e3c839bcdb0e8f
Merge: 6b87045... d5b59a3...
Author: Arnd Bergmann
Date:   Tue Jun 19 15:57:35 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit d5b59a39557007af92cbcb0c53e310deef16a1fe
Author: Jeremy Kerr
Date:   Tue Jun 19 10:43:01 2007 +1000

    Revert "spufs: allow coredumps to include nosched contexts"

    This reverts commit 3ea083b358542fa8e227623b2cb4179b7a72e7a4.

    This patch conflicts with 'spusched: catch nosched contexts in
    spu_deactivate', as we need to deactivate contexts before they can
    be coredumped.

    Signed-off-by: Jeremy Kerr

commit 6b8704506298728124341c697b5dab4155686dee
Merge: a6187e8... 0a63d3c...
Author: Arnd Bergmann
Date:   Tue Jun 19 02:15:13 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit a6187e86cbd1c33a1baf102ce6f2afca871af592
Merge: d082b2a... 19b4640...
Author: Arnd Bergmann
Date:   Mon Jun 18 18:03:33 2007 +0200

    Merge branches 'powerpc+origin' and 'spufs' into spufs-merge

commit 19b464022b8d0513efb3a0eb407b5c40d1bdc0a4
Author: Jeremy Kerr
Date:   Mon Jun 18 18:26:59 2007 +1000

    spufs: Remove spufs_dir_inode_operations

    spufs_dir_inode_operations is exactly the same as
    simple_dir_inode_operations. Use that instead.

    Signed-off-by: Jeremy Kerr
    Signed-off-by: Arnd Bergmann

commit b3e2bd6a1afd6352e6bde841e784c7b8b00d3151
Author: Christoph Hellwig
Date:   Mon Jun 18 18:26:47 2007 +1000

    spusched: no preemption for nosched contexts

    And last but not least we need to make sure the scheduler tick never
    preempts a nosched context.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr

commit 313ef343e53d304391337b44bc1c62ea71f3b478
Author: Christoph Hellwig
Date:   Mon Jun 18 18:26:33 2007 +1000

    spusched: catch nosched contexts in spu_deactivate

    spu_deactivate should never be called for nosched contexts. Put in a
    check so we can print a stacktrace and exit early in case it happens
    erroneously.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr

commit 54b51b6689ddefa8ea0ec4b3848587bb57a9cd71
Author: Christoph Hellwig
Date:   Mon Jun 18 18:26:17 2007 +1000

    spusched: fix cpu/node binding

    Add a cpus_allowed field to struct spu_context so that we always use
    the cpu mask of the owning thread instead of the one happening to
    call into the scheduler.  Also use this information in
    grab_runnable_context to avoid spurious wakeups.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr
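As a rough illustration of the cpu/node binding fix (not part of the patch): once the owning thread's cpumask has been cached in the context, an affinity check can be made against that saved copy rather than against whatever task happens to be calling into the scheduler. The helper name below is made up; the patch itself implements this as __node_allowed()/node_allowed() in the sched.c hunks further down, using the same cpumask helpers.

        /* Sketch only: check a context's saved cpumask against one NUMA node. */
        static int ctx_may_run_on_node(struct spu_context *ctx, int node)
        {
                cpumask_t node_cpus;

                if (!nr_cpus_node(node))        /* node has no online CPUs */
                        return 0;

                node_cpus = node_to_cpumask(node);
                return cpus_intersects(node_cpus, ctx->cpus_allowed);
        }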
commit 60cfd45c3fdcb81f1211967da0c92e91089a0df7
Author: Christoph Hellwig
Date:   Mon Jun 18 18:26:06 2007 +1000

    spusched: update scheduling parameters on every spu_run

    Update scheduling information on every spu_run to allow for setting
    threads to realtime priority just before running them.

    This requires some slightly ugly code in spufs_run_spu because we can
    just update the information unlocked if the spu is not runnable, but
    we need to acquire the active_mutex when it is runnable to protect
    against find_victim.  This locking scheme requires open-coding
    spu_acquire_runnable in spufs_run_spu, which actually is a nice
    cleanup all by itself.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr

commit a5c1a2eb6275d6b109182732b7fea2bbd34e1f47
Author: Jeremy Kerr
Date:   Mon Jun 18 18:25:55 2007 +1000

    spusched: Print out scheduling tunables with DEBUG

    Print out a few scheduler tuning parameters when we've compiled with
    DEBUG defined.

    Signed-off-by: Jeremy Kerr

commit 6b161ecc6451d9765c3dd2eb07b6169e13f6a296
Author: Jeremy Kerr
Date:   Mon Jun 18 18:25:44 2007 +1000

    spusched: Fix timeslice calculations

    The current timeslice code mixes 'jiffies' up with 'spesched ticks'.
    This change correctly defines the number of time slices each SPE
    context is given, and clarifies the comment.

    This brings the default timeslice for SPE contexts into a reasonable
    range.

    Signed-off-by: Jeremy Kerr

commit 7bed139b5c6079e46e32ec872b25f11488b012e3
Author: Christoph Hellwig
Date:   Mon Jun 18 18:25:28 2007 +1000

    spusched: dynamic timeslicing for SCHED_OTHER

    Enable preemptive scheduling for non-RT contexts.

    We use the same algorithms as the CPU scheduler to calculate the time
    slice length, and for now we also use the same timeslice length as
    the CPU scheduler.  This might not be enough for good performance and
    can be changed after some benchmarking.

    Note that currently we do not boost the priority for contexts waiting
    on the runqueue for a long time, so contexts with a lower nice value
    could starve ones with a higher nice value.  This could easily be
    fixed once the rework of the spu lists that Luke and I discussed is
    done.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr

commit 6deac36fa8f38abd8e7339678b62906dcf31a99c
Author: Christoph Hellwig
Date:   Mon Jun 18 18:24:22 2007 +1000

    spusched: switch from workqueues to kthread + timer tick

    Get rid of the scheduler workqueues, which complicated things a lot,
    in favour of a dedicated spu scheduler thread that gets woken by a
    traditional scheduler tick.  By default this scheduler tick runs at
    HZ / 10, i.e. one spu scheduler tick for every 10 cpu ticks.

    Currently the tick is not disabled when we have fewer contexts than
    available spus, but I will implement this later.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Arnd Bergmann
    Signed-off-by: Jeremy Kerr

commit 0e82f6418a99d98db72638a30344ea54de247540
Author: Sebastian Siewior
Date:   Wed Jun 13 01:14:56 2007 +1000

    spufs: Add bit definition

    Add a bit definition from the book, and replace one hex number with a
    symbol, for clarity.

    Signed-off-by: Sebastian Siewior
    Signed-off-by: Jeremy Kerr

commit 540b92e4e330fd3017cf9d5efe8eff3494b2588f
Author: Sebastian Siewior
Date:   Fri Jun 8 20:04:13 2007 +1000

    spufs: fix building spufs/spu_save_dump.h

    Currently it fails with gcc from sdk 2.1 because of a spec change [1].
    Maybe we should start using the definitions from spu_mfcio.h.

    [1] http://gcc.gnu.org/ml/gcc-patches/2006-11/msg01598.html

    Signed-off-by: Sebastian Siewior
    Signed-off-by: Jeremy Kerr
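To make the timeslice scaling described above concrete, here is a small stand-alone program (an illustration only, not kernel code) that evaluates the same SCALE_PRIO()/MIN_SPU_TIMESLICE/DEF_SPU_TIMESLICE formulas that the sched.c hunk below introduces. HZ=1000 is assumed here purely for round numbers; one spu scheduler tick lasts SPUSCHED_TICK cpu ticks.

#include <stdio.h>

#define HZ                      1000    /* assumed for this example */
#define MAX_PRIO                140
#define MAX_RT_PRIO             100
#define NORMAL_PRIO             120
#define SPUSCHED_TICK           10      /* cpu ticks per spu scheduler tick */

#define max(a, b)               ((a) > (b) ? (a) : (b))
#define MIN_SPU_TIMESLICE       max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
#define DEF_SPU_TIMESLICE       (100 * HZ / (1000 * SPUSCHED_TICK))
#define MAX_USER_PRIO           (MAX_PRIO - MAX_RT_PRIO)
#define SCALE_PRIO(x, prio) \
        max((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)

int main(void)
{
        static const int nice_levels[] = { -20, -10, 0, 10, 19 };
        unsigned int i;

        for (i = 0; i < sizeof(nice_levels) / sizeof(nice_levels[0]); i++) {
                /* static_prio of a SCHED_OTHER thread is 120 + nice */
                int prio = NORMAL_PRIO + nice_levels[i];
                int slice = (prio < NORMAL_PRIO)
                        ? SCALE_PRIO(DEF_SPU_TIMESLICE * 4, prio)
                        : SCALE_PRIO(DEF_SPU_TIMESLICE, prio);

                printf("nice %3d -> %2d spu ticks (%d ms)\n", nice_levels[i],
                       slice, slice * SPUSCHED_TICK * 1000 / HZ);
        }
        return 0;
}

With these assumptions the slices run from 800 ms at nice -20 through 100 ms at nice 0 down to the one-tick minimum for the most niced contexts, matching the range quoted in the 'tuning knobs' comment in the patch.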
commit 3ea083b358542fa8e227623b2cb4179b7a72e7a4
Author: Christoph Hellwig
Date:   Mon Jun 4 20:09:26 2007 +1000

    spufs: allow coredumps to include nosched contexts

    We only need to prevent coredumps of isolated contexts, no need to
    exclude purely nosched contexts.

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Jeremy Kerr

Index: linux-2.6/arch/powerpc/platforms/cell/spufs/context.c
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/context.c
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/context.c
@@ -53,10 +53,8 @@ struct spu_context *alloc_spu_context(st
 	INIT_LIST_HEAD(&ctx->rq);
 	if (gang)
 		spu_gang_add_ctx(gang, ctx);
-	ctx->rt_priority = current->rt_priority;
-	ctx->policy = current->policy;
-	ctx->prio = current->prio;
-	INIT_DELAYED_WORK(&ctx->sched_work, spu_sched_tick);
+	ctx->cpus_allowed = current->cpus_allowed;
+	spu_set_timeslice(ctx);
 	goto out;
 out_free:
 	kfree(ctx);
Index: linux-2.6/arch/powerpc/platforms/cell/spufs/inode.c
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/inode.c
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/inode.c
@@ -232,10 +232,6 @@ static int spufs_dir_close(struct inode
 	return dcache_dir_close(inode, file);
 }
 
-const struct inode_operations spufs_dir_inode_operations = {
-	.lookup = simple_lookup,
-};
-
 const struct file_operations spufs_context_fops = {
 	.open = dcache_dir_open,
 	.release = spufs_dir_close,
@@ -269,7 +265,7 @@ spufs_mkdir(struct inode *dir, struct de
 		goto out_iput;
 
 	ctx->flags = flags;
-	inode->i_op = &spufs_dir_inode_operations;
+	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	if (flags & SPU_CREATE_NOSCHED)
 		ret = spufs_fill_dir(dentry, spufs_dir_nosched_contents,
@@ -386,7 +382,7 @@ spufs_mkgang(struct inode *dir, struct d
 	if (!gang)
 		goto out_iput;
 
-	inode->i_op = &spufs_dir_inode_operations;
+	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 
 	d_instantiate(dentry, inode);
@@ -593,7 +589,7 @@ spufs_create_root(struct super_block *sb
 	if (!inode)
 		goto out;
 
-	inode->i_op = &spufs_dir_inode_operations;
+	inode->i_op = &simple_dir_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	SPUFS_I(inode)->i_ctx = NULL;
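For reference, the inode.c hunks above are a pure substitution: as the "Remove spufs_dir_inode_operations" commit message says, the operations table being removed duplicates what fs/libfs.c already exports at this point in the tree, so every user can simply point at the generic one.

        /* removed from spufs ... */
        const struct inode_operations spufs_dir_inode_operations = {
                .lookup = simple_lookup,
        };

        /* ... is identical to what fs/libfs.c already provides */
        const struct inode_operations simple_dir_inode_operations = {
                .lookup = simple_lookup,
        };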
Index: linux-2.6/arch/powerpc/platforms/cell/spufs/run.c
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/run.c
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/run.c
@@ -29,7 +29,8 @@ static inline int spu_stopped(struct spu
 	spu = ctx->spu;
 	pte_fault = spu->dsisr &
 	    (MFC_DSISR_PTE_NOT_FOUND | MFC_DSISR_ACCESS_DENIED);
-	return (!(*stat & 0x1) || pte_fault || spu->class_0_pending) ? 1 : 0;
+	return (!(*stat & SPU_STATUS_RUNNING) || pte_fault || spu->class_0_pending) ?
+		1 : 0;
 }
 
 static int spu_setup_isolated(struct spu_context *ctx)
@@ -143,7 +144,6 @@ static int spu_run_init(struct spu_conte
 		ctx->ops->runcntl_write(ctx, runcntl);
 	} else {
 		unsigned long mode = SPU_PRIVCNTL_MODE_NORMAL;
-		spu_start_tick(ctx);
 		ctx->ops->npc_write(ctx, *npc);
 		if (test_thread_flag(TIF_SINGLESTEP))
 			mode = SPU_PRIVCNTL_MODE_SINGLE_STEP;
@@ -159,7 +159,6 @@ static int spu_run_fini(struct spu_conte
 {
 	int ret = 0;
 
-	spu_stop_tick(ctx);
 	*status = ctx->ops->status_read(ctx);
 	*npc = ctx->ops->npc_read(ctx);
 	spu_release(ctx);
@@ -302,9 +301,22 @@ long spufs_run_spu(struct file *file, st
 
 	ctx->ops->master_start(ctx);
 	ctx->event_return = 0;
-	ret = spu_acquire_runnable(ctx, 0);
-	if (ret)
-		return ret;
+	spu_acquire(ctx);
+	if (ctx->state == SPU_STATE_SAVED) {
+		__spu_update_sched_info(ctx);
+
+		ret = spu_activate(ctx, 0);
+		if (ret) {
+			spu_release(ctx);
+			goto out;
+		}
+	} else {
+		/*
+		 * We have to update the scheduling priority under active_mutex
+		 * to protect against find_victim().
+		 */
+		spu_update_sched_info(ctx);
+	}
 
 	ret = spu_run_init(ctx, npc);
 	if (ret) {
@@ -329,10 +341,8 @@ long spufs_run_spu(struct file *file, st
 
 		if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) {
 			ret = spu_reacquire_runnable(ctx, npc, &status);
-			if (ret) {
-				spu_stop_tick(ctx);
+			if (ret)
 				goto out2;
-			}
 			continue;
 		}
 		ret = spu_process_events(ctx);
Index: linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/sched.c
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c
@@ -35,6 +35,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -43,8 +44,6 @@
 #include
 #include "spufs.h"
 
-#define SPU_TIMESLICE	(HZ)
-
 struct spu_prio_array {
 	DECLARE_BITMAP(bitmap, MAX_PRIO);
 	struct list_head runq[MAX_PRIO];
@@ -54,43 +53,107 @@ struct spu_prio_array {
 };
 
 static struct spu_prio_array *spu_prio;
-static struct workqueue_struct *spu_sched_wq;
+static struct task_struct *spusched_task;
+static struct timer_list spusched_timer;
+
+/*
+ * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
+ */
+#define NORMAL_PRIO		120
+
+/*
+ * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
+ * tick for every 10 CPU scheduler ticks.
+ */
+#define SPUSCHED_TICK		(10)
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * Minimum timeslice is 5 msecs (or 1 spu scheduler tick, whichever is
+ * larger), default timeslice is 100 msecs, maximum timeslice is 800 msecs.
+ */
+#define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
+#define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))
 
-static inline int node_allowed(int node)
+#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
+#define SCALE_PRIO(x, prio) \
+	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+
+/*
+ * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
+ * [800ms ... 100ms ... 5ms]
+ *
+ * The higher a thread's priority, the bigger timeslices
+ * it gets during one round of execution. But even the lowest
+ * priority thread gets MIN_TIMESLICE worth of execution time.
+ */
+void spu_set_timeslice(struct spu_context *ctx)
 {
-	cpumask_t mask;
+	if (ctx->prio < NORMAL_PRIO)
+		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE * 4, ctx->prio);
+	else
+		ctx->time_slice = SCALE_PRIO(DEF_SPU_TIMESLICE, ctx->prio);
+}
 
-	if (!nr_cpus_node(node))
-		return 0;
-	mask = node_to_cpumask(node);
-	if (!cpus_intersects(mask, current->cpus_allowed))
-		return 0;
-	return 1;
+/*
+ * Update scheduling information from the owning thread.
+ */
+void __spu_update_sched_info(struct spu_context *ctx)
+{
+	/*
+	 * We do our own priority calculations, so we normally want
+	 * ->static_prio to start with. Unfortunately this field
+	 * contains junk for threads with a realtime scheduling
+	 * policy so we have to look at ->prio in this case.
+	 */
+	if (rt_prio(current->prio))
+		ctx->prio = current->prio;
+	else
+		ctx->prio = current->static_prio;
+	ctx->policy = current->policy;
+
+	/*
+	 * A lot of places that don't hold active_mutex poke into
+	 * cpus_allowed, including grab_runnable_context which
+	 * already holds the runq_lock.  So abuse runq_lock
+	 * to protect this field as well.
+	 */
+	spin_lock(&spu_prio->runq_lock);
+	ctx->cpus_allowed = current->cpus_allowed;
+	spin_unlock(&spu_prio->runq_lock);
 }
 
-void spu_start_tick(struct spu_context *ctx)
+void spu_update_sched_info(struct spu_context *ctx)
 {
-	if (ctx->policy == SCHED_RR) {
-		/*
-		 * Make sure the exiting bit is cleared.
-		 */
-		clear_bit(SPU_SCHED_EXITING, &ctx->sched_flags);
-		mb();
-		queue_delayed_work(spu_sched_wq, &ctx->sched_work, SPU_TIMESLICE);
-	}
+	int node = ctx->spu->node;
+
+	mutex_lock(&spu_prio->active_mutex[node]);
+	__spu_update_sched_info(ctx);
+	mutex_unlock(&spu_prio->active_mutex[node]);
 }
 
-void spu_stop_tick(struct spu_context *ctx)
+static int __node_allowed(struct spu_context *ctx, int node)
 {
-	if (ctx->policy == SCHED_RR) {
-		/*
-		 * While the work can be rearming normally setting this flag
-		 * makes sure it does not rearm itself anymore.
-		 */
-		set_bit(SPU_SCHED_EXITING, &ctx->sched_flags);
-		mb();
-		cancel_delayed_work(&ctx->sched_work);
+	if (nr_cpus_node(node)) {
+		cpumask_t mask = node_to_cpumask(node);
+
+		if (cpus_intersects(mask, ctx->cpus_allowed))
+			return 1;
 	}
+
+	return 0;
+}
+
+static int node_allowed(struct spu_context *ctx, int node)
+{
+	int rval;
+
+	spin_lock(&spu_prio->runq_lock);
+	rval = __node_allowed(ctx, node);
+	spin_unlock(&spu_prio->runq_lock);
+
+	return rval;
 }
 
 /**
@@ -104,6 +167,11 @@ static void spu_add_to_active_list(struc
 	mutex_unlock(&spu_prio->active_mutex[spu->node]);
 }
 
+static void __spu_remove_from_active_list(struct spu *spu)
+{
+	list_del_init(&spu->list);
+}
+
 /**
  * spu_remove_from_active_list - remove spu from active list
  * @spu: spu to remove from the active list
@@ -113,7 +181,7 @@ static void spu_remove_from_active_list(
 	int node = spu->node;
 
 	mutex_lock(&spu_prio->active_mutex[node]);
-	list_del_init(&spu->list);
+	__spu_remove_from_active_list(spu);
 	mutex_unlock(&spu_prio->active_mutex[node]);
 }
 
@@ -161,7 +229,6 @@ static void spu_bind_context(struct spu
 	spu->timestamp = jiffies;
 	spu_cpu_affinity_set(spu, raw_smp_processor_id());
 	spu_switch_notify(spu, ctx);
-	spu_add_to_active_list(spu);
 	ctx->state = SPU_STATE_RUNNABLE;
 }
 
@@ -175,7 +242,6 @@ static void spu_unbind_context(struct sp
 	pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__,
 		 spu->pid, spu->number, spu->node);
-	spu_remove_from_active_list(spu);
 	spu_switch_notify(spu, NULL);
 	spu_unmap_mappings(ctx);
 	spu_save(&ctx->csa, spu);
@@ -244,7 +310,7 @@ static struct spu *spu_get_idle(struct s
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
-		if (!node_allowed(node))
+		if (!node_allowed(ctx, node))
 			continue;
 		spu = spu_alloc_node(node);
 		if (spu)
@@ -276,15 +342,15 @@ static struct spu *find_victim(struct sp
 	node = cpu_to_node(raw_smp_processor_id());
 	for (n = 0; n < MAX_NUMNODES; n++, node++) {
 		node = (node < MAX_NUMNODES) ? node : 0;
-		if (!node_allowed(node))
+		if (!node_allowed(ctx, node))
 			continue;
 
 		mutex_lock(&spu_prio->active_mutex[node]);
 		list_for_each_entry(spu, &spu_prio->active_list[node], list) {
 			struct spu_context *tmp = spu->ctx;
 
-			if (tmp->rt_priority < ctx->rt_priority &&
-			    (!victim || tmp->rt_priority < victim->rt_priority))
+			if (tmp->prio > ctx->prio &&
+			    (!victim || tmp->prio > victim->prio))
 				victim = spu->ctx;
 		}
 		mutex_unlock(&spu_prio->active_mutex[node]);
@@ -312,6 +378,7 @@ static struct spu *find_victim(struct sp
 			victim = NULL;
 			goto restart;
 		}
+		spu_remove_from_active_list(spu);
 		spu_unbind_context(spu, victim);
 		mutex_unlock(&victim->state_mutex);
 		/*
@@ -350,10 +417,11 @@ int spu_activate(struct spu_context *ctx
 		 * If this is a realtime thread we try to get it running by
 		 * preempting a lower priority thread.
 		 */
-		if (!spu && ctx->rt_priority)
+		if (!spu && rt_prio(ctx->prio))
 			spu = find_victim(ctx);
 		if (spu) {
 			spu_bind_context(spu, ctx);
+			spu_add_to_active_list(spu);
 			return 0;
 		}
 
@@ -369,23 +437,28 @@ int spu_activate(struct spu_context *ctx
  * Remove the highest priority context on the runqueue and return it
  * to the caller. Returns %NULL if no runnable context was found.
  */
-static struct spu_context *grab_runnable_context(int prio)
+static struct spu_context *grab_runnable_context(int prio, int node)
 {
-	struct spu_context *ctx = NULL;
+	struct spu_context *ctx;
 	int best;
 
 	spin_lock(&spu_prio->runq_lock);
 	best = sched_find_first_bit(spu_prio->bitmap);
-	if (best < prio) {
+	while (best < prio) {
 		struct list_head *rq = &spu_prio->runq[best];
 
-		BUG_ON(list_empty(rq));
-
-		ctx = list_entry(rq->next, struct spu_context, rq);
-		__spu_del_from_rq(ctx);
+		list_for_each_entry(ctx, rq, rq) {
+			/* XXX(hch): check for affinity here as well */
+			if (__node_allowed(ctx, node)) {
+				__spu_del_from_rq(ctx);
+				goto found;
+			}
+		}
+		best++;
 	}
+	ctx = NULL;
+ found:
 	spin_unlock(&spu_prio->runq_lock);
-
 	return ctx;
 }
 
@@ -395,8 +468,9 @@ static int __spu_deactivate(struct spu_c
 	struct spu_context *new = NULL;
 
 	if (spu) {
-		new = grab_runnable_context(max_prio);
+		new = grab_runnable_context(max_prio, spu->node);
 		if (new || force) {
+			spu_remove_from_active_list(spu);
 			spu_unbind_context(spu, ctx);
 			spu_free(spu);
 			if (new)
@@ -417,6 +491,15 @@ static int __spu_deactivate(struct spu_c
  */
 void spu_deactivate(struct spu_context *ctx)
 {
+	/*
+	 * We must never reach this for a nosched context,
+	 * but handle the case gracefully instead of panicking.
+	 */
+	if (ctx->flags & SPU_CREATE_NOSCHED) {
+		WARN_ON(1);
+		return;
+	}
+
 	__spu_deactivate(ctx, 1, MAX_PRIO);
 }
 
@@ -437,51 +520,85 @@ void spu_yield(struct spu_context *ctx)
 	}
 }
 
-void spu_sched_tick(struct work_struct *work)
+static void spusched_tick(struct spu_context *ctx)
 {
-	struct spu_context *ctx =
-		container_of(work, struct spu_context, sched_work.work);
-	int preempted;
+	if (ctx->flags & SPU_CREATE_NOSCHED)
+		return;
+	if (ctx->policy == SCHED_FIFO)
+		return;
+
+	if (--ctx->time_slice)
+		return;
 
 	/*
-	 * If this context is being stopped avoid rescheduling from the
-	 * scheduler tick because we would block on the state_mutex.
-	 * The caller will yield the spu later on anyway.
+	 * Unfortunately active_mutex ranks outside of state_mutex, so
+	 * we have to trylock here.  If we fail give the context another
+	 * tick and try again.
 	 */
-	if (test_bit(SPU_SCHED_EXITING, &ctx->sched_flags))
-		return;
+	if (mutex_trylock(&ctx->state_mutex)) {
+		struct spu *spu = ctx->spu;
+		struct spu_context *new;
 
-	mutex_lock(&ctx->state_mutex);
-	preempted = __spu_deactivate(ctx, 0, ctx->prio + 1);
-	mutex_unlock(&ctx->state_mutex);
+		new = grab_runnable_context(ctx->prio + 1, spu->node);
+		if (new) {
 
-	if (preempted) {
-		/*
-		 * We need to break out of the wait loop in spu_run manually
-		 * to ensure this context gets put on the runqueue again
-		 * ASAP.
-		 */
-		wake_up(&ctx->stop_wq);
+			__spu_remove_from_active_list(spu);
+			spu_unbind_context(spu, ctx);
+			spu_free(spu);
+			wake_up(&new->stop_wq);
+			/*
+			 * We need to break out of the wait loop in
+			 * spu_run manually to ensure this context
+			 * gets put on the runqueue again ASAP.
+			 */
+			wake_up(&ctx->stop_wq);
+		}
+		spu_set_timeslice(ctx);
+		mutex_unlock(&ctx->state_mutex);
 	} else {
-		spu_start_tick(ctx);
+		ctx->time_slice++;
 	}
 }
 
+static void spusched_wake(unsigned long data)
+{
+	mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
+	wake_up_process(spusched_task);
+}
+
+static int spusched_thread(void *unused)
+{
+	struct spu *spu, *next;
+	int node;
+
+	setup_timer(&spusched_timer, spusched_wake, 0);
+	__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+		for (node = 0; node < MAX_NUMNODES; node++) {
+			mutex_lock(&spu_prio->active_mutex[node]);
+			list_for_each_entry_safe(spu, next,
+						 &spu_prio->active_list[node],
						 list)
+				spusched_tick(spu->ctx);
+			mutex_unlock(&spu_prio->active_mutex[node]);
+		}
+	}
+
+	del_timer_sync(&spusched_timer);
+	return 0;
+}
+
 int __init spu_sched_init(void)
 {
 	int i;
 
-	spu_sched_wq = create_singlethread_workqueue("spusched");
-	if (!spu_sched_wq)
-		return 1;
-
 	spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL);
-	if (!spu_prio) {
-		printk(KERN_WARNING "%s: Unable to allocate priority queue.\n",
-		       __FUNCTION__);
-		destroy_workqueue(spu_sched_wq);
-		return 1;
-	}
+	if (!spu_prio)
+		return -ENOMEM;
+
 	for (i = 0; i < MAX_PRIO; i++) {
 		INIT_LIST_HEAD(&spu_prio->runq[i]);
 		__clear_bit(i, spu_prio->bitmap);
@@ -492,7 +609,17 @@ int __init spu_sched_init(void)
 		INIT_LIST_HEAD(&spu_prio->active_list[i]);
 	}
 	spin_lock_init(&spu_prio->runq_lock);
+
+	spusched_task = kthread_run(spusched_thread, NULL, "spusched");
+	if (IS_ERR(spusched_task)) {
+		kfree(spu_prio);
+		return PTR_ERR(spusched_task);
+	}
+
+	pr_debug("spusched: tick: %d, min ticks: %d, default ticks: %d\n",
+			SPUSCHED_TICK, MIN_SPU_TIMESLICE, DEF_SPU_TIMESLICE);
 	return 0;
+
 }
 
 void __exit spu_sched_exit(void)
@@ -500,6 +627,8 @@ void __exit spu_sched_exit(void)
 	struct spu *spu, *tmp;
 	int node;
 
+	kthread_stop(spusched_task);
+
 	for (node = 0; node < MAX_NUMNODES; node++) {
 		mutex_lock(&spu_prio->active_mutex[node]);
 		list_for_each_entry_safe(spu, tmp, &spu_prio->active_list[node],
@@ -510,5 +639,4 @@ void __exit spu_sched_exit(void)
 		mutex_unlock(&spu_prio->active_mutex[node]);
 	}
 	kfree(spu_prio);
-	destroy_workqueue(spu_sched_wq);
 }
Index: linux-2.6/arch/powerpc/platforms/cell/spufs/spu_save.c
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/spu_save.c
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/spu_save.c
@@ -44,7 +44,7 @@ static inline void save_event_mask(void)
 	 * Read the SPU_RdEventMsk channel and save to the LSCSA.
 	 */
 	offset = LSCSA_QW_OFFSET(event_mask);
-	regs_spill[offset].slot[0] = spu_readch(SPU_RdEventStatMask);
+	regs_spill[offset].slot[0] = spu_readch(SPU_RdEventMask);
 }
 
 static inline void save_tag_mask(void)
Index: linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h
===================================================================
--- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/spufs.h
+++ linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -39,11 +40,6 @@ enum {
 struct spu_context_ops;
 struct spu_gang;
 
-/* ctx->sched_flags */
-enum {
-	SPU_SCHED_EXITING = 0,
-};
-
 struct spu_context {
 	struct spu *spu;		  /* pointer to a physical SPU */
 	struct spu_state csa;		  /* SPU context save area. */
@@ -83,9 +79,9 @@ struct spu_context {
 
 	/* scheduler fields */
 	struct list_head rq;
-	struct delayed_work sched_work;
+	unsigned int time_slice;
 	unsigned long sched_flags;
-	unsigned long rt_priority;
+	cpumask_t cpus_allowed;
 	int policy;
 	int prio;
 };
@@ -200,9 +196,9 @@ void spu_acquire_saved(struct spu_contex
 int spu_activate(struct spu_context *ctx, unsigned long flags);
 void spu_deactivate(struct spu_context *ctx);
 void spu_yield(struct spu_context *ctx);
-void spu_start_tick(struct spu_context *ctx);
-void spu_stop_tick(struct spu_context *ctx);
-void spu_sched_tick(struct work_struct *work);
+void spu_set_timeslice(struct spu_context *ctx);
+void spu_update_sched_info(struct spu_context *ctx);
+void __spu_update_sched_info(struct spu_context *ctx);
 int __init spu_sched_init(void);
 void __exit spu_sched_exit(void);
Index: linux-2.6/include/asm-powerpc/spu.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/spu.h
+++ linux-2.6/include/asm-powerpc/spu.h
@@ -448,6 +448,7 @@ struct spu_priv1 {
 #define MFC_STATE1_PROBLEM_STATE_MASK		0x08ull
 #define MFC_STATE1_RELOCATE_MASK		0x10ull
 #define MFC_STATE1_MASTER_RUN_CONTROL_MASK	0x20ull
+#define MFC_STATE1_TABLE_SEARCH_MASK		0x40ull
 	u64 mfc_lpid_RW;					/* 0x008 */
 	u64 spu_idr_RW;						/* 0x010 */
 	u64 mfc_vr_RO;						/* 0x018 */
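The central structural change in the sched.c hunks is the move from per-context delayed work to a single scheduler thread driven by a self-rearming timer. Stripped of the spufs specifics it looks roughly like the sketch below; the names here are made up, the real implementation is spusched_thread(), spusched_wake() and spusched_tick() above, and the timer API shown is the one of this kernel generation.

#include <linux/kthread.h>
#include <linux/timer.h>
#include <linux/sched.h>

#define TICK_INTERVAL	10			/* jiffies between ticks */

static struct task_struct *tick_task;
static struct timer_list tick_timer;

static void tick_timer_fn(unsigned long data)
{
	/* re-arm the timer, then kick the scheduler thread */
	mod_timer(&tick_timer, jiffies + TICK_INTERVAL);
	wake_up_process(tick_task);
}

static int tick_thread(void *unused)
{
	setup_timer(&tick_timer, tick_timer_fn, 0);
	mod_timer(&tick_timer, jiffies + TICK_INTERVAL);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
		/* walk the active contexts and charge one time slice each */
	}

	del_timer_sync(&tick_timer);
	return 0;
}

/* started once at init time, e.g. tick_task = kthread_run(tick_thread, NULL, "ticker"); */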