Subject: spu sched: switch from workqueues to kthread + timer tick From: Christoph Hellwig Get rid of the scheduler workqueues, which complicated things a lot, and switch to a dedicated spu scheduler thread that gets woken by a traditional scheduler tick. By default this scheduler tick fires every HZ / 100 jiffies (SPUSCHED_TICK), i.e. roughly every 10 ms, or one spu scheduler tick for every 10 cpu ticks with HZ=1000. Currently the tick is not disabled when we have fewer contexts than available spus, but I will implement this later. Signed-off-by: Christoph Hellwig Signed-off-by: Arnd Bergmann Index: linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/sched.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -45,6 +46,8 @@ #define SPU_TIMESLICE (HZ) +#define SPUSCHED_TICK (HZ / 100) + struct spu_prio_array { DECLARE_BITMAP(bitmap, MAX_PRIO); struct list_head runq[MAX_PRIO]; @@ -54,7 +57,8 @@ struct spu_prio_array { }; static struct spu_prio_array *spu_prio; -static struct workqueue_struct *spu_sched_wq; +static struct task_struct *spusched_task; +static struct timer_list spusched_timer; static inline int node_allowed(int node) { @@ -68,31 +72,6 @@ static inline int node_allowed(int node) return 1; } -void spu_start_tick(struct spu_context *ctx) -{ - if (ctx->policy == SCHED_RR) { - /* - * Make sure the exiting bit is cleared. - */ - clear_bit(SPU_SCHED_EXITING, &ctx->sched_flags); - mb(); - queue_delayed_work(spu_sched_wq, &ctx->sched_work, SPU_TIMESLICE); - } -} - -void spu_stop_tick(struct spu_context *ctx) -{ - if (ctx->policy == SCHED_RR) { - /* - * While the work can be rearming normally setting this flag - * makes sure it does not rearm itself anymore. 
- */ - set_bit(SPU_SCHED_EXITING, &ctx->sched_flags); - mb(); - cancel_delayed_work(&ctx->sched_work); - } -} - /** * spu_add_to_active_list - add spu to active list * @spu: spu to add to the active list @@ -104,6 +83,11 @@ static void spu_add_to_active_list(struc mutex_unlock(&spu_prio->active_mutex[spu->node]); } +static void __spu_remove_from_active_list(struct spu *spu) +{ + list_del_init(&spu->list); +} + /** * spu_remove_from_active_list - remove spu from active list * @spu: spu to remove from the active list @@ -113,7 +97,7 @@ static void spu_remove_from_active_list( int node = spu->node; mutex_lock(&spu_prio->active_mutex[node]); - list_del_init(&spu->list); + __spu_remove_from_active_list(spu); mutex_unlock(&spu_prio->active_mutex[node]); } @@ -188,7 +172,6 @@ static void spu_bind_context(struct spu spu->timestamp = jiffies; spu_cpu_affinity_set(spu, raw_smp_processor_id()); spu_switch_notify(spu, ctx); - spu_add_to_active_list(spu); ctx->state = SPU_STATE_RUNNABLE; } @@ -202,7 +185,6 @@ static void spu_unbind_context(struct sp pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__, spu->pid, spu->number, spu->node); - spu_remove_from_active_list(spu); spu_switch_notify(spu, NULL); spu_unmap_mappings(ctx); spu_save(&ctx->csa, spu); @@ -340,6 +322,7 @@ static struct spu *find_victim(struct sp victim = NULL; goto restart; } + spu_remove_from_active_list(spu); spu_unbind_context(spu, victim); mutex_unlock(&victim->state_mutex); /* @@ -382,6 +365,7 @@ int spu_activate(struct spu_context *ctx spu = find_victim(ctx); if (spu) { spu_bind_context(spu, ctx); + spu_add_to_active_list(spu); return 0; } @@ -425,6 +409,7 @@ static int __spu_deactivate(struct spu_c if (spu) { new = grab_runnable_context(max_prio); if (new || force) { + spu_remove_from_active_list(spu); spu_unbind_context(spu, ctx); spu_free(spu); if (new) @@ -465,51 +450,78 @@ void spu_yield(struct spu_context *ctx) } } -void spu_sched_tick(struct work_struct *work) +static void 
spusched_tick(struct spu_context *ctx) { - struct spu_context *ctx = - container_of(work, struct spu_context, sched_work.work); - int preempted; + if (ctx->policy != SCHED_RR || --ctx->time_slice) + return; /* - * If this context is being stopped avoid rescheduling from the - * scheduler tick because we would block on the state_mutex. - * The caller will yield the spu later on anyway. + * Unfortunately active_mutex ranks outside of state_mutex, so + * we have to trylock here. If we fail give the context another + * tick and try again. */ - if (test_bit(SPU_SCHED_EXITING, &ctx->sched_flags)) - return; - - mutex_lock(&ctx->state_mutex); - preempted = __spu_deactivate(ctx, 0, ctx->prio + 1); - mutex_unlock(&ctx->state_mutex); + if (mutex_trylock(&ctx->state_mutex)) { + struct spu_context *new = grab_runnable_context(ctx->prio + 1); + if (new) { + struct spu *spu = ctx->spu; - if (preempted) { - /* - * We need to break out of the wait loop in spu_run manually - * to ensure this context gets put on the runqueue again - * ASAP. - */ - wake_up(&ctx->stop_wq); + __spu_remove_from_active_list(spu); + spu_unbind_context(spu, ctx); + spu_free(spu); + wake_up(&new->stop_wq); + /* + * We need to break out of the wait loop in + * spu_run manually to ensure this context + * gets put on the runqueue again ASAP. 
+ */ + wake_up(&ctx->stop_wq); + } + ctx->time_slice = SPU_DEF_TIMESLICE; + mutex_unlock(&ctx->state_mutex); } else { - spu_start_tick(ctx); + ctx->time_slice++; } } +static void spusched_wake(unsigned long data) +{ + mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + wake_up_process(spusched_task); +} + +static int spusched_thread(void *unused) +{ + struct spu *spu, *next; + int node; + + setup_timer(&spusched_timer, spusched_wake, 0); + __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + for (node = 0; node < MAX_NUMNODES; node++) { + mutex_lock(&spu_prio->active_mutex[node]); + list_for_each_entry_safe(spu, next, + &spu_prio->active_list[node], + list) + spusched_tick(spu->ctx); + mutex_unlock(&spu_prio->active_mutex[node]); + } + } + + del_timer_sync(&spusched_timer); + return 0; +} + int __init spu_sched_init(void) { int i; - spu_sched_wq = create_singlethread_workqueue("spusched"); - if (!spu_sched_wq) - return 1; - spu_prio = kzalloc(sizeof(struct spu_prio_array), GFP_KERNEL); - if (!spu_prio) { - printk(KERN_WARNING "%s: Unable to allocate priority queue.\n", - __FUNCTION__); - destroy_workqueue(spu_sched_wq); - return 1; - } + if (!spu_prio) + return -ENOMEM; + for (i = 0; i < MAX_PRIO; i++) { INIT_LIST_HEAD(&spu_prio->runq[i]); __clear_bit(i, spu_prio->bitmap); @@ -520,7 +532,14 @@ int __init spu_sched_init(void) INIT_LIST_HEAD(&spu_prio->active_list[i]); } spin_lock_init(&spu_prio->runq_lock); + + spusched_task = kthread_run(spusched_thread, NULL, "spusched"); + if (IS_ERR(spusched_task)) { + kfree(spu_prio); + return PTR_ERR(spusched_task); + } return 0; + } void __exit spu_sched_exit(void) @@ -528,6 +547,8 @@ void __exit spu_sched_exit(void) struct spu *spu, *tmp; int node; + kthread_stop(spusched_task); + for (node = 0; node < MAX_NUMNODES; node++) { mutex_lock(&spu_prio->active_mutex[node]); list_for_each_entry_safe(spu, tmp, 
&spu_prio->active_list[node], @@ -538,5 +559,4 @@ void __exit spu_sched_exit(void) mutex_unlock(&spu_prio->active_mutex[node]); } kfree(spu_prio); - destroy_workqueue(spu_sched_wq); } Index: linux-2.6/arch/powerpc/platforms/cell/spufs/context.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/context.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/context.c @@ -57,7 +57,7 @@ struct spu_context *alloc_spu_context(st ctx->rt_priority = current->rt_priority; ctx->policy = current->policy; ctx->prio = current->prio; - INIT_DELAYED_WORK(&ctx->sched_work, spu_sched_tick); + ctx->time_slice = SPU_DEF_TIMESLICE; goto out; out_free: kfree(ctx); @@ -180,5 +180,3 @@ void * spu_get_profile_private_kref(stru return ctx->prof_priv_kref; } EXPORT_SYMBOL_GPL(spu_get_profile_private_kref); - - Index: linux-2.6/arch/powerpc/platforms/cell/spufs/run.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/run.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/run.c @@ -144,7 +144,6 @@ static int spu_run_init(struct spu_conte runcntl = SPU_RUNCNTL_RUNNABLE; ctx->ops->runcntl_write(ctx, runcntl); } else { - spu_start_tick(ctx); ctx->ops->npc_write(ctx, *npc); ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE); } @@ -157,7 +156,6 @@ static int spu_run_fini(struct spu_conte { int ret = 0; - spu_stop_tick(ctx); *status = ctx->ops->status_read(ctx); *npc = ctx->ops->npc_read(ctx); spu_release(ctx); @@ -337,10 +335,8 @@ long spufs_run_spu(struct file *file, st if (unlikely(ctx->state != SPU_STATE_RUNNABLE)) { ret = spu_reacquire_runnable(ctx, npc, &status); - if (ret) { - spu_stop_tick(ctx); + if (ret) goto out2; - } continue; } ret = spu_process_events(ctx); Index: linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/spufs.h +++ 
linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h @@ -31,6 +31,8 @@ #include #include +#define SPU_DEF_TIMESLICE 100 + /* The magic number for our file system */ enum { SPUFS_MAGIC = 0x23c9b64e, @@ -41,8 +43,7 @@ struct spu_gang; /* ctx->sched_flags */ enum { - SPU_SCHED_EXITING = 0, - SPU_SCHED_NOTIFY_ACTIVE, + SPU_SCHED_NOTIFY_ACTIVE = 0, }; struct spu_context { @@ -86,7 +87,7 @@ struct spu_context { /* scheduler fields */ struct list_head rq; - struct delayed_work sched_work; + unsigned int time_slice; unsigned long sched_flags; unsigned long rt_priority; int policy; @@ -204,9 +205,6 @@ int spu_activate(struct spu_context *ctx void spu_deactivate(struct spu_context *ctx); void spu_yield(struct spu_context *ctx); void spu_switch_notify(struct spu *spu, struct spu_context *ctx); -void spu_start_tick(struct spu_context *ctx); -void spu_stop_tick(struct spu_context *ctx); -void spu_sched_tick(struct work_struct *work); int __init spu_sched_init(void); void __exit spu_sched_exit(void);