Subject: Re: fix physical spu utilization statistics From: Luke Browning SPU contexts only accrue SPU time while they are loaded or running on an SPU. Time spent on the runqueue is not an interesting physical spu utilization statistic. This patch fixes that bug by only measuring time when the context is associated with an SPU. This patch also uses the timebase register to improve granularity of time stamping and provide a more repeatable statistic. Many of the critical sections that are measured fall into the few-microsecond range, e.g. context switching. For SPE contexts, utilization should be interpreted as: The "user" category measures the time spent executing SPE application code. The "system" utilization category is a measure of system overhead and includes time spent context switching and executing remote system calls. The physical SPU is not available to be used to schedule other SPU contexts while in this state. The "iowait" time measures time spent processing page faults and provides an indication of the potential benefit that could be realized by tuning the I/O subsystem and/or the application. The "loaded" statistic reflects the time the context spends lazily loaded. Statistics are also collected at the logical SPU level. In general, the SPE context statistics are simply added to the corresponding logical SPE utilization bucket, except that the SPE context "loaded" category is added to the system spu "idle" bucket. Signed-off-by: Luke Browning Signed-off-by: Arnd Bergmann --- Fixes made so far: - whitespace formatting - unbind context state transition marked as loaded, so that the underlying physical spu is marked as idle. The state of the spu context in this case doesn't matter, since it is not bound to a physical spu. 
--- Index: linux-2.6/arch/powerpc/platforms/cell/spu_base.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spu_base.c +++ linux-2.6/arch/powerpc/platforms/cell/spu_base.c @@ -627,8 +627,8 @@ static int __init create_spu(void *data) spin_unlock_irqrestore(&spu_full_list_lock, flags); mutex_unlock(&spu_full_list_mutex); - spu->stats.utilization_state = SPU_UTIL_IDLE; - spu->stats.tstamp = jiffies; + spu->stats.util_state = SPU_UTIL_IDLE; + spu->stats.tstamp = get_tbl(); INIT_LIST_HEAD(&spu->aff_list); @@ -653,10 +653,15 @@ static unsigned long long spu_acct_time( { unsigned long long time = spu->stats.times[state]; - if (spu->stats.utilization_state == state) - time += jiffies - spu->stats.tstamp; + /* + * If the spu is idle or the context is stopped, utilization + * statistics are not updated. Apply the time delta from the + * last recorded state of the spu. + */ + if (spu->stats.util_state == state) + time += get_tbl() - spu->stats.tstamp; - return jiffies_to_msecs(time); + return time / tb_ticks_per_usec / 1000; /* msecs */ } @@ -666,7 +671,7 @@ static ssize_t spu_stat_show(struct sys_ return sprintf(buf, "%s %llu %llu %llu %llu " "%llu %llu %llu %llu %llu %llu %llu %llu\n", - spu_state_names[spu->stats.utilization_state], + spu_state_names[spu->stats.util_state], spu_acct_time(spu, SPU_UTIL_USER), spu_acct_time(spu, SPU_UTIL_SYSTEM), spu_acct_time(spu, SPU_UTIL_IOWAIT), Index: linux-2.6/arch/powerpc/platforms/cell/spufs/context.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/context.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/context.c @@ -61,8 +61,7 @@ struct spu_context *alloc_spu_context(st spu_gang_add_ctx(gang, ctx); ctx->cpus_allowed = current->cpus_allowed; spu_set_timeslice(ctx); - ctx->stats.execution_state = SPUCTX_UTIL_USER; - ctx->stats.tstamp = jiffies; + ctx->stats.util_state = SPUCTX_UTIL_USER; 
atomic_inc(&nr_spu_contexts); goto out; Index: linux-2.6/arch/powerpc/platforms/cell/spufs/fault.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/fault.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/fault.c @@ -187,10 +187,8 @@ int spufs_handle_class1(struct spu_conte dsisr, ctx->state); ctx->stats.hash_flt++; - if (ctx->state == SPU_STATE_RUNNABLE) { + if (ctx->state == SPU_STATE_RUNNABLE) ctx->spu->stats.hash_flt++; - spu_switch_state(ctx->spu, SPU_UTIL_IOWAIT); - } /* we must not hold the lock when entering spu_handle_mm_fault */ spu_release(ctx); Index: linux-2.6/arch/powerpc/platforms/cell/spufs/file.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/file.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/file.c @@ -2065,14 +2065,23 @@ static const char *ctx_state_names[] = { }; static unsigned long long spufs_acct_time(struct spu_context *ctx, - enum spuctx_execution_state state) + enum spuctx_utilization_state state) { unsigned long time = ctx->stats.times[state]; - if (ctx->stats.execution_state == state) - time += jiffies - ctx->stats.tstamp; + /* + * In general, utilization statistics are updated by the controlling + * thread as the spu context moves through various well defined + * state transitions, but if the context is lazily loaded its + * utilization statistics are not updated as the controlling thread + * is not tightly coupled with the execution of the spu context. We + * calculate and apply the time delta from the last recorded state + * of the spu context. 
+ */ + if (ctx->spu && ctx->stats.util_state == state) + time += get_tbl() - ctx->stats.tstamp; - return jiffies_to_msecs(time); + return time / tb_ticks_per_usec / 1000; /* msecs */ } static unsigned long long spufs_slb_flts(struct spu_context *ctx) @@ -2107,7 +2116,7 @@ static int spufs_show_stat(struct seq_fi spu_acquire(ctx); seq_printf(s, "%s %llu %llu %llu %llu " "%llu %llu %llu %llu %llu %llu %llu %llu\n", - ctx_state_names[ctx->stats.execution_state], + ctx_state_names[ctx->stats.util_state], spufs_acct_time(ctx, SPUCTX_UTIL_USER), spufs_acct_time(ctx, SPUCTX_UTIL_SYSTEM), spufs_acct_time(ctx, SPUCTX_UTIL_IOWAIT), Index: linux-2.6/arch/powerpc/platforms/cell/spufs/run.c =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/run.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/run.c @@ -128,6 +128,9 @@ out: static int spu_run_init(struct spu_context *ctx, u32 * npc) { + + spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); + if (ctx->flags & SPU_CREATE_ISOLATE) { unsigned long runcntl; @@ -153,6 +156,8 @@ static int spu_run_init(struct spu_conte ctx->ops->runcntl_write(ctx, SPU_RUNCNTL_RUNNABLE); } + spuctx_switch_state(ctx, SPUCTX_UTIL_USER); + return 0; } @@ -161,8 +166,12 @@ static int spu_run_fini(struct spu_conte { int ret = 0; + spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); + *status = ctx->ops->status_read(ctx); *npc = ctx->ops->npc_read(ctx); + + spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED); spu_release(ctx); if (signal_pending(current)) @@ -332,6 +341,9 @@ long spufs_run_spu(struct file *file, st spu = ctx->spu; if (unlikely(ret)) break; + + spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); + if (unlikely(test_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags))) { clear_bit(SPU_SCHED_NOTIFY_ACTIVE, &ctx->sched_flags); Index: linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c =================================================================== --- 
linux-2.6.orig/arch/powerpc/platforms/cell/spufs/sched.c +++ linux-2.6/arch/powerpc/platforms/cell/spufs/sched.c @@ -393,6 +393,8 @@ static void spu_bind_context(struct spu pr_debug("%s: pid=%d SPU=%d NODE=%d\n", __FUNCTION__, current->pid, spu->number, spu->node); + spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); + if (ctx->flags & SPU_CREATE_NOSCHED) atomic_inc(&cbe_spu_info[spu->node].reserved_spus); if (!list_empty(&ctx->aff_list)) @@ -405,6 +407,7 @@ static void spu_bind_context(struct spu spu->flags = 0; ctx->spu = spu; ctx->ops = &spu_hw_ops; + ctx->state = SPU_STATE_RUNNABLE; spu->pid = current->pid; spu->tgid = current->tgid; spu_associate_mm(spu, ctx->owner); @@ -419,8 +422,8 @@ static void spu_bind_context(struct spu spu->timestamp = jiffies; spu_cpu_affinity_set(spu, raw_smp_processor_id()); spu_switch_notify(spu, ctx); - ctx->state = SPU_STATE_RUNNABLE; - spu_switch_state(spu, SPU_UTIL_SYSTEM); + + spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED); } /** @@ -433,7 +436,7 @@ static void spu_unbind_context(struct sp pr_debug("%s: unbind pid=%d SPU=%d NODE=%d\n", __FUNCTION__, spu->pid, spu->number, spu->node); - spu_switch_state(spu, SPU_UTIL_IDLE); + spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); if (spu->ctx->flags & SPU_CREATE_NOSCHED) atomic_dec(&cbe_spu_info[spu->node].reserved_spus); @@ -444,7 +447,6 @@ static void spu_unbind_context(struct sp spu_unmap_mappings(ctx); spu_save(&ctx->csa, spu); spu->timestamp = jiffies; - ctx->state = SPU_STATE_SAVED; spu->ibox_callback = NULL; spu->wbox_callback = NULL; spu->stop_callback = NULL; @@ -454,14 +456,18 @@ static void spu_unbind_context(struct sp spu->pid = 0; spu->tgid = 0; ctx->ops = &spu_backing_ops; - ctx->spu = NULL; - spu->flags = 0; spu->ctx = NULL; + spu->flags = 0; ctx->stats.slb_flt += (spu->stats.slb_flt - ctx->stats.slb_flt_base); ctx->stats.class2_intr += (spu->stats.class2_intr - ctx->stats.class2_intr_base); + + /* This maps the underlying spu state to idle */ + spuctx_switch_state(ctx, 
SPUCTX_UTIL_LOADED); + ctx->spu = NULL; + ctx->state = SPU_STATE_SAVED; } /** @@ -634,8 +640,6 @@ static struct spu *find_victim(struct sp */ int spu_activate(struct spu_context *ctx, unsigned long flags) { - spuctx_switch_state(ctx, SPUCTX_UTIL_SYSTEM); - do { struct spu *spu; @@ -739,7 +743,6 @@ void spu_deactivate(struct spu_context * } __spu_deactivate(ctx, 1, MAX_PRIO); - spuctx_switch_state(ctx, SPUCTX_UTIL_USER); } /** @@ -754,12 +757,7 @@ void spu_yield(struct spu_context *ctx) { if (!(ctx->flags & SPU_CREATE_NOSCHED)) { mutex_lock(&ctx->state_mutex); - if (__spu_deactivate(ctx, 0, MAX_PRIO)) - spuctx_switch_state(ctx, SPUCTX_UTIL_USER); - else { - spuctx_switch_state(ctx, SPUCTX_UTIL_LOADED); - spu_switch_state(ctx->spu, SPU_UTIL_USER); - } + __spu_deactivate(ctx, 0, MAX_PRIO); mutex_unlock(&ctx->state_mutex); } } Index: linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h =================================================================== --- linux-2.6.orig/arch/powerpc/platforms/cell/spufs/spufs.h +++ linux-2.6/arch/powerpc/platforms/cell/spufs/spufs.h @@ -44,12 +44,14 @@ struct spu_gang; * This is the state for spu utilization reporting to userspace. * Because this state is visible to userspace it must never change and needs * to be kept strictly separate from any internal state kept by the kernel. + * + * Must be kept in sync with spu_utilization_state. 
*/ -enum spuctx_execution_state { +enum spuctx_utilization_state { SPUCTX_UTIL_USER = 0, SPUCTX_UTIL_SYSTEM, SPUCTX_UTIL_IOWAIT, - SPUCTX_UTIL_LOADED, + SPUCTX_UTIL_LOADED, /* mapped to SPU_UTIL_IDLE at system spu level*/ SPUCTX_UTIL_MAX }; @@ -111,7 +113,7 @@ struct spu_context { /* statistics */ struct { /* updates protected by ctx->state_mutex */ - enum spuctx_execution_state execution_state; + enum spuctx_utilization_state util_state; unsigned long tstamp; /* time of last ctx switch */ unsigned long times[SPUCTX_UTIL_MAX]; unsigned long long vol_ctx_switch; @@ -319,30 +321,31 @@ extern int spufs_coredump_num_notes; * line. */ static inline void spuctx_switch_state(struct spu_context *ctx, - enum spuctx_execution_state new_state) + enum spuctx_utilization_state new_state) { - WARN_ON(!mutex_is_locked(&ctx->state_mutex)); - - if (ctx->stats.execution_state != new_state) { - unsigned long curtime = jiffies; + unsigned long long curtime = get_tbl(); + unsigned long long delta = curtime - ctx->stats.tstamp; + struct spu *spu; + enum spuctx_utilization_state old_state; - ctx->stats.times[ctx->stats.execution_state] += - curtime - ctx->stats.tstamp; - ctx->stats.tstamp = curtime; - ctx->stats.execution_state = new_state; - } -} + WARN_ON(!mutex_is_locked(&ctx->state_mutex)); -static inline void spu_switch_state(struct spu *spu, - enum spuctx_execution_state new_state) -{ - if (spu->stats.utilization_state != new_state) { - unsigned long curtime = jiffies; + if ((signed long) delta < 0) + return; - spu->stats.times[spu->stats.utilization_state] += - curtime - spu->stats.tstamp; + spu = ctx->spu; + old_state = ctx->stats.util_state; + ctx->stats.util_state = new_state; + ctx->stats.tstamp = curtime; + + /* + * Update the physical SPU utilization statistics. 
+ */ + if (spu) { + ctx->stats.times[old_state] += delta; + spu->stats.times[old_state] += delta; + spu->stats.util_state = new_state; spu->stats.tstamp = curtime; - spu->stats.utilization_state = new_state; } } Index: linux-2.6/include/asm-powerpc/spu.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/spu.h +++ linux-2.6/include/asm-powerpc/spu.h @@ -106,9 +106,15 @@ struct spu_context; struct spu_runqueue; struct device_node; +/* + * MUST be kept in sync with spuctx_utilization_state as those + * utilization buckets are dumped into the spu structure's buckets. The + * spu_context category SPUCTX_UTIL_LOADED is dumped into SPU_UTIL_IDLE as the + * lazily loaded context may be preempted by any context that is ready to run. + */ enum spu_utilization_state { - SPU_UTIL_SYSTEM, SPU_UTIL_USER, + SPU_UTIL_SYSTEM, SPU_UTIL_IOWAIT, SPU_UTIL_IDLE, SPU_UTIL_MAX @@ -172,9 +178,9 @@ struct spu { struct { /* protected by interrupt reentrancy */ - enum spu_utilization_state utilization_state; - unsigned long tstamp; /* time of last ctx switch */ - unsigned long times[SPU_UTIL_MAX]; + enum spu_utilization_state util_state; + unsigned long long tstamp; + unsigned long long times[SPU_UTIL_MAX]; unsigned long long vol_ctx_switch; unsigned long long invol_ctx_switch; unsigned long long min_flt;