Subject: perfmon2: handle 'partial' samples in hardware-sampling mode
From: Kevin Corry

When a monitored process is being context-switched out, or when we are
switching event-sets, the Cell PMU trace-buffer will likely not be full,
and will likely be in the middle of collecting the next sample to add to
the trace-buffer. In this case, the entry that we record in the
sampling-buffer will have one sample at the end that does not represent
a complete time-interval.

In order for user-space to know this has happened, we add a pm_interval
field to the end of the pfm_cell_hw_smpl_entry_hdr structure. When this
field is non-zero, it indicates that the very last sample in this entry
is "partial", and also indicates how far through the current
time-interval we were when we recorded this sample.

Since the sample is incomplete, we have to record it directly from the
counters instead of the trace-buffer. In this case, we also need to make
sure the counters are reset to the appropriate values, since the next
time this event-set is enabled, it will start a new sampling
time-interval.

Signed-off-by: Kevin Corry
Signed-off-by: Carl Love
Signed-off-by: Arnd Bergmann
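Not part of the patch: to make the new field concrete, here is a minimal
user-space sketch of how a tool might walk the sampling buffer and spot a
partial trailing sample. Everything in it is an assumption for
illustration: the buffer is assumed to be mapped into the tool's address
space with entries starting immediately after the buffer header, the
structures come from a local copy of perfmon_cell_hw_smpl.h, and the
hypothetical "programmed_interval" argument is the pm_interval reload
value the tool itself programmed, on the assumption that pm_interval
counts down toward zero over the sampling interval.

#include <stdio.h>
#include <stdint.h>
#include "perfmon_cell_hw_smpl.h"	/* local copy of the kernel header */

static void walk_sampling_buffer(void *buf, uint32_t programmed_interval)
{
	struct pfm_cell_hw_smpl_hdr *hdr = buf;
	char *pos = (char *)buf + sizeof(*hdr);
	uint64_t i;

	for (i = 0; i < hdr->count; i++) {
		struct pfm_cell_hw_smpl_entry_hdr *ent = (void *)pos;

		printf("entry %u: pid %u, set %u, %u samples\n",
		       ent->entry_num, ent->pid, ent->set,
		       ent->num_samples);

		/* A non-zero pm_interval marks the last sample as partial
		 * (or empty, in occurrence/threshold mode). Assuming a
		 * countdown register, the fraction completed is
		 * (reload - saved) / reload.
		 */
		if (ent->pm_interval && programmed_interval)
			printf("  last sample partial: %.0f%% of interval\n",
			       100.0 *
			       (programmed_interval - ent->pm_interval) /
			       programmed_interval);

		/* Entry header plus num_samples 128-bit (16-byte)
		 * trace-buffer lines. */
		pos += sizeof(*ent) + (size_t)ent->num_samples * 16;
	}
}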
Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell.c
===================================================================
--- linux-2.6.orig/arch/powerpc/perfmon/perfmon_cell.c
+++ linux-2.6/arch/powerpc/perfmon/perfmon_cell.c
@@ -77,7 +77,6 @@ static struct pfm_regmap_desc pfm_cell_p
 };
 #define PFM_PM_NUM_PMCS	ARRAY_SIZE(pfm_cell_pmc_desc)
 
-#define CELL_PMC_PM_STATUS 20
 /*
  * Mapping from Perfmon logical data counters to Cell hardware counters.
  */
@@ -312,7 +311,7 @@ static void pfm_cell_disable_counters(st
 	cbe_disable_pm(smp_processor_id());
 
 	if (smpl_fmt && !strcmp(smpl_fmt->fmt_name, PFM_CELL_HW_SMPL_NAME)) {
-		ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD;
+		ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD_PARTIAL;
 		ctx->ovfl_arg.active_set = ctx->active_set->id;
 		regs = current->thread.regs;
 		smpl_fmt->fmt_handler(ctx->smpl_addr, &ctx->ovfl_arg,
Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c
===================================================================
--- linux-2.6.orig/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c
+++ linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c
@@ -139,6 +139,7 @@ init_entry_header(struct pfm_cell_hw_smp
 	ent_hdr->set = set->id;
 	ent_hdr->num_samples = 0;
 	ent_hdr->entry_num = buf_hdr->count;
+	ent_hdr->pm_interval = 0;
 
 	return ent_hdr;
 }
@@ -168,6 +169,49 @@ static void read_trace_buffer(struct pfm
 }
 
 /**
+ * read_partial_sample
+ *
+ * We are disabling this event-set, and are in the middle of a sampling
+ * interval. When this event-set gets re-enabled, the pm_interval will get
+ * reset to its initial value, so we'll lose the counter values from this
+ * incomplete sample. In "count" sampling-mode, we should be able to prevent
+ * this data loss by recording a "partial" sample from the actual counters.
+ * In occurrence and threshold sampling, however, the partial data is not
+ * available, so we'll have to settle for an "empty" sample. In any case,
+ * fill in the current pm_interval value so user-space will know that the
+ * last sample in this entry is partial or empty, and will be able to
+ * calculate the percentage-complete for that sample.
+ **/
+static void read_partial_sample(struct pfm_event_set *set,
+				struct pfm_cell_hw_smpl_entry_hdr *ent_hdr,
+				u64 **trace_buffer_sample)
+{
+	u32 pm_control = set->pmcs[CELL_PMC_PM_CONTROL];
+	u32 *partial_sample;
+	int i, cpu = smp_processor_id();
+
+	if (CBE_PM_TRACE_MODE_GET(pm_control) == CBE_PM_TRACE_MODE_COUNT) {
+		partial_sample = (u32 *)*trace_buffer_sample;
+		for (i = 0; i < NR_PHYS_CTRS; i++) {
+			partial_sample[i] = cbe_read_phys_ctr(cpu, i);
+		}
+	}
+
+	ent_hdr->pm_interval = cbe_read_pm(cpu, pm_interval);
+	ent_hdr->num_samples++;
+	*trace_buffer_sample += 2;
+
+	/* In all cases, reset the in-use PMDs to their "long" reset
+	 * value, since we've effectively invalidated the data in this
+	 * interval. We have not saved the PMDs to the event-set at
+	 * this point, so write to the actual counter registers.
+	 */
+	for (i = 0; i < NR_CTRS; i++)
+		if (test_bit(i, cast_ulp(&set->used_pmds)))
+			cbe_write_ctr(cpu, i, set->pmds[i].long_reset);
+}
+
+/**
  * handle_full_buffer
  **/
 static int handle_full_buffer(struct pfm_cell_hw_smpl_hdr *buf_hdr,
@@ -217,9 +261,11 @@ static int pfm_cell_hw_smpl_handler(void
 
 	/* If this handler was called due to an actual PMD overflowing, do
 	 * nothing. Only store the contents of the trace-buffer if the trace-
-	 * buffer overflowed.
+	 * buffer overflowed or if we're disabling an event-set (during a
+	 * process context-switch or an event-set switch).
 	 */
-	if (arg->ovfl_pmd != PFM_CELL_HW_SMPL_OVFL_PMD)
+	if (!(arg->ovfl_pmd == PFM_CELL_HW_SMPL_OVFL_PMD ||
+	      arg->ovfl_pmd == PFM_CELL_HW_SMPL_OVFL_PMD_PARTIAL))
 		return 0;
 
 	ctx = __get_cpu_var(pmu_ctx);
@@ -228,16 +274,21 @@ static int pfm_cell_hw_smpl_handler(void
 	/* Check if the sampling-buffer is full. This should never happen,
 	 * since we check if the buffer is full after adding the new entry.
 	 */
-	free_bytes = buf_hdr->buf_size - buf_hdr->cur_offset;
+	free_bytes = hdr->buf_size - hdr->cur_offset;
 	if (free_bytes < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) {
 		PFM_ERR("Cell HW Sampling: Buffer is full "
 			"before adding new entry.");
-		return handle_full_buffer(buf_hdr, ctx, set);
+		return handle_full_buffer(hdr, ctx, set);
 	}
 
-	ent = init_entry_header(buf_hdr, set);
+	ent = init_entry_header(hdr, set);
 
-	read_trace_buffer(ent_hdr, &trace_buffer_sample);
+	read_trace_buffer(ent, &trace_buffer_sample);
+
+	if (arg->ovfl_pmd == PFM_CELL_HW_SMPL_OVFL_PMD_PARTIAL &&
+	    ent->num_samples < CBE_PM_TRACE_BUF_MAX_COUNT) {
+		read_partial_sample(set, ent, &trace_buffer_sample);
+	}
 
 	/* Update the sampling-buffer header for the next entry. Since the
 	 * hw_smpl_hdr and hw_smpl_entry_hdr structures are both padded to
@@ -245,16 +296,16 @@ static int pfm_cell_hw_smpl_handler(void
 	 * every buffer entry will start on a 128-bit boundary.
 	 */
 	if (ent->num_samples) {
-		hdr->cur_offset = (void *)trace_buffer_sample - (void *)buf_hdr;
+		hdr->cur_offset = (void *)trace_buffer_sample - (void *)hdr;
 		hdr->count++;
 	}
 
 	/* Check the available size in the buffer again so we won't lose the
 	 * next sample entry.
 	 */
-	free_bytes = buf_hdr->buf_size - buf_hdr->cur_offset;
+	free_bytes = hdr->buf_size - hdr->cur_offset;
 	if (free_bytes < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE)
-		return handle_full_buffer(buf_hdr, ctx, set);
+		return handle_full_buffer(hdr, ctx, set);
 
 	return 0;
 }
Index: linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/perfmon_cell_hw_smpl.h
+++ linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h
@@ -70,6 +70,12 @@ struct pfm_cell_hw_smpl_hdr {
 * @set: Event-set that was active when the overflow occurred.
 * @num_samples: Number of 128-bit trace-buffer samples in this entry.
 * @entry_num: Sequence number of sampling-buffer entries.
+ * @pm_interval: If the last sample in this entry is for a partial time
+ *	interval, this is the value of pm_interval when the partial
+ *	sample was recorded. If the value is zero, there is no partial
+ *	sample at the end of the entry.
+ * @reserved1: Pad to 128-bit boundary.
+ *
 *
 * The header for each data entry in the sampling-buffer. The entry header
 * is immediately followed by the contents of the trace-buffer. Each line in
@@ -79,10 +85,12 @@ struct pfm_cell_hw_smpl_hdr {
 struct pfm_cell_hw_smpl_entry_hdr {
 	__u32 pid;
 	__u32 tgid;
-	__u16 cpu;
-	__u16 set;
-	__u16 num_samples;
-	__u16 entry_num;
+	__u32 cpu;
+	__u32 set;
+	__u32 num_samples;
+	__u32 entry_num;
+	__u32 pm_interval;
+	__u32 reserved1;
 };
 
 /* The max size of each sampling-buffer entry is the size of the entry header
@@ -98,9 +106,15 @@ struct pfm_cell_hw_smpl_entry_hdr {
 #define PFM_CELL_HW_SMPL_MIN_BUF_SIZE	(sizeof(struct pfm_cell_hw_smpl_hdr) + \
 					 PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE)
 
-#define PFM_CELL_HW_SMPL_VERSION	1
-#define PFM_CELL_HW_SMPL_NAME		"perfmon_cell_hw_smpl"
-#define PFM_CELL_HW_SMPL_OVFL_PMD	(PFM_MAX_PMDS + 1)
-#define PFM_MSG_CELL_HW_SMPL_BUF_FULL	99
+#define PFM_CELL_HW_SMPL_VERSION		1
+#define PFM_CELL_HW_SMPL_NAME			"perfmon_cell_hw_smpl"
+#define PFM_CELL_HW_SMPL_OVFL_PMD		(PFM_MAX_PMDS + 1)
+#define PFM_CELL_HW_SMPL_OVFL_PMD_PARTIAL	(PFM_MAX_PMDS + 2)
+#define PFM_MSG_CELL_HW_SMPL_BUF_FULL		99
+
+/* Values are indexes into the pfm_cell_pmc_desc[] array. */
+#define CELL_PMC_PM_STATUS	20
+#define CELL_PMC_PM_CONTROL	21
+#define CELL_PMC_PM_INTERVAL	22
 
 #endif /* __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ */
Index: linux-2.6/include/asm-powerpc/cell-pmu.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/cell-pmu.h
+++ linux-2.6/include/asm-powerpc/cell-pmu.h
@@ -41,6 +41,11 @@
 #define CBE_PM_FREEZE_ALL_CTRS		0x00100000
 #define CBE_PM_ENABLE_EXT_TRACE		0x00008000
 
+#define CBE_PM_TRACE_MODE_NONE		0
+#define CBE_PM_TRACE_MODE_COUNT		1
+#define CBE_PM_TRACE_MODE_OCCURRENCE	2
+#define CBE_PM_TRACE_MODE_THRESHOLD	3
+
 /* Macros for the trace_address register. */
 #define CBE_PM_TRACE_BUF_FULL		0x00000800
 #define CBE_PM_TRACE_BUF_EMPTY		0x00000400
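
Also not part of the patch: a note on the entry-header layout change.
Widening cpu, set, num_samples, and entry_num from __u16 to __u32 and
appending pm_interval plus reserved1 grows the header from 16 to 32
bytes (eight 32-bit words), so it stays a whole number of 128-bit
trace-buffer lines and every entry still starts on a 128-bit boundary,
as the handler's comment requires. A hypothetical compile-time check,
shown only as a sketch of how the compiler could enforce this:

#include <linux/kernel.h>		/* BUILD_BUG_ON() */
#include <asm/perfmon_cell_hw_smpl.h>

static inline void pfm_cell_hw_smpl_check_layout(void)
{
	/* Eight __u32 fields: 32 bytes, i.e. two 128-bit lines. */
	BUILD_BUG_ON(sizeof(struct pfm_cell_hw_smpl_entry_hdr) != 32);
	/* Both headers must remain multiples of one 128-bit line. */
	BUILD_BUG_ON(sizeof(struct pfm_cell_hw_smpl_entry_hdr) % 16);
	BUILD_BUG_ON(sizeof(struct pfm_cell_hw_smpl_hdr) % 16);
}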