Simple Performance Counters Simple Performance Counters This patch allows the use of simple performance counters to measure time intervals in the kernel source code. This allows a detailed analysis of the time spend and the amount of data processed in specific code sections of the kernel. Time is measured using the cycle counter (TSC on IA32, ITC on IA64) which has a very low latency. To use add #include to the header of the file where the measurement needs to take place. Then add the folowing to the code: To declare a time stamp do struct pc pc; To mark the beginning of the time measurement do pc_start(&pc, ) (If measurement from the beginning of a function is desired one may use INIT_PC(xx) instead). To mark the end of the time frame do: pc_stop(&pc); or if the amount of data transferred needs to be measured as well: pc_bytes(&pc, number-of-bytes); The measurements will show up in /proc/perf/all. Processor specific statistics may be obtained via /proc/perf/. Writing to /proc/perf/reset will reset all counters. F.e. echo >/proc/perf/reset The first counter is the number of times that the time measurement was performed. (+ xx) is the number of samples that were thrown away since the processor on which the process is running changed. Cycle counters may not be consistent across different processors. Then follows the sum of the time spend in the code segment followed in parentheses by the minimum / average / maximum time spent there. The second block are the sizes of data processed. Signed-off-by: Christoph Lameter --- arch/i386/kernel/tsc.c | 10 - arch/x86_64/kernel/tsc.c | 4 include/asm-i386/timex.h | 8 + include/asm-ia64/timex.h | 3 include/asm-x86_64/timex.h | 1 include/linux/perf.h | 60 ++++++++ init/Kconfig | 9 + kernel/perf.c | 333 +++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 3 mm/page_alloc.c | 3 mm/vmscan.c | 9 + 11 files changed, 433 insertions(+), 10 deletions(-) Index: linux-2.6.21-rc6/include/linux/perf.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-rc6/include/linux/perf.h 2007-04-13 22:19:08.000000000 -0700 @@ -0,0 +1,60 @@ +#ifndef __LINUX_PERF_H +#define __LINUX_PERF_H + +#include +/* + * Time Stamp Performance Counters + * + * (C) 2007 sgi + * Christoph Lameter , April 2007 + * + * Counters are calculated using the cycle counter. If a process + * is migrated to another cpu during the measurement then the measurement + * is invalid. + */ + +enum pc_item { + PC_UPDATE_PROCESS_TIMES, + PC_ALLOC_PAGES, + PC_DIRECT_RECLAIM_SCANNED, + PC_DIRECT_RECLAIM_RECLAIMED, + PC_RECLAIM_SCANNED, + PC_RECLAIM_RECLAIMED, + NR_PC_ITEMS +}; + +/* + * Information about the start of the measurement + */ +struct pc { + unsigned long time; + int processor; + enum pc_item item; +}; + +#define pc_stop(__pc) pc_bytes(__pc, 0) + +#ifdef CONFIG_PERFCOUNT + +#define INIT_PC(__var, __item) struct pc __var = \ + { get_cycles(), smp_processor_id(), __item } + +static inline void pc_start(struct pc *pc, enum pc_item nr) +{ + pc->item = nr; + pc->processor = smp_processor_id(); + pc->time = get_cycles(); +} + +void pc_bytes(struct pc *pc, unsigned long bytes); + +#else + +#define INIT_PC(__var, __item) do { } while(0) +static inline void pc_start(struct pc *pc, enum pc_item nr) {} +static inline void pc_bytes(struct pc *pc, unsigned long bytes) {} + +#endif + +#endif + Index: linux-2.6.21-rc6/kernel/perf.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.21-rc6/kernel/perf.c 2007-04-13 22:19:08.000000000 -0700 @@ -0,0 +1,333 @@ +/* + * Simple Performance Counter subsystem + * + * (C) 2007 sgi. + * + * April 2007, Christoph Lameter + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NUMA +static int unsynced_get_cycles = 1; +#else +#define unsynced_get_cycles = 0 +#endif + +const char *var_id[NR_PC_ITEMS] = { + "update_process_times", + "alloc_pages", + "direct_reclaim_scanned", + "direct_reclaim_reclaimed", + "reclaim_scanned", + "reclaim_reclaimed" +}; + +struct perf_counter { + u32 events; + u32 mintime; + u32 maxtime; + u32 minbytes; + u32 maxbytes; + u32 skipped; + u64 time; + u64 bytes; +}; + +static DEFINE_PER_CPU(struct perf_counter, perf_counters)[NR_PC_ITEMS]; + +void pc_bytes(struct pc *pc, unsigned long bytes) +{ + unsigned long time = get_cycles(); + unsigned long ns; + struct perf_counter *p = &get_cpu_var(perf_counters)[pc->item]; + + if (unlikely(pc->item >= NR_PC_ITEMS)) { + printk(KERN_CRIT "pc_bytes: item number " + "(%d) out of range\n", pc->item); + dump_stack(); + goto out; + } + + if (unlikely(unsynced_get_cycles && + pc->processor != smp_processor_id())) { + /* On different processor. TSC measurement not possible. */ + p->skipped++; + goto out; + } + + ns = cycles_to_ns((unsigned long long)(time - pc->time)); + p->time += ns; + p->events++; + + if (ns > p->maxtime) + p->maxtime = ns; + + if (p->mintime == 0 || ns < p->mintime) + p->mintime = ns; + + if (bytes) { + p->bytes += bytes; + if (bytes > p->maxbytes) + p->maxbytes = bytes; + if (p->minbytes == 0 || bytes < p->minbytes) + p->minbytes = bytes; + } +out: + put_cpu_var(); + return; +} +EXPORT_SYMBOL(pc_bytes); + +static void reset_perfcount_item(struct perf_counter *c) +{ + memset(c, 0, sizeof(struct perf_counter)); +} + +static void perfcount_reset(void) { + int cpu; + enum pc_item i; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_PC_ITEMS; i++) + reset_perfcount_item( + &per_cpu(perf_counters, cpu)[i]); +} + +struct unit { + unsigned int n; + const char * s; +}; + +static const struct unit event_units[] = { + { 1000, "" }, + { 1000, "K" }, + { 1000, "M" }, + { 1000, "G" }, + { 1000, "T" }, + { 1000, "P" }, + { 1000, "Q" }, +}; + + +static const struct unit time_units[] = { + { 1000, "ns" }, + { 1000, "us" }, + { 1000, "ms" }, + { 60, "s" }, + { 60, "m" }, + { 24, "h" }, + { 365, "d" }, + { 1000, "y" }, +}; + +static const struct unit byte_units[] = { + { 1000, "b" }, + { 1000, "kb" }, + { 1000, "mb" }, + { 1000, "gb" }, + { 1000, "tb" }, + { 1000, "pb" }, + { 1000, "qb" } +}; + +/* Print a value using the given array of units and scale it properly */ +static void pval(struct seq_file *s, unsigned long x, const struct unit *u) +{ + unsigned n = 0; + unsigned rem = 0; + unsigned last_divisor = 0; + + while (x >= u[n].n) { + last_divisor = u[n].n; + rem = x % last_divisor; + x = x / last_divisor; + n++; + } + + if (last_divisor) + rem = (rem * 10 + last_divisor / 2) / last_divisor; + else + rem = 0; + + /* + * Rounding may have resulted in the need to go + * to the next number + */ + if (rem == 10) { + x++; + rem = 0; + }; + + seq_printf(s, "%lu", x); + if (rem) { + seq_putc(s, '.'); + seq_putc(s, '0' + rem); + } + seq_puts(s, u[n].s); +} + +/* Print a set of statistical values in the form sum(max/avg/min) */ +static void pc_print(struct seq_file *s, const struct unit *u, + unsigned long count, unsigned long sum, + unsigned long min, unsigned long max) +{ + pval(s, sum, u); + seq_putc(s,'('); + pval(s, min, u); + seq_putc(s,'/'); + if (count) + pval(s, (sum + count / 2 ) / count, u); + else + pval(s, 0, u); + seq_putc(s,'/'); + pval(s, max, u); + seq_putc(s,')'); +} + + +static int perf_show(struct seq_file *s, void *v) +{ + int cpu = (unsigned long)s->private; + enum pc_item counter = (unsigned long)v - 1; + struct perf_counter summary, *x; + + if (cpu >= 0) + x = &per_cpu(perf_counters, cpu)[counter]; + else { + memcpy(&summary, &per_cpu(perf_counters, 0)[counter], + sizeof(summary)); + for_each_online_cpu(cpu) { + struct perf_counter *c = + &per_cpu(perf_counters, 0)[counter]; + + summary.events += c->events; + summary.skipped += c->skipped; + summary.time += c->time; + summary.bytes += c->bytes; + + if (summary.maxtime < c->maxtime) + summary.maxtime = c->maxtime; + + if (summary.mintime == 0 || + (c->mintime != 0 && + summary.mintime > c->mintime)) + summary.mintime = c->mintime; + + if (summary.maxbytes < c->maxbytes) + summary.maxbytes = c->maxbytes; + + if (summary.minbytes == 0 || + (c->minbytes != 0 && + summary.minbytes > c->minbytes)) + summary.minbytes = c->minbytes; + + } + x = &summary; + } + + seq_printf(s, "%-30s %10u ", var_id[counter], x->events); + if (x->skipped) + seq_printf(s, "(+%3u) ", x->skipped); + pc_print(s, time_units, x->events, x->time, x->mintime, x->maxtime); + if (x->bytes) { + seq_putc(s,' '); + pc_print(s, byte_units, x->events, x->bytes, + x->minbytes, x->maxbytes); + } + seq_putc(s, '\n'); + return 0; +} + +static void *perf_start(struct seq_file *m, loff_t *pos) +{ + return (*pos < NR_PC_ITEMS) ? (void *)(*pos +1) : NULL; +} + +static void *perf_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return perf_start(m, pos); +} + +static void perf_stop(struct seq_file *m, void *v) +{ +} + +struct seq_operations perf_data_ops = { + .start = perf_start, + .next = perf_next, + .stop = perf_stop, + .show = perf_show, +}; + +static int perf_data_open(struct inode *inode, struct file *file) +{ + int res; + + res = seq_open(file, &perf_data_ops); + if (!res) + ((struct seq_file *)file->private_data)->private = + PDE(inode)->data; + + return res; +} + +static struct file_operations perf_data_fops = { + .open = perf_data_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int perf_reset_write(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + perfcount_reset(); + return count; +} + +static __init int init_perfcounter(void) { + int cpu; + + struct proc_dir_entry *proc_perf, *perf_reset, *perf_all; + + proc_perf = proc_mkdir("perf", NULL); + if (!proc_perf) + return -ENOMEM; + + perf_reset = create_proc_entry("reset", S_IWUGO, proc_perf); + perf_reset->write_proc = perf_reset_write; + + perf_all = create_proc_entry("all", S_IRUGO, proc_perf); + perf_all->proc_fops = &perf_data_fops; + perf_all->data = (void *)-1; + + for_each_possible_cpu(cpu) { + char name[20]; + struct proc_dir_entry *p; + + sprintf(name, "%d", cpu); + p = create_proc_entry(name, S_IRUGO, proc_perf); + + p->proc_fops = &perf_data_fops; + p->data = (void *)(unsigned long)cpu; + } + + perfcount_reset(); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + if (!unsynchronized_tsc()) + unsynced_get_cycles = 0; +#endif + return 0; +} + +__initcall(init_perfcounter); + Index: linux-2.6.21-rc6/arch/x86_64/kernel/tsc.c =================================================================== --- linux-2.6.21-rc6.orig/arch/x86_64/kernel/tsc.c 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/arch/x86_64/kernel/tsc.c 2007-04-13 22:19:08.000000000 -0700 @@ -21,7 +21,7 @@ void set_cyc2ns_scale(unsigned long khz) cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; } -static unsigned long long cycles_2_ns(unsigned long long cyc) +unsigned long long cycles_to_ns(unsigned long long cyc) { return (cyc * cyc2ns_scale) >> NS_SCALE; } @@ -37,7 +37,7 @@ unsigned long long sched_clock(void) */ rdtscll(a); - return cycles_2_ns(a); + return cycles_to_ns(a); } static int tsc_unstable; Index: linux-2.6.21-rc6/include/asm-x86_64/timex.h =================================================================== --- linux-2.6.21-rc6.orig/include/asm-x86_64/timex.h 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/include/asm-x86_64/timex.h 2007-04-13 22:19:08.000000000 -0700 @@ -29,4 +29,5 @@ extern int read_current_timer(unsigned l extern void mark_tsc_unstable(void); extern void set_cyc2ns_scale(unsigned long khz); +unsigned long long cycles_to_ns(unsigned long long cyc); #endif Index: linux-2.6.21-rc6/arch/i386/kernel/tsc.c =================================================================== --- linux-2.6.21-rc6.orig/arch/i386/kernel/tsc.c 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/arch/i386/kernel/tsc.c 2007-04-13 22:19:08.000000000 -0700 @@ -83,19 +83,13 @@ static inline int check_tsc_unstable(voi * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale __read_mostly; - -#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ +unsigned long cyc2ns_scale __read_mostly; static inline void set_cyc2ns_scale(unsigned long cpu_khz) { cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz; } -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; -} /* * Scheduler clock - returns current time in nanosec units. @@ -115,7 +109,7 @@ unsigned long long sched_clock(void) get_scheduled_cycles(this_offset); /* return the value in ns */ - return cycles_2_ns(this_offset); + return cycles_to_ns(this_offset); } unsigned long native_calculate_cpu_khz(void) Index: linux-2.6.21-rc6/include/asm-ia64/timex.h =================================================================== --- linux-2.6.21-rc6.orig/include/asm-ia64/timex.h 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/include/asm-ia64/timex.h 2007-04-13 22:19:08.000000000 -0700 @@ -39,4 +39,7 @@ get_cycles (void) return ret; } +#define cycles_to_ns(x) (((x) * local_cpu_data->nsec_per_cyc) \ + >> IA64_NSEC_PER_CYC_SHIFT) + #endif /* _ASM_IA64_TIMEX_H */ Index: linux-2.6.21-rc6/init/Kconfig =================================================================== --- linux-2.6.21-rc6.orig/init/Kconfig 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/init/Kconfig 2007-04-13 22:19:08.000000000 -0700 @@ -216,6 +216,15 @@ config TASK_IO_ACCOUNTING Say N if unsure. +config PERFCOUNT + bool "Time Stamp Counter based performance measurements" + help + Enables performance counters based on the time stamp counters. + These can be used to measure code paths in the kernel and also + gauge their effectiveness in transferring bytes. The performance + counters must be added by modifying code. The counters will then + be visible via files in /proc/perf/*. + config UTS_NS bool "UTS Namespaces" default n Index: linux-2.6.21-rc6/kernel/timer.c =================================================================== --- linux-2.6.21-rc6.orig/kernel/timer.c 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/kernel/timer.c 2007-04-13 22:19:08.000000000 -0700 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -1207,6 +1208,7 @@ void update_process_times(int user_tick) { struct task_struct *p = current; int cpu = smp_processor_id(); + INIT_PC(pc, PC_UPDATE_PROCESS_TIMES); /* Note: this timer irq context must be accounted for as well. */ if (user_tick) @@ -1218,6 +1220,7 @@ void update_process_times(int user_tick) rcu_check_callbacks(cpu, user_tick); scheduler_tick(); run_posix_cpu_timers(p); + pc_stop(&pc); } /* Index: linux-2.6.21-rc6/include/asm-i386/timex.h =================================================================== --- linux-2.6.21-rc6.orig/include/asm-i386/timex.h 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/include/asm-i386/timex.h 2007-04-13 22:19:08.000000000 -0700 @@ -19,4 +19,12 @@ extern int read_current_timer(unsigned long *timer_value); #define ARCH_HAS_READ_CURRENT_TIMER 1 +extern long cyc2ns_scale; +#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ + +static inline unsigned long long cycles_to_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; +} + #endif Index: linux-2.6.21-rc6/mm/page_alloc.c =================================================================== --- linux-2.6.21-rc6.orig/mm/page_alloc.c 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/mm/page_alloc.c 2007-04-13 22:19:08.000000000 -0700 @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -1238,6 +1239,7 @@ struct page * fastcall __alloc_pages(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist) { + INIT_PC(pc, PC_ALLOC_PAGES); const gfp_t wait = gfp_mask & __GFP_WAIT; struct zone **z; struct page *page; @@ -1397,6 +1399,7 @@ nopage: show_mem(); } got_pg: + pc_bytes(&pc, PAGE_SIZE << order); return page; } Index: linux-2.6.21-rc6/mm/vmscan.c =================================================================== --- linux-2.6.21-rc6.orig/mm/vmscan.c 2007-04-13 22:18:10.000000000 -0700 +++ linux-2.6.21-rc6/mm/vmscan.c 2007-04-13 22:19:08.000000000 -0700 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1017,6 +1018,8 @@ static unsigned long shrink_zones(int pr */ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { + INIT_PC(pc_scanned, PC_DIRECT_RECLAIM_SCANNED); + INIT_PC(pc_reclaimed, PC_DIRECT_RECLAIM_RECLAIMED); int priority; int ret = 0; unsigned long total_scanned = 0; @@ -1098,6 +1101,8 @@ out: zone->prev_priority = priority; } + pc_bytes(&pc_scanned, PAGE_SIZE * sc.nr_scanned); + pc_bytes(&pc_reclaimed, PAGE_SIZE * nr_reclaimed); return ret; } @@ -1124,6 +1129,8 @@ out: */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { + INIT_PC(pc_reclaim_scanned, PC_RECLAIM_SCANNED); + INIT_PC(pc_reclaim_reclaimed, PC_RECLAIM_RECLAIMED); int all_zones_ok; int priority; int i; @@ -1273,6 +1280,8 @@ out: goto loop_again; } + pc_bytes(&pc_reclaim_scanned, total_scanned * PAGE_SIZE); + pc_bytes(&pc_reclaim_reclaimed, nr_reclaimed * PAGE_SIZE); return nr_reclaimed; }