commit d2b2ea81b693098865444f199ec1e61e75747654
Merge: 29e3e17... 3f46faa...
Author: Arnd Bergmann
Date:   Fri Jul 6 19:25:32 2007 +0200

    Merge branches 'netdev-merge' and 'perfmon2' into perfmon2-merge

commit 29e3e17760cc6add6c95aaeb76d11893c9f7229a
Merge: 2f18e1c... fb8e39e...
Author: Arnd Bergmann
Date:   Fri Jul 6 19:18:10 2007 +0200

    Merge branch 'netdev-merge' into perfmon2-merge

    Conflicts:
        arch/powerpc/Kconfig
        include/asm-powerpc/systbl.h
        include/asm-powerpc/unistd.h

commit 2f18e1caf74987f749eba3af99dc39e3e07f0e85
Author: Kevin Corry
Date:   Fri Jun 29 08:56:00 2007 -0500

    Perfmon2: Add support for Cell PMU's hardware-sampling.

commit fa0876715a1a02a5170e023ebb39e6cb68abacf7
Author: Stephane Eranian
Date:   Sun Jun 24 16:22:04 2007 -0500

    Update to Perfmon2 version 070621: powerpc code.

commit e332e9217b88f537cbf8b882398592f54c557b97
Author: Stephane Eranian
Date:   Sun Jun 24 15:26:17 2007 -0500

    Update to Perfmon2 version 070621: x86-64 code.

commit 9e5443f91fc5dd66469ab09331b5d2bee06bb1db
Author: Stephane Eranian
Date:   Sun Jun 24 15:10:58 2007 -0500

    Update to Perfmon2 version 070621: mips code.

commit f06e44db987b70306896ff5cdfd84023b224949b
Author: Stephane Eranian
Date:   Sat Jun 23 17:22:04 2007 -0500

    Update to Perfmon2 version 070621: ia64 code.

commit fb9f0a0e62583e51f242b19a817629e75e5961c0
Author: Kevin Corry
Date:   Sat Jun 23 17:19:55 2007 -0500

    Update to Perfmon2 version 070621: i386 code.

commit 8c649c0b5e0a210015b1eb45ec82d87b76e7c9d9
Author: Stephane Eranian
Date:   Sat Jun 23 17:06:31 2007 -0500

    Update to Perfmon2 version 070621: core Perfmon2 code.

commit 68971f94498ea868c6a3e6d6351ff7120edf14ad
Merge: 90ec31a... 0864a4e...
Author: Kevin Corry
Date:   Fri Jun 22 09:09:16 2007 -0500

    Merge with git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

commit 90ec31acbcfa76df7ab5c7c98a0b9d6b3e7ec1ce
Author: Kevin Corry
Date:   Thu Jun 21 11:57:54 2007 -0500

    Add TIF_PERFMON_WORK and TIF_PERFMON_CTXSW flags.

    This patch has already been picked up in the powerpc tree, but not yet in
    mainline. This patch is required for the mainline-based Perfmon2 tree to
    compile on powerpc.

commit 483a8ce2532afb62378abfe6758ba833be719a63
Author: Stephane Eranian
Date:   Thu Jun 21 11:28:50 2007 -0500

    Perfmon2: powerpc modifications.

    Modifications to powerpc architecture code needed for Perfmon2.

commit b1a5a6fc3dcb6bff4f19377785a21692fc47620f
Author: Stephane Eranian
Date:   Wed Jun 20 16:12:37 2007 -0500

    Perfmon2: new powerpc architecture code

commit c4db69f1389171c8977534c8944d7128dee794d3
Author: Stephane Eranian
Date:   Wed Jun 20 14:45:03 2007 -0500

    Perfmon2: x86-64 modifications.

    Modifications to x86-64 architecture code needed for Perfmon2.

commit 77323002a9ffd5367f7d0288619733a15dd403ef
Author: Stephane Eranian
Date:   Wed Jun 20 15:17:55 2007 -0500

    Perfmon2: new x86-64 architecture code

commit ba8ce1c95c79fafe561bafeff3c5e480a9104d1a
Author: Stephane Eranian
Date:   Wed Jun 20 14:28:23 2007 -0500

    Perfmon2: mips modifications.

    Modifications to mips architecture code needed for Perfmon2.

commit 0af167473835e52268d7f5c004e2ec9463a357f4
Author: Stephane Eranian
Date:   Wed Jun 20 15:15:41 2007 -0500

    Perfmon2: new mips architecture code

commit 884b8c725e227901f0a548b55803a01cc6f156f6
Author: Stephane Eranian
Date:   Wed Jun 20 14:11:17 2007 -0500

    Perfmon2: ia64 modifications.

    Modifications to ia64 architecture code needed for Perfmon2.
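The user-visible surface of this series is a small set of pfm_* system calls, wired up for i386 in the syscall_table.S hunk further down (table entries 324 through 335). The fragment below sketches how a self-monitoring thread would drive that interface; it is illustrative only. The syscall numbers are read off the comments in that hunk, while the pfarg_* argument structures, the perfmon/perfmon.h header location, and the trailing sampling-format arguments of pfm_create_context() come from the perfmon2 user ABI and are assumptions, not part of this excerpt.

/*
 * Illustrative sketch only, not part of the patch.  Syscall numbers are
 * derived from the i386 syscall_table.S hunk (sys_pfm_write_pmcs carries
 * the "325" comment, so sys_pfm_create_context is 324).  The pfarg_*
 * types and the header below are assumed from the perfmon2 user ABI.
 */
#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <perfmon/perfmon.h>	/* assumed: perfmon2 user headers providing pfarg_* types */

#define __NR_pfm_create_context	324
#define __NR_pfm_write_pmcs	325
#define __NR_pfm_write_pmds	326
#define __NR_pfm_load_context	328
#define __NR_pfm_start		329
#define __NR_pfm_stop		330

int count_self(void)
{
	pfarg_ctx_t ctx;
	pfarg_pmc_t pc[1];
	pfarg_pmd_t pd[1];
	pfarg_load_t load;
	int fd;

	memset(&ctx, 0, sizeof(ctx));
	memset(pc, 0, sizeof(pc));
	memset(pd, 0, sizeof(pd));
	memset(&load, 0, sizeof(load));

	/* create a monitoring context; returns a file descriptor
	 * (sampling-format arguments omitted/assumed here) */
	fd = syscall(__NR_pfm_create_context, &ctx, NULL, NULL, 0);
	if (fd < 0)
		return -1;

	/* program one config register and its counter; the actual
	 * register numbers and event encodings are PMU-specific */
	pc[0].reg_num = 0;
	pd[0].reg_num = 0;
	syscall(__NR_pfm_write_pmcs, fd, pc, 1);
	syscall(__NR_pfm_write_pmds, fd, pd, 1);

	/* attach the context to this thread, start, run the workload, stop */
	load.load_pid = getpid();
	syscall(__NR_pfm_load_context, fd, &load);
	syscall(__NR_pfm_start, fd, NULL);
	/* ... monitored work ... */
	syscall(__NR_pfm_stop, fd);
	return fd;
}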
commit af04b61bf1b16df0a1f2c96d2b02b12bf1c39cdd
Author: Stephane Eranian
Date:   Wed Jun 20 15:14:10 2007 -0500

    Perfmon2: new ia64 architecture code

commit ff7f2d460449a9167c0389ab19d12be2c927aebd
Author: Stephane Eranian
Date:   Thu Jun 21 11:25:24 2007 -0500

    Perfmon2: i386 modifications.

    Modifications to i386 architecture code needed for Perfmon2.

commit dedfc8b61584483c2511c7afba39bed6159ecee2
Author: Stephane Eranian
Date:   Wed Jun 20 15:06:45 2007 -0500

    Perfmon2: new i386 architecture code

commit 9e4067e8ce29fcec3e32a1d89339ada5904eaa7b
Author: Stephane Eranian
Date:   Wed Jun 20 16:04:03 2007 -0500

    Perfmon2 arch-independent modifications.

    Modifications to architecture-independent code needed for Perfmon2.

commit 94b60799f0c2e4a1f320ab9025ccfadc2e57d6be
Author: Stephane Eranian
Date:   Thu Jun 21 11:23:22 2007 -0500

    Perfmon2: new core code

Index: linux-2.6/Makefile
===================================================================
--- linux-2.6.orig/Makefile
+++ linux-2.6/Makefile
@@ -553,7 +553,7 @@ export mod_strip_cmd
 ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ perfmon/
 vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
Index: linux-2.6/arch/i386/Kconfig
===================================================================
--- linux-2.6.orig/arch/i386/Kconfig
+++ linux-2.6/arch/i386/Kconfig
@@ -910,6 +910,8 @@ config COMPAT_VDSO
 	  If unsure, say Y.
+source "arch/i386/perfmon/Kconfig"
+
 endmenu
 config ARCH_ENABLE_MEMORY_HOTPLUG
Index: linux-2.6/arch/i386/Makefile
===================================================================
--- linux-2.6.orig/arch/i386/Makefile
+++ linux-2.6/arch/i386/Makefile
@@ -99,6 +99,7 @@ mflags-y += -Iinclude/asm-i386/mach-defa
 head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 libs-y += arch/i386/lib/
+core-$(CONFIG_PERFMON) += arch/i386/perfmon/
 core-y += arch/i386/kernel/ \
 	   arch/i386/mm/ \
 	   arch/i386/$(mcore-y)/ \
Index: linux-2.6/arch/i386/kernel/apic.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/apic.c
+++ linux-2.6/arch/i386/kernel/apic.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -562,6 +563,8 @@ static void local_apic_timer_interrupt(v
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
 	evt->event_handler(evt);
+
+	pfm_handle_switch_timeout();
 }
 /*
@@ -1325,6 +1328,9 @@ void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+#ifdef CONFIG_PERFMON
+	set_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
 }
 /**
Index: linux-2.6/arch/i386/kernel/cpu/common.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/cpu/common.c
+++ linux-2.6/arch/i386/kernel/cpu/common.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -718,6 +719,8 @@ void __cpuinit cpu_init(void)
 	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
+
+	pfm_init_percpu();
 }
 #ifdef CONFIG_HOTPLUG_CPU
Index: linux-2.6/arch/i386/kernel/entry.S
===================================================================
--- linux-2.6.orig/arch/i386/kernel/entry.S
+++ linux-2.6/arch/i386/kernel/entry.S
@@ -465,7 +465,7 @@ ENDPROC(system_call)
 	ALIGN
 	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
+	testw
$(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx jz work_notifysig work_resched: call schedule Index: linux-2.6/arch/i386/kernel/process.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/process.c +++ linux-2.6/arch/i386/kernel/process.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,7 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } + pfm_exit_thread(current); } void flush_thread(void) @@ -435,6 +437,8 @@ int copy_thread(int nr, unsigned long cl savesegment(gs,p->thread.gs); + pfm_copy_thread(p); + tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -538,8 +542,9 @@ int dump_task_regs(struct task_struct *t return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +static noinline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -555,6 +560,10 @@ static noinline void __switch_to_xtra(st set_debugreg(next->debugreg[7], 7); } + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev_p, next_p); + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -690,8 +699,8 @@ struct task_struct fastcall * __switch_t * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); + || (task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW))) + __switch_to_xtra(prev_p, next_p, tss); disable_tsc(prev_p, next_p); Index: linux-2.6/arch/i386/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/signal.c +++ linux-2.6/arch/i386/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -652,6 +653,9 @@ void do_notify_resume(struct pt_regs *re clear_thread_flag(TIF_SINGLESTEP); } + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/i386/kernel/smpboot.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/smpboot.c +++ linux-2.6/arch/i386/kernel/smpboot.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -1207,6 +1208,7 @@ int __cpu_disable(void) cpu_clear(cpu, map); fixup_irqs(map); + pfm_cpu_disable(); /* It's now safe to remove this processor from the online map */ cpu_clear(cpu, cpu_online_map); return 0; Index: linux-2.6/arch/i386/kernel/syscall_table.S =================================================================== --- linux-2.6.orig/arch/i386/kernel/syscall_table.S +++ linux-2.6/arch/i386/kernel/syscall_table.S @@ -323,3 +323,15 @@ ENTRY(sys_call_table) .long sys_signalfd .long sys_timerfd .long sys_eventfd + .long sys_pfm_create_context + .long sys_pfm_write_pmcs /* 325 */ + .long sys_pfm_write_pmds + .long sys_pfm_read_pmds + .long sys_pfm_load_context + .long sys_pfm_start + .long sys_pfm_stop /* 330 */ + .long sys_pfm_restart + .long sys_pfm_create_evtsets + .long sys_pfm_getinfo_evtsets + .long sys_pfm_delete_evtsets + .long 
sys_pfm_unload_context /* 335 */ Index: linux-2.6/arch/i386/oprofile/Makefile =================================================================== --- linux-2.6.orig/arch/i386/oprofile/Makefile +++ linux-2.6/arch/i386/oprofile/Makefile @@ -10,3 +10,4 @@ oprofile-y := $(DRIVER_OBJS) init.o b oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o +oprofile-$(CONFIG_PERFMON) += perfmon.o Index: linux-2.6/arch/i386/oprofile/init.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/init.c +++ linux-2.6/arch/i386/oprofile/init.c @@ -15,9 +15,11 @@ * with the NMI mode driver. */ +extern int op_perfmon_init(struct oprofile_operations * ops); extern int op_nmi_init(struct oprofile_operations * ops); extern int op_nmi_timer_init(struct oprofile_operations * ops); extern void op_nmi_exit(void); +extern void op_perfmon_exit(void); extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); @@ -27,8 +29,12 @@ int __init oprofile_arch_init(struct opr ret = -ENODEV; +#ifdef CONFIG_PERFMON + ret = op_perfmon_init(ops); +#endif #ifdef CONFIG_X86_LOCAL_APIC - ret = op_nmi_init(ops); + if (ret < 0) + ret = op_nmi_init(ops); #endif #ifdef CONFIG_X86_IO_APIC if (ret < 0) @@ -42,6 +48,9 @@ int __init oprofile_arch_init(struct opr void oprofile_arch_exit(void) { +#ifdef CONFIG_PERFMON + op_perfmon_exit(); +#endif #ifdef CONFIG_X86_LOCAL_APIC op_nmi_exit(); #endif Index: linux-2.6/arch/i386/oprofile/nmi_int.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/nmi_int.c +++ linux-2.6/arch/i386/oprofile/nmi_int.c @@ -465,6 +465,7 @@ int __init op_nmi_init(struct oprofile_o ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + ops->implementation = "oprofile"; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } Index: linux-2.6/arch/i386/oprofile/nmi_timer_int.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/nmi_timer_int.c +++ linux-2.6/arch/i386/oprofile/nmi_timer_int.c @@ -64,6 +64,7 @@ int __init op_nmi_timer_init(struct opro ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; + ops->implementation = "nmi_timer"; printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); return 0; } Index: linux-2.6/arch/i386/oprofile/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/oprofile/perfmon.c @@ -0,0 +1,161 @@ +/** + * @file perfmon.c + * + * @remark Copyright 2003 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#include +#include +#include +#include + +static int allow_ints; + +static int +perfmon_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data) +{ + struct pt_regs * const regs = data; + int event = arg->pmd_eventid; + + PFM_DBG_ovfl("oprofile overflow ip=%lx, event=%d", + instruction_pointer(regs), event); + + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + /* the owner of the oprofile event buffer may have exited + * without perfmon being shutdown (e.g. 
SIGSEGV) + */ + if (allow_ints) + oprofile_add_sample(regs, event); + return 0; +} + + +static int perfmon_start(void) +{ + allow_ints = 1; + return 0; +} + + +static void perfmon_stop(void) +{ + allow_ints = 0; +} + +static struct pfm_smpl_fmt oprofile_fmt = { + .fmt_name = "OProfile", + .fmt_handler = perfmon_handler, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +/* all the ops are handled via userspace for i386 oprofile using perfmon */ + +static int using_perfmon; + +static int __init ppro_init(char ** cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model == 14) + *cpu_type = "i386/core"; + else if (cpu_model == 15) + *cpu_type = "i386/core_2"; + else if (cpu_model > 0xd) + return 0; + else if (cpu_model == 9) { + *cpu_type = "i386/p6_mobile"; + } else if (cpu_model > 5) { + *cpu_type = "i386/piii"; + } else if (cpu_model > 2) { + *cpu_type = "i386/pii"; + } else { + *cpu_type = "i386/ppro"; + } + return 1; +} + +static int __init p4_init(char ** cpu_type) +{ +#ifndef CONFIG_SMP + *cpu_type = "i386/p4"; + return 1; +#else + switch (smp_num_siblings) { + case 1: + *cpu_type = "i386/p4"; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + return 1; + } +#endif + return 0; +} + +static char *get_cpu_type(void) +{ + char *cpu_type = "??/??"; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + switch (boot_cpu_data.x86) { + case 6: + cpu_type = "i386/athlon"; + break; + case 0xf: + /* Actually it could be i386/hammer too, but give + user space a consistent name. */ + cpu_type = "x86-64/hammer"; + break; + } + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + /* Pentium IV */ + case 0xf: + p4_init(&cpu_type); + break; + + /* A P6-class processor */ + case 6: + ppro_init(&cpu_type); + break; + } + break; + } + return cpu_type; +} + + +int __init op_perfmon_init(struct oprofile_operations * ops) +{ + int ret = pfm_fmt_register(&oprofile_fmt); + if (ret) + return -ENODEV; + + ops->cpu_type = get_cpu_type(); + ops->start = perfmon_start; + ops->stop = perfmon_stop; + ops->implementation = "perfmon2"; + using_perfmon = 1; + printk(KERN_INFO "oprofile: using perfmon.\n"); + return 0; +} + + +void __exit op_perfmon_exit(void) +{ + if (!using_perfmon) + return; + + pfm_fmt_unregister(&oprofile_fmt); +} + Index: linux-2.6/arch/i386/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/Kconfig @@ -0,0 +1,65 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + select X86_LOCAL_APIC + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_P6 + tristate "Support for Intel P6/Pentium M processor hardware performance counters" + depends on PERFMON + default n + help + Enables support for Intel P6-style hardware performance counters. + To be used for with Intel Pentium III, PentiumPro, Pentium M processors. + +config I386_PERFMON_P4 + tristate "Support for 32-bit Intel Pentium 4/Xeon hardware performance counters" + depends on PERFMON + default n + help + Enables support for 32-bit Intel Pentium 4/Xeon hardware performance + counters. 
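A note on the sampling-format hook used by the oprofile glue above: pfm_fmt_register() and pfm_fmt_unregister() take a struct pfm_smpl_fmt whose fmt_handler runs on every counter overflow. Below is a minimal sketch of another built-in format using only the fields and handler signature that appear in arch/i386/oprofile/perfmon.c; the "toy" name, the counting logic, and the linux/perfmon.h include path are illustrative assumptions, not part of the patch.

/* Minimal sketch of a sampling format modeled on the oprofile glue above.
 * Field names and the handler signature mirror arch/i386/oprofile/perfmon.c;
 * the format name and the counting logic are illustrative only. */
#include <linux/module.h>
#include <linux/types.h>
#include <asm/atomic.h>
#include <linux/perfmon.h>	/* assumed location of struct pfm_smpl_fmt */

static atomic_t toy_overflows = ATOMIC_INIT(0);

static int toy_handler(void *buf, struct pfm_ovfl_arg *arg,
		       unsigned long ip, u64 stamp, void *data)
{
	atomic_inc(&toy_overflows);		/* just count overflows */
	arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;	/* re-arm the counter, as the oprofile handler does */
	return 0;
}

static struct pfm_smpl_fmt toy_fmt = {
	.fmt_name    = "toy",
	.fmt_handler = toy_handler,
	.fmt_flags   = PFM_FMT_BUILTIN_FLAG,
	.owner       = THIS_MODULE,
};

static int __init toy_fmt_init(void)
{
	return pfm_fmt_register(&toy_fmt);
}

static void __exit toy_fmt_exit(void)
{
	pfm_fmt_unregister(&toy_fmt);
}

module_init(toy_fmt_init);
module_exit(toy_fmt_exit);
MODULE_LICENSE("GPL");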
+
+config I386_PERFMON_PEBS
+	tristate "Support for Intel Precise Event-Based Sampling (PEBS)"
+	depends on PERFMON
+	default n
+	help
+	Enables support for 32-bit Precise Event-Based Sampling (PEBS) on the Intel
+	Pentium 4, Xeon, and Core-based processors which support it.
+
+config I386_PERFMON_CORE
+	tristate "Support for Intel Core-based performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables 32-bit support for Intel Core-based performance counters. Enable
+	this option to support Intel Core 2 Duo processors.
+
+config I386_PERFMON_INTEL_ARCH
+	tristate "Support for Intel architectural performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables 32-bit support for Intel architectural performance counters. This
+	architecture was introduced by Intel Core Solo/Core Duo processors.
+
+config I386_PERFMON_K8
+	tristate "Support 32-bit mode AMD Athlon64/Opteron64 hardware performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables support for 32-bit mode AMD Athlon64/Opteron64 hardware performance counters.
+endmenu
+
Index: linux-2.6/arch/i386/perfmon/Makefile
===================================================================
--- /dev/null
+++ linux-2.6/arch/i386/perfmon/Makefile
@@ -0,0 +1,14 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_PERFMON_P6) += perfmon_p6.o
+obj-$(CONFIG_I386_PERFMON_P4) += perfmon_p4.o
+obj-$(CONFIG_I386_PERFMON_CORE) += perfmon_core.o
+obj-$(CONFIG_I386_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
+obj-$(CONFIG_I386_PERFMON_PEBS) += perfmon_pebs_smpl.o
+obj-$(CONFIG_I386_PERFMON_K8) += perfmon_k8.o
+
+perfmon_k8-$(subst m,y,$(CONFIG_I386_PERFMON_K8)) += ../../x86_64/perfmon/perfmon_k8.o
+perfmon_core-$(subst m,y,$(CONFIG_I386_PERFMON_CORE)) += ../../x86_64/perfmon/perfmon_core.o
Index: linux-2.6/arch/i386/perfmon/perfmon.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/i386/perfmon/perfmon.c
@@ -0,0 +1,1302 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +#include +#include +#include + +DEFINE_PER_CPU(unsigned long, real_iip); + +static int pfm_using_nmi; + +struct pfm_ds_area { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +static int (*pfm_has_ovfl)(struct pfm_context *); +static int (*pfm_stop_save)(struct pfm_context *ctx, + struct pfm_event_set *set); + +static inline int get_smt_id(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); +#else + return 0; +#endif +} + +void __pfm_write_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 val) +{ + u64 pmi; + int smt_id; + + smt_id = get_smt_id(); + /* + * HT is only supported by P4-style PMU + * + * Adjust for T1 if necessary: + * + * - move the T0_OS/T0_USR bits into T1 slots + * - move the OVF_PMI_T0 bits into T1 slot + * + * The P4/EM64T T1 is cleared by description table. + * User only works with T0. + */ + if (smt_id) { + if (xreg->reg_type & PFM_REGT_ESCR) { + + /* copy T0_USR & T0_OS to T1 */ + val |= ((val & 0xc) >> 2); + + /* clear bits T0_USR & T0_OS */ + val &= ~0xc; + + } else if (xreg->reg_type & PFM_REGT_CCCR) { + pmi = (val >> 26) & 0x1; + if (pmi) { + val &=~(1UL<<26); + val |= 1UL<<27; + } + } + } + if (xreg->addrs[smt_id]) + wrmsrl(xreg->addrs[smt_id], val); +} + +void __pfm_read_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 *val) +{ + int smt_id; + + smt_id = get_smt_id(); + + if (likely(xreg->addrs[smt_id])) { + rdmsrl(xreg->addrs[smt_id], *val); + /* + * HT is only supported by P4-style PMU + * + * move the Tx_OS and Tx_USR bits into + * T0 slots setting the T1 slots to zero + */ + if (xreg->reg_type & PFM_REGT_ESCR) { + if (smt_id) + *val |= (((*val) & 0x3) << 2); + + /* + * zero out bits that are reserved + * (including T1_OS and T1_USR) + */ + *val &= PFM_ESCR_RSVD; + } + } else + *val = 0; +} + +/* + * called from NMI interrupt handler + */ +static void __kprobes __pfm_arch_quiesce_pmu_percpu(void) +{ + struct pfm_arch_pmu_info *arch_info; + unsigned int i; + + arch_info = pfm_pmu_conf->arch_info; + + /* + * quiesce PMU by clearing registers that have enable bits + * (start/stop capabilities). + */ + for (i = 0; i < arch_info->max_ena; i++) + if (test_bit(i, cast_ulp(arch_info->enable_mask))) + pfm_arch_write_pmc(NULL, i, 0); +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler(). + * ctx may be NULL for spurious interrupts. + * interrupts are masked. + */ +void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + if (ctx == NULL) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + pfm_arch_restore_pmcs(ctx, ctx->active_set); + + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * Caller has already restored all PMD and PMC registers. 
+ * + * must reactivate monitoring + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * reload DS management area pointer. Pointer + * not managed as a PMC thus it is not restored + * with the rest of the registers. + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); + + if (set->npend_ovfls) + __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; +} + +static int pfm_stop_save_p6(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_mask; + u64 val, wmask, ovfl_mask; + u32 i, count; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), pfm_pmu_conf->regs.max_pmc); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) { + __get_cpu_var(pfm_stats).ccnt6++; + return 1; + } + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * Must check for counting PMDs because of virtual PMDs + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +static int pfm_stop_save_amd64(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + return pfm_stop_save_p6(ctx, set); +} + +static int pfm_stop_save_intel_core(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + struct pfm_ds_area *ds; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_mask; + u64 val, wmask, ovfl_mask; + u32 i, count; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * if PEBS used, clear DS area pointer + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, 0); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * used enable pmc bitmask + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), arch_info->max_ena); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. 
+ */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * Must check for counting PMDs because of virtual PMDs + * + * XXX: should use the ovf_status register instead, yet + * we would have to check if NMI is used and fallback + * to individual pmd inspection. + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + + /* + * check for PEBS buffer full and set the corresponding PMD overflow + */ + if (ctx_arch->flags & PFM_X86_USE_PEBS) { + + ds = (struct pfm_ds_area *)ctx_arch->ds_area; + + PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", + ds, + ds->pebs_index, + ds->pebs_intr_thres); + + if (ds->pebs_index >= ds->pebs_intr_thres + && test_bit(arch_info->pebs_ctr_idx, + cast_ulp(set->used_pmds))) { + __set_bit(arch_info->pebs_ctr_idx, + cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +static int pfm_stop_save_p4(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + struct pfm_arch_ext_reg *xrc, *xrd; + u64 used_mask[PFM_PMC_BV]; + u32 i, j, count; + u16 max_pmc; + u64 cccr, ctr1, ctr2, ovfl_mask; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + ctx_arch = pfm_ctx_arch(ctx); + max_pmc = pfm_pmu_conf->regs.max_pmc; + xrc = arch_info->pmc_addrs; + xrd = arch_info->pmd_addrs; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + /* + * build used enable PMC bitmask + * if user did not set any CCCR, then mask is + * empty and there is nothing to do because nothing + * was started + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), arch_info->max_ena); + + PFM_DBG_ovfl("npend=%u ena_mask=0x%llx u_pmcs=0x%llx count=%u num=%u", + set->npend_ovfls, + (unsigned long long)arch_info->enable_mask[0], + (unsigned long long)set->used_pmcs[0], + count, arch_info->max_ena); + /* + * stop clear DS area pointer + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, 0); + + /* + * ensures we do not destroy pending overflow + * information. If pended interrupts are already + * known, then we just stop monitoring. + */ + if (set->npend_ovfls) { + /* + * clear enable bit + * unfortunately, this is very expensive! + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + __pfm_write_reg_p4(xrc+i, 0); + count--; + } + } + /* need save PMDs at upper level */ + return 1; + } + + /* + * stop monitoring AND collect pending overflow information AND + * save pmds. + * + * We need to access the CCCR twice, once to get overflow info + * and a second to stop monitoring (which destroys the OVF flag) + * Similarly, we need to read the counter twice to check whether + * it did overflow between the CCR read and the CCCR write. 
+ */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + /* + * controlled counter + */ + j = xrc[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(xrc+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(xrd+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(xrc+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(xrd+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. + */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + ctr2 = (set->pmds[j].value & ~ovfl_mask) | (ctr2 & ovfl_mask); + set->pmds[j].value = ctr2; + count--; + } + } + /* + * check for PEBS buffer full and set the corresponding PMD overflow + */ + if (ctx_arch->flags & PFM_X86_USE_PEBS) { + struct pfm_ds_area *ds; + ds = (struct pfm_ds_area *)ctx_arch->ds_area; + PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", ds, ds->pebs_index, ds->pebs_intr_thres); + if (ds->pebs_index >= ds->pebs_intr_thres + && test_bit(arch_info->pebs_ctr_idx, cast_ulp(set->used_pmds))) { + __set_bit(arch_info->pebs_ctr_idx, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + + /* 0 means: no need to save the PMD at higher level */ + return 0; +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + /* + * on x86, masked is equivalent to stopped, thus we have + * nothing to do here + */ + if (task == current) + pfm_stop_save(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * Must stop monitoring, save pending overflow information + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * disable lazy restore of PMCS on ctxswin because + * we modify some of them. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + if (set->npend_ovfls) { + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + ctx_arch->saved_real_iip = __get_cpu_var(real_iip); + } + return pfm_stop_save(ctx, set); +} + +/* + * called from pfm_start() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 *mask; + u16 i, num; + + /* + * pfm_start issue while context is masked as no effect. + * This comes from the fact that on x86, masking and stopping + * use the same mechanism, i.e., clearing the enable bits + * of the PMC registers. + */ + if (ctx->state == PFM_CTX_MASKED) + return; + + /* + * cannot restore PMC if no access to PMU. Will be done + * when the thread is switched back in + */ + if (task != current) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * we must actually install all implemented pmcs registers because + * until started, we do not write any PMC registers. + * Note that registers used by other subsystems (e.g. NMI) are + * removed from pmcs. + * + * The available registers that are actually not used get their default + * value such that counters do not count anything. As such, we can + * afford to write all of them but then stop only the one we use. + * + * XXX: we may be able to optimize this for non-P4 PMU has pmcs are + * independent from each others. + */ + num = pfm_pmu_conf->regs.num_pmcs; + mask = pfm_pmu_conf->regs.pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } + + /* + * reload DS area pointer. + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); + +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw() + * + * context is locked. Interrupts are masked. Set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore PMD registers + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *used_pmds; + u16 i, num; + + used_pmds = set->used_pmds; + num = set->nused_pmds; + + /* + * we can restore only the PMD we use because: + * - you can only read with pfm_read_pmds() the registers + * declared used via pfm_write_pmds(), smpl_pmds, reset_pmds + * + * - if cr4.pce=1, only counters are exposed to user. No + * address is ever exposed by counters. + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, cast_ulp(used_pmds)))) { + pfm_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 *mask; + u16 i, num; + + /* + * - by default, no PMC measures anything + * - on ctxswout, all used PMCs are disabled (cccr cleared) + * + * we need to restore the PMC (incl enable bits) only if + * not masked and user issued pfm_start() + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * In general, writing MSRs is very expensive, so try to be smart. + * + * P6-style, Core-style: + * - pmc are totally independent of each other, there is + * possible side-effect from stale pmcs. 
Therefore we only + * restore the registers we use + * P4-style: + * - must restore everything because there are some dependencies + * (e.g., ESCR and CCCR) + */ + if (arch_info->pmu_style == PFM_X86_PMU_P4) { + num = pfm_pmu_conf->regs.num_pmcs; + mask = pfm_pmu_conf->regs.pmcs; + } else { + num = set->nused_pmcs; + mask = set->used_pmcs; + } + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } +} + +/* + * invoked only when NMI is used. Called from the LOCAL_PERFMON_VECTOR + * handler to copy P4 overflow state captured when the NMI triggered. + * Given that on P4, stopping monitoring destroy the overflow information + * we save it in pfm_has_ovfl_p4() where monitoring is also stopped. + * + * Here we propagate the overflow state to current active set. The + * freeze_pmu() call we not overwrite this state because npend_ovfls + * is non-zero. + */ +static void pfm_p4_copy_nmi_state(void) +{ + struct pfm_context *ctx; + struct pfm_arch_context *ctx_arch; + struct pfm_event_set *set; + + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) + return; + + ctx_arch = pfm_ctx_arch(ctx); + set = ctx->active_set; + + if (ctx_arch->p4->npend_ovfls) { + set->npend_ovfls = ctx_arch->p4->npend_ovfls; + + bitmap_copy(cast_ulp(set->povfl_pmds), + cast_ulp(ctx_arch->p4->povfl_pmds), + pfm_pmu_conf->regs.max_pmd); + + ctx_arch->p4->npend_ovfls = 0; + } +} + +/* + * The PMU interrupt is handled through an interrupt gate, therefore + * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts. + * + * The perfmon interrupt handler MUST run with interrupts disabled due + * to possible race with other, higher priority interrupts, such as timer + * or IPI function calls. + * + * See description in IA-32 architecture manual, Vol 3 section 5.8.1 + */ +fastcall void smp_pmu_interrupt(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *arch_info; + unsigned long iip; + + ack_APIC_irq(); + + irq_enter(); + + /* + * when using NMI, pfm_handle_nmi() gets called + * first. It stops monitoring and record the + * iip into real_iip, then it repost the interrupt + * using the lower priority vector LOCAL_PERFMON_VECTOR + * + * On P4, due to the difficulty of detecting overflows + * and stoppping the PMU, pfm_handle_nmi() needs to + * record npend_ovfl and ovfl_pmds in ctx_arch. So + * here we simply copy them back to the set. + */ + if (pfm_using_nmi) { + arch_info = pfm_pmu_conf->arch_info; + iip = __get_cpu_var(real_iip); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + pfm_p4_copy_nmi_state(); + } else + iip = instruction_pointer(regs); + + pfm_interrupt_handler(iip, regs); + + /* + * On Intel P6, Pentium M, P4, Intel Core: + * - it is necessary to clear the MASK field for the LVTPC + * vector. Otherwise interrupts remain masked. See + * section 8.5.1 + * AMD X86-64: + * - the documentation does not stipulate the behavior. + * To be safe, we also rewrite the vector to clear the + * mask field + */ + if (cpu_data->x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR); + + irq_exit(); +} + +/* + * detect is counters have overflowed. 
+ * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by AMD K8 and Intel architectural PMU + */ +static int __kprobes pfm_has_ovfl_p6(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_ext_reg *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + num = pfm_pmu_conf->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = arch_info->pmd_addrs; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(xrd[i].addrs[0], val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int __kprobes pfm_has_ovfl_amd64(struct pfm_context *ctx) +{ + return pfm_has_ovfl_p6(ctx); +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel P4 + */ +static int __kprobes pfm_has_ovfl_p4(struct pfm_context *ctx) +{ + struct pfm_arch_ext_reg *xrc, *xrd; + struct pfm_arch_context *ctx_arch; + struct pfm_arch_p4_context *p4; + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 ena_mask[PFM_PMC_BV]; + u64 cccr, ctr1, ctr2; + int n, i, j; + + ctx_arch = pfm_ctx_arch(ctx); + xrc = arch_info->pmc_addrs; + xrd = arch_info->pmd_addrs; + p4 = ctx_arch->p4; + + bitmap_and(cast_ulp(ena_mask), + cast_ulp(pfm_pmu_conf->regs.pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + n = bitmap_weight(cast_ulp(ena_mask), arch_info->max_ena); + + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + /* + * controlled counter + */ + j = xrc[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(xrc+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(xrd+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(xrc+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(xrd+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. + */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(ctx_arch->p4->povfl_pmds)); + ctx_arch->p4->npend_ovfls++; + } + p4->saved_cccrs[i] = cccr; + n--; + } + /* + * if there was no overflow, then it means the NMI was not really + * for us, so we have to resume monitoring + */ + if (unlikely(!ctx_arch->p4->npend_ovfls)) { + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + __pfm_write_reg_p4(xrc+i, ctx_arch->p4->saved_cccrs[i]); + } + } + return 0; +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel Core-based processors + */ +static int __kprobes pfm_has_ovfl_intel_core(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_ext_reg *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + num = pfm_pmu_conf->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = arch_info->pmd_addrs; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(xrd[i].addrs[0], val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/* + * called from notify_die() notifier from an trap handler path. We only + * care about NMI related callbacks, and ignore everything else. 
+ * + * Cannot grab any locks, include the perfmon context lock + * + * Must detect if NMI interrupt comes from perfmon, and if so it must + * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt + * handler needs to grab the context lock, thus is cannot be run directly + * from the NMI interrupt call path. + */ +static int __kprobes pfm_handle_nmi(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pfm_context *ctx; + + /* + * only NMI related calls + */ + if (val != DIE_NMI_IPI) + return NOTIFY_DONE; + + /* + * perfmon not active on this processor + */ + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) { + PFM_DBG_ovfl("ctx NULL"); + return NOTIFY_DONE; + } + + /* + * detect if we have overflows, i.e., NMI interrupt + * caused by PMU + */ + if (!pfm_has_ovfl(ctx)) { + PFM_DBG_ovfl("no ovfl"); + return NOTIFY_DONE; + } + + /* + * we stop the PMU to avoid further overflow before this + * one is treated by lower priority interrupt handler + */ + __pfm_arch_quiesce_pmu_percpu(); + + /* + * record actual instruction pointer + */ + __get_cpu_var(real_iip) = instruction_pointer(args->regs); + + /* + * post lower priority interrupt (LOCAL_PERFMON_VECTOR) + */ + pfm_arch_resend_irq(); + + __get_cpu_var(pfm_stats).ovfl_intr_nmi_count++; + + /* + * we need to rewrite the APIC vector on Intel + */ + if (cpu_data->x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * the notification was for us + */ + return NOTIFY_STOP; +} + +static struct notifier_block pfm_nmi_nb={ + .notifier_call = pfm_handle_nmi +}; + +/* + * called from pfm_register_pmu_config() after the new + * config has been validated. The pfm_session_lock + * is held. + * + * return: + * < 0 : if error + * 0 : if success + */ +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + struct pfm_arch_pmu_info *arch_info = cfg->arch_info; + + /* + * ensure that PMU description is able to deal with NMI watchdog using + * the performance counters + */ + if ( nmi_watchdog == NMI_LOCAL_APIC + && !(arch_info->flags & PFM_X86_FL_USE_NMI)) { + PFM_INFO("NMI watchdog uses counters, PMU module cannot handle"); + return -EINVAL; + } + + /* + * adust stop routine based on PMU model + * + * P6 : P6, Pentium M, AMD K8, Intel architectural perfmon + * P4 : Xeon, EM64T, P4 + * Core: Core 2, + */ + switch(arch_info->pmu_style) { + case PFM_X86_PMU_P4: + pfm_stop_save = pfm_stop_save_p4; + pfm_has_ovfl = pfm_has_ovfl_p4; + break; + case PFM_X86_PMU_P6: + pfm_stop_save = pfm_stop_save_p6; + pfm_has_ovfl = pfm_has_ovfl_p6; + break; + case PFM_X86_PMU_CORE: + pfm_stop_save = pfm_stop_save_intel_core; + pfm_has_ovfl = pfm_has_ovfl_intel_core; + break; + case PFM_X86_PMU_AMD64: + pfm_stop_save = pfm_stop_save_amd64; + pfm_has_ovfl = pfm_has_ovfl_amd64; + break; + default: + PFM_INFO("unknown pmu_style=%d", arch_info->pmu_style); + return -EINVAL; + } + + /* + * determine interrupt type to use + */ + if (arch_info->flags & PFM_X86_FL_USE_NMI) { + register_die_notifier(&pfm_nmi_nb); + PFM_INFO("intr_type=NMI"); + pfm_using_nmi = 1; + } else { + PFM_INFO("intr_type=regular"); + } + return 0; +} + +void pfm_arch_pmu_config_remove(void) +{ + if (pfm_using_nmi) + unregister_die_notifier(&pfm_nmi_nb); + + pfm_using_nmi = 0; +} + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(cpu_data->x86) { + case 6: + switch(cpu_data->x86_model) { + case 3: /* Pentium II */ + case 7 ... 
11: + case 13: + return "perfmon_p6"; + case 15: + return "perfmon_core"; + default: + goto try_arch; + } + case 15: + case 16: + /* All Opteron processors */ + if (cpu_data->x86_vendor == X86_VENDOR_AMD) + return "perfmon_k8"; + + switch(cpu_data->x86_model) { + case 0 ... 6: + return "perfmon_p4"; + } + /* FALL THROUGH */ + default: +try_arch: + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) + return "perfmon_intel_arch"; + return NULL; + } + return NULL; +} + +void pfm_arch_resend_irq(void) +{ + unsigned long val, dest; + /* + * we cannot use hw_resend_irq() because it goes to + * the I/O APIC. We need to go to the Local APIC. + * + * The "int vec" is not the right solution either + * because it triggers a software intr. We need + * to regenerate the interrupt and have it pended + * until we unmask interrupts. + * + * Instead we send ourself an IPI on the perfmon + * vector. + */ + val = APIC_DEST_SELF|APIC_INT_ASSERT| + APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; + + dest = apic_read(APIC_ID); + apic_write(APIC_ICR2, dest); + apic_write(APIC_ICR, val); +} + +DEFINE_PER_CPU(unsigned long, saved_lvtpc); + +static void pfm_arch_pmu_acquire_percpu(void *data) +{ + int vec; + + __get_cpu_var(saved_lvtpc) = apic_read(APIC_LVTPC); + + vec = pfm_using_nmi ? APIC_DM_NMI : LOCAL_PERFMON_VECTOR; + apic_write(APIC_LVTPC, vec); + + PFM_DBG("LTVPC=0x%lx saved=0x%lx", + (unsigned long)apic_read(APIC_LVTPC), + (unsigned long)__get_cpu_var(saved_lvtpc)); +} + +static void pfm_arch_pmu_release_percpu(void *data) +{ + PFM_DBG("restoring LVTPC=0x%lx", __get_cpu_var(saved_lvtpc)); + apic_write(APIC_LVTPC, __get_cpu_var(saved_lvtpc)); +} + + +/* + * called from pfm_acquire_pmu() with + * pfm_pmu_conf.regs copied from pfm_pmu_conf.full_regs + * needs to adjust regs to match current PMU availabilityy + * + * Caller does recalculate all max/num/first limits on the + * pfm_pmu_conf.regs structure. + * + * interrupts are not masked + * + * + * XXX: until reserve_*_nmi() get fixed by Bjorn to work + * correctly whenever the NMI watchdog is not used. We skip + * the allocation. Yet we do the percpu initialization. 
+ */ +int pfm_arch_pmu_acquire(void) +{ + struct pfm_arch_pmu_info *arch_info; + struct pfm_regmap_desc *d; + struct pfm_arch_ext_reg *pc; + u16 i, n, ena = 0; + + arch_info = pfm_pmu_conf->arch_info; + pc = arch_info->pmc_addrs; + + bitmap_zero(cast_ulp(arch_info->enable_mask), PFM_MAX_PMCS); + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs.num_pmcs; + for(i=0; n; i++, d++) { + /* + * skip not implemented registers (including those + * already removed by the module) + */ + if (!(d->type & PFM_REG_I)) + continue; + + n--; + + if (d->type & PFM_REG_V) + continue; + + /* + * reserve register with lower-level allocator + */ + if (!reserve_evntsel(d->hw_addr)) { + PFM_DBG("pmc%d (%s) in use elsewhere, disabling", i, d->desc); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.pmcs)); + } else { + if (pc[i].reg_type & PFM_REGT_EN) { + __set_bit(i, cast_ulp(arch_info->enable_mask)); + ena++; + arch_info->max_ena = i + 1; + } + } + } + + PFM_DBG("%u PMCs with enable capability", ena); + if (!ena) { + PFM_INFO("no registers with start/stop capability," + "try rebooting with nmi_watchdog=0"); + goto undo; + } + + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs.num_pmds; + for(i=0; n; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + n--; + + if (d->type & PFM_REG_V) + continue; + + if (!reserve_perfctr(d->hw_addr)) { + PFM_DBG("pmd%d (%s) in use elsewhere, disabling", i, d->desc); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.pmds)); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.cnt_pmds)); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.rw_pmds)); + } + } + + /* + * program APIC + */ + on_each_cpu(pfm_arch_pmu_acquire_percpu, NULL, 0, 1); + + return 0; +undo: + pfm_pmu_conf->regs = pfm_pmu_conf->full_regs; + return -EBUSY; +} + +/* + * called from pfm_pmu_release() + * interrupts are not masked + */ +void pfm_arch_pmu_release(void) +{ + struct pfm_regmap_desc *d; + u16 i, n; + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs.num_pmcs; + for(i=0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmcs))) + continue; + release_evntsel(d->hw_addr); + n--; + PFM_DBG("pmc%u released", i); + } + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs.num_pmds; + for(i=0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmds))) + continue; + release_perfctr(d->hw_addr); + n--; + PFM_DBG("pmd%u released", i); + } + on_each_cpu(pfm_arch_pmu_release_percpu, NULL, 0, 1); +} Index: linux-2.6/arch/i386/perfmon/perfmon_gen_ia32.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_gen_ia32.c @@ -0,0 +1,290 @@ +/* + * This file contains the IA-32 architectural perfmon register description tables. + * + * The IA-32 architectural perfmon (PMU) was introduced with Intel Core Solo + * and Core Duo processors. + * + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Generic IA-32 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +module_param(force, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_GEN_IA32_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_GEN_IA32_PMC_VAL (1ULL<<20) +#define PFM_GEN_IA32_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR starts at 0xc1 & occupy a contiguous block of MSR addr + * IA32_PERFEVTSELx MSR starts at 0x186 & occupy a contiguous block of MSR addr + */ +#define MSR_GEN_PERFEVTSEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 + +#define PFM_GEN_IA32_SEL(n) { \ + .addrs[0] = MSR_GEN_PERFEVTSEL_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_EN} + +#define PFM_GEN_IA32_CTR(n) { \ + .addrs[0] = MSR_GEN_PMC_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_CTR} + +struct pmu_eax { + unsigned int version:8; + unsigned int num_cnt:8; + unsigned int cnt_width:8; + unsigned int ebx_length:8; +}; + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_gen_ia32_pmu_info={ + .pmc_addrs = { + PFM_GEN_IA32_SEL(0) , PFM_GEN_IA32_SEL(1), PFM_GEN_IA32_SEL(2), PFM_GEN_IA32_SEL(3), + PFM_GEN_IA32_SEL(4) , PFM_GEN_IA32_SEL(5), PFM_GEN_IA32_SEL(6), PFM_GEN_IA32_SEL(7), + PFM_GEN_IA32_SEL(8) , PFM_GEN_IA32_SEL(9), PFM_GEN_IA32_SEL(10), PFM_GEN_IA32_SEL(11), + PFM_GEN_IA32_SEL(12), PFM_GEN_IA32_SEL(13), PFM_GEN_IA32_SEL(14), PFM_GEN_IA32_SEL(15), + PFM_GEN_IA32_SEL(16), PFM_GEN_IA32_SEL(17), PFM_GEN_IA32_SEL(18), PFM_GEN_IA32_SEL(19), + PFM_GEN_IA32_SEL(20), PFM_GEN_IA32_SEL(21), PFM_GEN_IA32_SEL(22), PFM_GEN_IA32_SEL(23), + PFM_GEN_IA32_SEL(24), PFM_GEN_IA32_SEL(25), PFM_GEN_IA32_SEL(26), PFM_GEN_IA32_SEL(27), + PFM_GEN_IA32_SEL(28), PFM_GEN_IA32_SEL(29), PFM_GEN_IA32_SEL(30), PFM_GEN_IA32_SEL(31) + }, + .pmd_addrs = { + PFM_GEN_IA32_CTR(0) , PFM_GEN_IA32_CTR(1), PFM_GEN_IA32_CTR(2), PFM_GEN_IA32_CTR(3), + PFM_GEN_IA32_CTR(4) , PFM_GEN_IA32_CTR(5), PFM_GEN_IA32_CTR(6), PFM_GEN_IA32_CTR(7), + PFM_GEN_IA32_CTR(8) , PFM_GEN_IA32_CTR(9), PFM_GEN_IA32_CTR(10), PFM_GEN_IA32_CTR(11), + PFM_GEN_IA32_CTR(12), PFM_GEN_IA32_CTR(13), PFM_GEN_IA32_CTR(14), PFM_GEN_IA32_CTR(15), + PFM_GEN_IA32_CTR(16), PFM_GEN_IA32_CTR(17), PFM_GEN_IA32_CTR(18), PFM_GEN_IA32_CTR(19), + PFM_GEN_IA32_CTR(20), PFM_GEN_IA32_CTR(21), PFM_GEN_IA32_CTR(22), PFM_GEN_IA32_CTR(23), + PFM_GEN_IA32_CTR(24), PFM_GEN_IA32_CTR(25), PFM_GEN_IA32_CTR(26), PFM_GEN_IA32_CTR(27), + PFM_GEN_IA32_CTR(28), PFM_GEN_IA32_CTR(29), PFM_GEN_IA32_CTR(30), PFM_GEN_IA32_CTR(31) + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +#define PFM_GEN_IA32_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_GEN_IA32_PMC_VAL, \ + .rsvd_msk = PFM_GEN_IA32_PMC_RSVD, \ + .no_emul64_msk = PFM_GEN_IA32_NO64, \ + .hw_addr = MSR_GEN_PERFEVTSEL_BASE+(n) \ + } + +#define PFM_GEN_IA32_D(n) { \ + .type = 
PFM_REG_C, \ + .desc = "PMC"#n, \ + .dfl_val = 0, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = MSR_GEN_PMC_BASE+(n) \ + } + +static struct pfm_reg_desc pfm_gen_ia32_pmc_desc[]={ +/* pmc0 */ PFM_GEN_IA32_C(0), PFM_GEN_IA32_C(1), PFM_GEN_IA32_C(2), PFM_GEN_IA32_C(3), +/* pmc4 */ PFM_GEN_IA32_C(4), PFM_GEN_IA32_C(5), PFM_GEN_IA32_C(6), PFM_GEN_IA32_C(7), +/* pmc8 */ PFM_GEN_IA32_C(8), PFM_GEN_IA32_C(9), PFM_GEN_IA32_C(10), PFM_GEN_IA32_C(11), +/* pmc12 */ PFM_GEN_IA32_C(12), PFM_GEN_IA32_C(13), PFM_GEN_IA32_C(14), PFM_GEN_IA32_C(15), +/* pmc16 */ PFM_GEN_IA32_C(16), PFM_GEN_IA32_C(17), PFM_GEN_IA32_C(18), PFM_GEN_IA32_C(19), +/* pmc20 */ PFM_GEN_IA32_C(20), PFM_GEN_IA32_C(21), PFM_GEN_IA32_C(22), PFM_GEN_IA32_C(23), +/* pmc24 */ PFM_GEN_IA32_C(24), PFM_GEN_IA32_C(25), PFM_GEN_IA32_C(26), PFM_GEN_IA32_C(27), +/* pmc28 */ PFM_GEN_IA32_C(28), PFM_GEN_IA32_C(29), PFM_GEN_IA32_C(30), PFM_GEN_IA32_C(31) +}; + +static struct pfm_reg_desc pfm_gen_ia32_pmd_desc[]={ +/* pmd0 */ PFM_GEN_IA32_D(0), PFM_GEN_IA32_D(1), PFM_GEN_IA32_D(2), PFM_GEN_IA32_D(3), +/* pmd4 */ PFM_GEN_IA32_D(4), PFM_GEN_IA32_D(5), PFM_GEN_IA32_D(6), PFM_GEN_IA32_D(7), +/* pmd8 */ PFM_GEN_IA32_D(8), PFM_GEN_IA32_D(9), PFM_GEN_IA32_D(10), PFM_GEN_IA32_D(11), +/* pmd12 */ PFM_GEN_IA32_D(12), PFM_GEN_IA32_D(13), PFM_GEN_IA32_D(14), PFM_GEN_IA32_D(15), +/* pmd16 */ PFM_GEN_IA32_D(16), PFM_GEN_IA32_D(17), PFM_GEN_IA32_D(18), PFM_GEN_IA32_D(19), +/* pmd20 */ PFM_GEN_IA32_D(20), PFM_GEN_IA32_D(21), PFM_GEN_IA32_D(22), PFM_GEN_IA32_D(23), +/* pmd24 */ PFM_GEN_IA32_D(24), PFM_GEN_IA32_D(25), PFM_GEN_IA32_D(26), PFM_GEN_IA32_D(27), +/* pmd28 */ PFM_GEN_IA32_D(28), PFM_GEN_IA32_D(29), PFM_GEN_IA32_D(30), PFM_GEN_IA32_D(31) +}; +#define PFM_GEN_IA32_MAX_PMCS ARRAY_SIZE(pfm_gen_ia32_pmc_desc) + +#define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) /* read-only status bit */ + +static struct pfm_pmu_config pfm_gen_ia32_pmu_conf; + +static int pfm_gen_ia32_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + } eax; + unsigned int ebx, ecx, edx; + unsigned int num_cnt; + + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + /* + * ensure CPUID instruction exists + */ + if (cpu_data->x86 < 5) { + PFM_INFO("processor family too old"); + return -1; + } + + if (force == 0) { + /* + * check if CPU supports 0xa function of CPUID + * 0xa started with Core Duo/Solo. 
Needed to detect if + * architectural PMU is present + */ + cpuid(0x0, &eax.val, &ebx, &ecx, &edx); + if (eax.val < 0xa) { + PFM_INFO("CPUID 0xa function not supported\n"); + return -1; + } + + cpuid(0xa, &eax.val, &ebx, &ecx, &edx); + if (eax.eax.version < 1) { + PFM_INFO("architectural perfmon not supported\n"); + return -1; + } + + /* + * ensure that when all modules are linked in, we pick the right + * one for Intel Core-based processors, as they accept architectural + * perfmon, but implement extensions which are only visible with + * perfmon_core module + */ + if (cpu_data->x86 == 6 && cpu_data->x86_model == 15) { + PFM_INFO("use perfmon_core for Core-based processors"); + return -1; + } + } else { + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 31; + } + + num_cnt = eax.eax.num_cnt; + + /* + * sanity check number of counters + */ + if (num_cnt == 0 || num_cnt >= PFM_MAX_HW_PMCS) { + PFM_INFO("invalid number of counters %u\n", eax.eax.num_cnt); + return -1; + } + /* + * instead of dynamically generating the description table + * and MSR addresses, we have a default description with a reasonably + * large number of counters (32). We believe this is plenty for quite + * some time. This allows us to have a much simpler probing and + * initialization routine, especially because we have no dynamic + * allocation, in particular for the counter names. + * + * When HW supports more than what we have prepared for, then we limit + * the number of counters we support and print a message. + */ + if (num_cnt >= PFM_GEN_IA32_MAX_PMCS) { + printk(KERN_INFO "perfmon: Limiting number of counters to %zu, " + "HW supports %u", PFM_GEN_IA32_MAX_PMCS, num_cnt); + num_cnt = PFM_GEN_IA32_MAX_PMCS; + } + + if (eax.eax.cnt_width > 63) { + PFM_INFO("invalid counter width %u\n", eax.eax.cnt_width); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, unsupported"); + return -1; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) { + PFM_INFO("NMI watchdog using PERFEVTSEL0/PERFCTR0, disabling them for perfmon"); + pfm_gen_ia32_pmc_desc[0].type = PFM_REG_NA; + pfm_gen_ia32_pmd_desc[0].type = PFM_REG_NA; + pfm_gen_ia32_pmu_info.pmc_addrs[0].reg_type = PFM_REGT_NA; + pfm_gen_ia32_pmu_info.pmd_addrs[0].reg_type = PFM_REGT_NA; + } + pfm_gen_ia32_pmu_conf.num_pmc_entries = num_cnt; + pfm_gen_ia32_pmu_conf.num_pmd_entries = num_cnt; + + return 0; +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to. Bits [32-(w-1)] + * are sign extensions of bit 31. As such the effective width of + * a counter is 31 bits only. + * See IA-32 Intel Architecture Software developer manual Vol 3B: + * system programming and section 18.17.2 in particular.
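[Editorial aside, not part of the patch: a toy model of the sign-extension behaviour described above, showing why counter_width is set to 31 in the configuration that follows. The 40-bit implemented width and the helper name are assumptions of this illustration.]

    #include <stdio.h>
    #include <stdint.h>

    /* toy model: a write to a 40-bit counter only takes the low 32 bits
     * and replicates bit 31 into bits 32-39 */
    static uint64_t ctr_write(uint64_t v)
    {
            uint64_t low = v & 0xffffffffULL;
            if (low & (1ULL << 31))
                    low |= 0xffULL << 32;
            return low;
    }

    int main(void)
    {
            /* 0x7fffffff is stored as-is; 0x80000000 becomes 0xff80000000,
             * i.e. within 2^31 of the 40-bit overflow point. Values with
             * bit 31 set are therefore unusable as counts, hence the
             * effective width of 31 bits. */
            printf("%#llx -> %#llx\n", 0x7fffffffULL,
                   (unsigned long long)ctr_write(0x7fffffffULL));
            printf("%#llx -> %#llx\n", 0x80000000ULL,
                   (unsigned long long)ctr_write(0x80000000ULL));
            return 0;
    }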
+ */ +static struct pfm_pmu_config pfm_gen_ia32_pmu_conf={ + .pmu_name = "Intel architectural", + .pmd_desc = pfm_gen_ia32_pmd_desc, + .counter_width = 31, + .pmc_desc = pfm_gen_ia32_pmc_desc, + .probe_pmu = pfm_gen_ia32_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_gen_ia32_pmu_info +}; + +static int __init pfm_gen_ia32_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_gen_ia32_pmu_conf); +} + +static void __exit pfm_gen_ia32_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_gen_ia32_pmu_conf); +} + +module_init(pfm_gen_ia32_pmu_init_module); +module_exit(pfm_gen_ia32_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_intel_arch.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_intel_arch.c @@ -0,0 +1,261 @@ +/* + * This file contains the Intel architectural perfmon register v1 + * description tables. + * + * Architectural perfmon was introduced with Intel Core Solo and + * Core Duo processors. + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel architectural perfmon v1"); +MODULE_LICENSE("GPL"); + +static int force, force_nmi; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force, bool, 0600); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_INTEL_ARCH_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_INTEL_ARCH_PMC_VAL (1ULL<<20) +#define PFM_INTEL_ARCH_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR : starts at 0xc1 & occupy a contiguous block of MSR + * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR + */ +#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 + +#define PFM_INTEL_ARCH_SEL(n) { \ + .addrs[0] = MSR_GEN_SEL_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_EN} + +#define PFM_INTEL_ARCH_CTR(n) { \ + .addrs[0] = MSR_GEN_PMC_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_CTR} + +struct pmu_eax { + unsigned int version:8; + unsigned int num_cnt:8; /* up to 256 counters? 
*/ + unsigned int cnt_width:8; + unsigned int ebx_length:8; +}; + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info={ + .pmc_addrs = { + PFM_INTEL_ARCH_SEL(0) , PFM_INTEL_ARCH_SEL(1), PFM_INTEL_ARCH_SEL(2), PFM_INTEL_ARCH_SEL(3), + PFM_INTEL_ARCH_SEL(4) , PFM_INTEL_ARCH_SEL(5), PFM_INTEL_ARCH_SEL(6), PFM_INTEL_ARCH_SEL(7), + PFM_INTEL_ARCH_SEL(8) , PFM_INTEL_ARCH_SEL(9), PFM_INTEL_ARCH_SEL(10), PFM_INTEL_ARCH_SEL(11), + PFM_INTEL_ARCH_SEL(12), PFM_INTEL_ARCH_SEL(13), PFM_INTEL_ARCH_SEL(14), PFM_INTEL_ARCH_SEL(15), + PFM_INTEL_ARCH_SEL(16), PFM_INTEL_ARCH_SEL(17), PFM_INTEL_ARCH_SEL(18), PFM_INTEL_ARCH_SEL(19), + PFM_INTEL_ARCH_SEL(20), PFM_INTEL_ARCH_SEL(21), PFM_INTEL_ARCH_SEL(22), PFM_INTEL_ARCH_SEL(23), + PFM_INTEL_ARCH_SEL(24), PFM_INTEL_ARCH_SEL(25), PFM_INTEL_ARCH_SEL(26), PFM_INTEL_ARCH_SEL(27), + PFM_INTEL_ARCH_SEL(28), PFM_INTEL_ARCH_SEL(29), PFM_INTEL_ARCH_SEL(30), PFM_INTEL_ARCH_SEL(31) + }, + .pmd_addrs = { + PFM_INTEL_ARCH_CTR(0) , PFM_INTEL_ARCH_CTR(1), PFM_INTEL_ARCH_CTR(2), PFM_INTEL_ARCH_CTR(3), + PFM_INTEL_ARCH_CTR(4) , PFM_INTEL_ARCH_CTR(5), PFM_INTEL_ARCH_CTR(6), PFM_INTEL_ARCH_CTR(7), + PFM_INTEL_ARCH_CTR(8) , PFM_INTEL_ARCH_CTR(9), PFM_INTEL_ARCH_CTR(10), PFM_INTEL_ARCH_CTR(11), + PFM_INTEL_ARCH_CTR(12), PFM_INTEL_ARCH_CTR(13), PFM_INTEL_ARCH_CTR(14), PFM_INTEL_ARCH_CTR(15), + PFM_INTEL_ARCH_CTR(16), PFM_INTEL_ARCH_CTR(17), PFM_INTEL_ARCH_CTR(18), PFM_INTEL_ARCH_CTR(19), + PFM_INTEL_ARCH_CTR(20), PFM_INTEL_ARCH_CTR(21), PFM_INTEL_ARCH_CTR(22), PFM_INTEL_ARCH_CTR(23), + PFM_INTEL_ARCH_CTR(24), PFM_INTEL_ARCH_CTR(25), PFM_INTEL_ARCH_CTR(26), PFM_INTEL_ARCH_CTR(27), + PFM_INTEL_ARCH_CTR(28), PFM_INTEL_ARCH_CTR(29), PFM_INTEL_ARCH_CTR(30), PFM_INTEL_ARCH_CTR(31) + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +#define PFM_INTEL_ARCH_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_INTEL_ARCH_PMC_VAL, \ + .rsvd_msk = PFM_INTEL_ARCH_PMC_RSVD, \ + .no_emul64_msk = PFM_INTEL_ARCH_NO64, \ + .hw_addr = MSR_GEN_SEL_BASE+(n) \ + } + +#define PFM_INTEL_ARCH_D(n) { \ + .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .dfl_val = 0, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = MSR_GEN_PMC_BASE+(n) \ + } + +static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[]={ +/* pmc0 */ PFM_INTEL_ARCH_C(0), PFM_INTEL_ARCH_C(1), PFM_INTEL_ARCH_C(2), PFM_INTEL_ARCH_C(3), +/* pmc4 */ PFM_INTEL_ARCH_C(4), PFM_INTEL_ARCH_C(5), PFM_INTEL_ARCH_C(6), PFM_INTEL_ARCH_C(7), +/* pmc8 */ PFM_INTEL_ARCH_C(8), PFM_INTEL_ARCH_C(9), PFM_INTEL_ARCH_C(10), PFM_INTEL_ARCH_C(11), +/* pmc12 */ PFM_INTEL_ARCH_C(12), PFM_INTEL_ARCH_C(13), PFM_INTEL_ARCH_C(14), PFM_INTEL_ARCH_C(15), +/* pmc16 */ PFM_INTEL_ARCH_C(16), PFM_INTEL_ARCH_C(17), PFM_INTEL_ARCH_C(18), PFM_INTEL_ARCH_C(19), +/* pmc20 */ PFM_INTEL_ARCH_C(20), PFM_INTEL_ARCH_C(21), PFM_INTEL_ARCH_C(22), PFM_INTEL_ARCH_C(23), +/* pmc24 */ PFM_INTEL_ARCH_C(24), PFM_INTEL_ARCH_C(25), PFM_INTEL_ARCH_C(26), PFM_INTEL_ARCH_C(27), +/* pmc28 */ PFM_INTEL_ARCH_C(28), PFM_INTEL_ARCH_C(29), PFM_INTEL_ARCH_C(30), PFM_INTEL_ARCH_C(31) +}; + +static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[]={ +/* pmd0 */ PFM_INTEL_ARCH_D(0), PFM_INTEL_ARCH_D(1), PFM_INTEL_ARCH_D(2), PFM_INTEL_ARCH_D(3), +/* pmd4 */ PFM_INTEL_ARCH_D(4), PFM_INTEL_ARCH_D(5), PFM_INTEL_ARCH_D(6), PFM_INTEL_ARCH_D(7), +/* pmd8 */ PFM_INTEL_ARCH_D(8), PFM_INTEL_ARCH_D(9), PFM_INTEL_ARCH_D(10), PFM_INTEL_ARCH_D(11), +/* pmd12 */ PFM_INTEL_ARCH_D(12), PFM_INTEL_ARCH_D(13), PFM_INTEL_ARCH_D(14), 
PFM_INTEL_ARCH_D(15), +/* pmd16 */ PFM_INTEL_ARCH_D(16), PFM_INTEL_ARCH_D(17), PFM_INTEL_ARCH_D(18), PFM_INTEL_ARCH_D(19), +/* pmd20 */ PFM_INTEL_ARCH_D(20), PFM_INTEL_ARCH_D(21), PFM_INTEL_ARCH_D(22), PFM_INTEL_ARCH_D(23), +/* pmd24 */ PFM_INTEL_ARCH_D(24), PFM_INTEL_ARCH_D(25), PFM_INTEL_ARCH_D(26), PFM_INTEL_ARCH_D(27), +/* pmd28 */ PFM_INTEL_ARCH_D(28), PFM_INTEL_ARCH_D(29), PFM_INTEL_ARCH_D(30), PFM_INTEL_ARCH_D(31) +}; +#define PFM_INTEL_ARCH_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) + +static struct pfm_pmu_config pfm_intel_arch_pmu_conf; + + +static int pfm_intel_arch_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + } eax; + unsigned int ebx, ecx, edx; + unsigned int num_cnt; + + if (!cpu_has_arch_perfmon) { + PFM_INFO("no support for Intel architectural PMU"); + return -1; + } + + if (force == 0) { + cpuid(0xa, &eax.val, &ebx, &ecx, &edx); + } else { + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 31; + } + + /* number of counters */ + num_cnt = eax.eax.num_cnt; + + /* + * sanity check number of counters + */ + if (num_cnt == 0 || num_cnt >= PFM_MAX_PMCS) { + PFM_INFO("invalid number of counters %u\n", eax.eax.num_cnt); + return -1; + } + /* + * instead of dynamically generating the description table + * and MSR addresses, we have a default description with a reasonably + * large number of counters (32). We believe this is plenty for quite + * some time. This allows us to have a much simpler probing and + * initialization routine, especially because we have no dynamic + * allocation. + * + * When HW supports more that what we prepared for, then we limit + * the number of counters we support and print a message. + */ + if (num_cnt >= PFM_INTEL_ARCH_MAX_PMCS) { + printk(KERN_INFO "perfmon: Limiting number of counters to %zu," + "HW supports %u", PFM_INTEL_ARCH_MAX_PMCS, num_cnt); + num_cnt = PFM_INTEL_ARCH_MAX_PMCS; + } + + if (eax.eax.cnt_width > 63) { + PFM_INFO("invalid counter width %u\n", eax.eax.cnt_width); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic"); + return -1; + } + + pfm_intel_arch_pmu_conf.num_pmc_entries = num_cnt; + pfm_intel_arch_pmu_conf.num_pmd_entries = num_cnt; + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * NMI using PMU? + * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to. bits [w-32] + * are sign extensions of bit 31. As such the effective width of + * a counter is 31 bits only. 
+ * See IA-32 Intel Architecture Software developer manual Vol 3B + */ +static struct pfm_pmu_config pfm_intel_arch_pmu_conf={ + .pmu_name = "Intel architectural v1", + .pmd_desc = pfm_intel_arch_pmd_desc, + .counter_width = 31, + .pmc_desc = pfm_intel_arch_pmc_desc, + .probe_pmu = pfm_intel_arch_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_intel_arch_pmu_info +}; + +static int __init pfm_intel_arch_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_intel_arch_pmu_conf); +} + +static void __exit pfm_intel_arch_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_intel_arch_pmu_conf); +} + +module_init(pfm_intel_arch_pmu_init_module); +module_exit(pfm_intel_arch_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_p4.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_p4.c @@ -0,0 +1,414 @@ +/* + * This file contains the P4/Xeon PMU register description tables + * for both 32 and 64 bit modes. + * + * Copyright (c) 2005 Intel Corporation + * Contributed by Bryan Wilkerson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Bryan Wilkerson "); +MODULE_DESCRIPTION("P4/Xeon/EM64T PMU description table"); +MODULE_LICENSE("GPL"); + +static int force; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +module_param(force, bool, 0600); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * CCCR default value: + * - OVF_PMI_T0=1 (bit 26) + * - OVF_PMI_T1=0 (bit 27) (set if necessary in pfm_write_reg()) + * - all other bits are zero + * + * OVF_PMI is forced to zero if PFM_REGFL_NO_EMUL64 is set on CCCR + */ +#define PFM_CCCR_DFL (1ULL<<26) | (3ULL<<16) + +/* + * CCCR reserved fields: + * - bits 0-11, 25-29, 31-63 + * - OVF_PMI (26-27), override with REGFL_NO_EMUL64 + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CCCR_RSVD ~((0xfull<<12) \ + | (0x7full<<18) \ + | (0x1ull<<30)) + +#define PFM_P4_NO64 (3ULL<<26) /* use 3 even in non HT mode */ + +/* + * With HyperThreading enabled: + * + * The ESCRs and CCCRs are divided in half with the top half + * belonging to logical processor 0 and the bottom half going to + * logical processor 1. Thus only half of the PMU resources are + * accessible to applications. + * + * PEBS is not available due to the fact that: + * - MSR_PEBS_MATRIX_VERT is shared between the threads + * - IA32_PEBS_ENABLE is shared between the threads + * + * With HyperThreading disabled: + * + * The full set of PMU resources is exposed to applications. + * + * The mapping is chosen such that PMCxx -> MSR is the same + * in HT and non HT mode, if register is present in HT mode. 
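[Editorial aside, not part of the patch: whether this halving applies is decided at probe time by comparing the number of online siblings in the package with the number of physical cores, i.e. cpus_weight(cpu_core_map[cpu]) / cpu_data->x86_max_cores > 1 in pfm_p4_probe_pmu() further down. A worked example with made-up topologies; the helper function is hypothetical.]

    #include <stdio.h>

    /* siblings = logical CPUs in the package, max_cores = physical cores */
    static int ht_enabled(int siblings, int max_cores)
    {
            return (siblings / max_cores) > 1;
    }

    int main(void)
    {
            printf("P4 with HT      : %d\n", ht_enabled(2, 1)); /* 2/1=2 -> on  */
            printf("dual core, no HT: %d\n", ht_enabled(2, 2)); /* 2/2=1 -> off */
            printf("uniprocessor    : %d\n", ht_enabled(1, 1)); /* 1/1=1 -> off */
            return 0;
    }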
+ * + */ +#define PFM_REGT_NHTESCR (PFM_REGT_ESCR|PFM_REGT_NOHT) +#define PFM_REGT_NHTCCCR (PFM_REGT_CCCR|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTPEBS (PFM_REGT_PEBS|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTCTR (PFM_REGT_CTR|PFM_REGT_NOHT) +#define PFM_REGT_ENAC (PFM_REGT_CCCR|PFM_REGT_EN) + +static struct pfm_arch_pmu_info pfm_p4_pmu_info={ + .pmc_addrs = { + /*pmc 0 */ {{MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1}, 0, PFM_REGT_ESCR}, /* BPU_ESCR0,1 */ + /*pmc 1 */ {{MSR_P4_IS_ESCR0, MSR_P4_IS_ESCR1}, 0, PFM_REGT_ESCR}, /* IS_ESCR0,1 */ + /*pmc 2 */ {{MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1}, 0, PFM_REGT_ESCR}, /* MOB_ESCR0,1 */ + /*pmc 3 */ {{MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1}, 0, PFM_REGT_ESCR}, /* ITLB_ESCR0,1 */ + /*pmc 4 */ {{MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1}, 0, PFM_REGT_ESCR}, /* PMH_ESCR0,1 */ + /*pmc 5 */ {{MSR_P4_IX_ESCR0, MSR_P4_IX_ESCR1}, 0, PFM_REGT_ESCR}, /* IX_ESCR0,1 */ + /*pmc 6 */ {{MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1}, 0, PFM_REGT_ESCR}, /* FSB_ESCR0,1 */ + /*pmc 7 */ {{MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1}, 0, PFM_REGT_ESCR}, /* BSU_ESCR0,1 */ + /*pmc 8 */ {{MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1}, 0, PFM_REGT_ESCR}, /* MS_ESCR0,1 */ + /*pmc 9 */ {{MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1}, 0, PFM_REGT_ESCR}, /* TC_ESCR0,1 */ + /*pmc 10*/ {{MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1}, 0, PFM_REGT_ESCR}, /* TBPU_ESCR0,1 */ + /*pmc 11*/ {{MSR_P4_FLAME_ESCR0, MSR_P4_FLAME_ESCR1}, 0, PFM_REGT_ESCR}, /* FLAME_ESCR0,1 */ + /*pmc 12*/ {{MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1}, 0, PFM_REGT_ESCR}, /* FIRM_ESCR0,1 */ + /*pmc 13*/ {{MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1}, 0, PFM_REGT_ESCR}, /* SAAT_ESCR0,1 */ + /*pmc 14*/ {{MSR_P4_U2L_ESCR0, MSR_P4_U2L_ESCR1}, 0, PFM_REGT_ESCR}, /* U2L_ESCR0,1 */ + /*pmc 15*/ {{MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1}, 0, PFM_REGT_ESCR}, /* DAC_ESCR0,1 */ + /*pmc 16*/ {{MSR_P4_IQ_ESCR0, MSR_P4_IQ_ESCR1}, 0, PFM_REGT_ESCR}, /* IQ_ESCR0,1 (only model 1 and 2) */ + /*pmc 17*/ {{MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1}, 0, PFM_REGT_ESCR}, /* ALF_ESCR0,1 */ + /*pmc 18*/ {{MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1}, 0, PFM_REGT_ESCR}, /* RAT_ESCR0,1 */ + /*pmc 19*/ {{MSR_P4_SSU_ESCR0, 0}, 0, PFM_REGT_ESCR}, /* SSU_ESCR0 */ + /*pmc 20*/ {{MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1}, 0, PFM_REGT_ESCR}, /* CRU_ESCR0,1 */ + /*pmc 21*/ {{MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3}, 0, PFM_REGT_ESCR}, /* CRU_ESCR2,3 */ + /*pmc 22*/ {{MSR_P4_CRU_ESCR4, MSR_P4_CRU_ESCR5}, 0, PFM_REGT_ESCR}, /* CRU_ESCR4,5 */ + + /*pmc 23*/ {{MSR_P4_BPU_CCCR0, MSR_P4_BPU_CCCR2}, 0, PFM_REGT_ENAC}, /* BPU_CCCR0,2 */ + /*pmc 24*/ {{MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3}, 1, PFM_REGT_ENAC}, /* BPU_CCCR1,3 */ + /*pmc 25*/ {{MSR_P4_MS_CCCR0, MSR_P4_MS_CCCR2}, 2, PFM_REGT_ENAC}, /* MS_CCCR0,2 */ + /*pmc 26*/ {{MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3}, 3, PFM_REGT_ENAC}, /* MS_CCCR1,3 */ + /*pmc 27*/ {{MSR_P4_FLAME_CCCR0, MSR_P4_FLAME_CCCR2}, 4, PFM_REGT_ENAC}, /* FLAME_CCCR0,2 */ + /*pmc 28*/ {{MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3}, 5, PFM_REGT_ENAC}, /* FLAME_CCCR1,3 */ + /*pmc 29*/ {{MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR2}, 6, PFM_REGT_ENAC}, /* IQ_CCCR0,2 */ + /*pmc 30*/ {{MSR_P4_IQ_CCCR1, MSR_P4_IQ_CCCR3}, 7, PFM_REGT_ENAC}, /* IQ_CCCR1,3 */ + /*pmc 31*/ {{MSR_P4_IQ_CCCR4, MSR_P4_IQ_CCCR5}, 8, PFM_REGT_ENAC}, /* IQ_CCCR4,5 */ + /* non HT extensions */ + /*pmc 32*/ {{MSR_P4_BPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BPU_ESCR1 */ + /*pmc 33*/ {{MSR_P4_IS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IS_ESCR1 */ + /*pmc 34*/ {{MSR_P4_MOB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MOB_ESCR1 */ + /*pmc 35*/ {{MSR_P4_ITLB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ITLB_ESCR1 
*/ + /*pmc 36*/ {{MSR_P4_PMH_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* PMH_ESCR1 */ + /*pmc 37*/ {{MSR_P4_IX_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IX_ESCR1 */ + /*pmc 38*/ {{MSR_P4_FSB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FSB_ESCR1 */ + /*pmc 39*/ {{MSR_P4_BSU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BSU_ESCR1 */ + /*pmc 40*/ {{MSR_P4_MS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MS_ESCR1 */ + /*pmc 41*/ {{MSR_P4_TC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TC_ESCR1 */ + /*pmc 42*/ {{MSR_P4_TBPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TBPU_ESCR1 */ + /*pmc 43*/ {{MSR_P4_FLAME_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FLAME_ESCR1 */ + /*pmc 44*/ {{MSR_P4_FIRM_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FIRM_ESCR1 */ + /*pmc 45*/ {{MSR_P4_SAAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* SAAT_ESCR1 */ + /*pmc 46*/ {{MSR_P4_U2L_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* U2L_ESCR1 */ + /*pmc 47*/ {{MSR_P4_DAC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* DAC_ESCR1 */ + /*pmc 48*/ {{MSR_P4_IQ_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IQ_ESCR1 (only model 1 and 2) */ + /*pmc 49*/ {{MSR_P4_ALF_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ALF_ESCR1 */ + /*pmc 50*/ {{MSR_P4_RAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* RAT_ESCR1 */ + /*pmc 51*/ {{MSR_P4_CRU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR1 */ + /*pmc 52*/ {{MSR_P4_CRU_ESCR3, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR3 */ + /*pmc 53*/ {{MSR_P4_CRU_ESCR5, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR5 */ + /*pmc 54*/ {{MSR_P4_BPU_CCCR1, 0}, 9, PFM_REGT_NHTCCCR}, /* BPU_CCCR1 */ + /*pmc 55*/ {{MSR_P4_BPU_CCCR3, 0},10, PFM_REGT_NHTCCCR}, /* BPU_CCCR3 */ + /*pmc 56*/ {{MSR_P4_MS_CCCR1, 0},11, PFM_REGT_NHTCCCR}, /* MS_CCCR1 */ + /*pmc 57*/ {{MSR_P4_MS_CCCR3, 0},12, PFM_REGT_NHTCCCR}, /* MS_CCCR3 */ + /*pmc 58*/ {{MSR_P4_FLAME_CCCR1, 0},13, PFM_REGT_NHTCCCR}, /* FLAME_CCCR1 */ + /*pmc 59*/ {{MSR_P4_FLAME_CCCR3, 0},14, PFM_REGT_NHTCCCR}, /* FLAME_CCCR3 */ + /*pmc 60*/ {{MSR_P4_IQ_CCCR2, 0},15, PFM_REGT_NHTCCCR}, /* IQ_CCCR2 */ + /*pmc 61*/ {{MSR_P4_IQ_CCCR3, 0},16, PFM_REGT_NHTCCCR}, /* IQ_CCCR3 */ + /*pmc 62*/ {{MSR_P4_IQ_CCCR5, 0},17, PFM_REGT_NHTCCCR}, /* IQ_CCCR5 */ + /*pmc 63*/ {{0x3f2, 0}, 0, PFM_REGT_NHTPEBS},/* PEBS_MATRIX_VERT */ + /*pmc 64*/ {{0x3f1, 0}, 0, PFM_REGT_NHTPEBS} /* PEBS_ENABLE */ + }, + + .pmd_addrs = { + /*pmd 0 */ {{MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_PERFCTR2}, 0, PFM_REGT_CTR}, /* BPU_CTR0,2 */ + /*pmd 1 */ {{MSR_P4_BPU_PERFCTR1, MSR_P4_BPU_PERFCTR3}, 0, PFM_REGT_CTR}, /* BPU_CTR1,3 */ + /*pmd 2 */ {{MSR_P4_MS_PERFCTR0, MSR_P4_MS_PERFCTR2}, 0, PFM_REGT_CTR}, /* MS_CTR0,2 */ + /*pmd 3 */ {{MSR_P4_MS_PERFCTR1, MSR_P4_MS_PERFCTR3}, 0, PFM_REGT_CTR}, /* MS_CTR1,3 */ + /*pmd 4 */ {{MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_PERFCTR2}, 0, PFM_REGT_CTR}, /* FLAME_CTR0,2 */ + /*pmd 5 */ {{MSR_P4_FLAME_PERFCTR1, MSR_P4_FLAME_PERFCTR3}, 0, PFM_REGT_CTR}, /* FLAME_CTR1,3 */ + /*pmd 6 */ {{MSR_P4_IQ_PERFCTR0, MSR_P4_IQ_PERFCTR2}, 0, PFM_REGT_CTR}, /* IQ_CTR0,2 */ + /*pmd 7 */ {{MSR_P4_IQ_PERFCTR1, MSR_P4_IQ_PERFCTR3}, 0, PFM_REGT_CTR}, /* IQ_CTR1,3 */ + /*pmd 8 */ {{MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_PERFCTR5}, 0, PFM_REGT_CTR}, /* IQ_CTR4,5 */ + /* + * non HT extensions + */ + /*pmd 9 */ {{MSR_P4_BPU_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR2 */ + /*pmd 10*/ {{MSR_P4_BPU_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR3 */ + /*pmd 11*/ {{MSR_P4_MS_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR2 */ + /*pmd 12*/ {{MSR_P4_MS_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR3 */ + /*pmd 13*/ {{MSR_P4_FLAME_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR2 */ + /*pmd 14*/ {{MSR_P4_FLAME_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR3 */ + /*pmd 15*/ 
{{MSR_P4_IQ_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR2 */ + /*pmd 16*/ {{MSR_P4_IQ_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR3 */ + /*pmd 17*/ {{MSR_P4_IQ_PERFCTR5, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR5 */ + }, + .pebs_ctr_idx = 8, /* thread0: IQ_CTR4, thread1: IQ_CTR5 */ + .pmu_style = PFM_X86_PMU_P4 +}; + +static struct pfm_regmap_desc pfm_p4_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I, "BPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR0), +/* pmc1 */ PMC_D(PFM_REG_I, "IS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), +/* pmc2 */ PMC_D(PFM_REG_I, "MOB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR0), +/* pmc3 */ PMC_D(PFM_REG_I, "ITLB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR0), +/* pmc4 */ PMC_D(PFM_REG_I, "PMH_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR0), +/* pmc5 */ PMC_D(PFM_REG_I, "IX_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR0), +/* pmc6 */ PMC_D(PFM_REG_I, "FSB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR0), +/* pmc7 */ PMC_D(PFM_REG_I, "BSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR0), +/* pmc8 */ PMC_D(PFM_REG_I, "MS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR0), +/* pmc9 */ PMC_D(PFM_REG_I, "TC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR0), +/* pmc10 */ PMC_D(PFM_REG_I, "TBPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR0), +/* pmc11 */ PMC_D(PFM_REG_I, "FLAME_ESCR0", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR0), +/* pmc12 */ PMC_D(PFM_REG_I, "FIRM_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR0), +/* pmc13 */ PMC_D(PFM_REG_I, "SAAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR0), +/* pmc14 */ PMC_D(PFM_REG_I, "U2L_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR0), +/* pmc15 */ PMC_D(PFM_REG_I, "DAC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR0), +/* pmc16 */ PMC_D(PFM_REG_I, "IQ_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), /* only model 1 and 2*/ +/* pmc17 */ PMC_D(PFM_REG_I, "ALF_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR0), +/* pmc18 */ PMC_D(PFM_REG_I, "RAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR0), +/* pmc19 */ PMC_D(PFM_REG_I, "SSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SSU_ESCR0), +/* pmc20 */ PMC_D(PFM_REG_I, "CRU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR0), +/* pmc21 */ PMC_D(PFM_REG_I, "CRU_ESCR2" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR2), +/* pmc22 */ PMC_D(PFM_REG_I, "CRU_ESCR4" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR4), +/* pmc23 */ PMC_D(PFM_REG_I64, "BPU_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR0), +/* pmc24 */ PMC_D(PFM_REG_I64, "BPU_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR1), +/* pmc25 */ PMC_D(PFM_REG_I64, "MS_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR0), +/* pmc26 */ PMC_D(PFM_REG_I64, "MS_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR1), +/* pmc27 */ PMC_D(PFM_REG_I64, "FLAME_CCCR0", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR0), +/* pmc28 */ PMC_D(PFM_REG_I64, "FLAME_CCCR1", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR1), +/* pmc29 */ PMC_D(PFM_REG_I64, "IQ_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR0), +/* pmc30 */ PMC_D(PFM_REG_I64, "IQ_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR1), +/* pmc31 */ PMC_D(PFM_REG_I64, "IQ_CCCR4" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR4), + /* No HT extension */ +/* pmc32 */ PMC_D(PFM_REG_I, "BPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR1), +/* pmc33 */ PMC_D(PFM_REG_I, "IS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR1), +/* pmc34 */ PMC_D(PFM_REG_I, "MOB_ESCR1" , 
0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR1), +/* pmc35 */ PMC_D(PFM_REG_I, "ITLB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR1), +/* pmc36 */ PMC_D(PFM_REG_I, "PMH_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR1), +/* pmc37 */ PMC_D(PFM_REG_I, "IX_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR1), +/* pmc38 */ PMC_D(PFM_REG_I, "FSB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR1), +/* pmc39 */ PMC_D(PFM_REG_I, "BSU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR1), +/* pmc40 */ PMC_D(PFM_REG_I, "MS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR1), +/* pmc41 */ PMC_D(PFM_REG_I, "TC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR1), +/* pmc42 */ PMC_D(PFM_REG_I, "TBPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR1), +/* pmc43 */ PMC_D(PFM_REG_I, "FLAME_ESCR1", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR1), +/* pmc44 */ PMC_D(PFM_REG_I, "FIRM_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR1), +/* pmc45 */ PMC_D(PFM_REG_I, "SAAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR1), +/* pmc46 */ PMC_D(PFM_REG_I, "U2L_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR1), +/* pmc47 */ PMC_D(PFM_REG_I, "DAC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR1), +/* pmc48 */ PMC_D(PFM_REG_I, "IQ_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR1), /* only model 1 and 2 */ +/* pmc49 */ PMC_D(PFM_REG_I, "ALF_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR1), +/* pmc50 */ PMC_D(PFM_REG_I, "RAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR1), +/* pmc51 */ PMC_D(PFM_REG_I, "CRU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR1), +/* pmc52 */ PMC_D(PFM_REG_I, "CRU_ESCR3" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR3), +/* pmc53 */ PMC_D(PFM_REG_I, "CRU_ESCR5" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR5), +/* pmc54 */ PMC_D(PFM_REG_I64, "BPU_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR2), +/* pmc55 */ PMC_D(PFM_REG_I64, "BPU_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR3), +/* pmc56 */ PMC_D(PFM_REG_I64, "MS_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR2), +/* pmc57 */ PMC_D(PFM_REG_I64, "MS_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR3), +/* pmc58 */ PMC_D(PFM_REG_I64, "FLAME_CCCR2", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR2), +/* pmc59 */ PMC_D(PFM_REG_I64, "FLAME_CCCR3", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR3), +/* pmc60 */ PMC_D(PFM_REG_I64, "IQ_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR2), +/* pmc61 */ PMC_D(PFM_REG_I64, "IQ_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR3), +/* pmc62 */ PMC_D(PFM_REG_I64, "IQ_CCCR5" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR5), +/* pmc63 */ PMC_D(PFM_REG_I, "PEBS_MATRIX_VERT", 0, 0xffffffffffffffecULL, 0, 0x3f2), +/* pmc64 */ PMC_D(PFM_REG_I, "PEBS_ENABLE", 0, 0xfffffffff8ffe000ULL, 0, 0x3f1) +}; +#define PFM_P4_NUM_PMCS ARRAY_SIZE(pfm_p4_pmc_desc) + +/* + * See section 15.10.6.6 for details about the IQ block + */ +static struct pfm_regmap_desc pfm_p4_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C, "BPU_CTR0", MSR_P4_BPU_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C, "BPU_CTR1", MSR_P4_BPU_PERFCTR1), +/* pmd2 */ PMD_D(PFM_REG_C, "MS_CTR0", MSR_P4_MS_PERFCTR0), +/* pmd3 */ PMD_D(PFM_REG_C, "MS_CTR1", MSR_P4_MS_PERFCTR1), +/* pmd4 */ PMD_D(PFM_REG_C, "FLAME_CTR0", MSR_P4_FLAME_PERFCTR0), +/* pmd5 */ PMD_D(PFM_REG_C, "FLAME_CTR1", MSR_P4_FLAME_PERFCTR1), +/* pmd6 */ PMD_D(PFM_REG_C, "IQ_CTR0", MSR_P4_IQ_PERFCTR0), +/* pmd7 */ PMD_D(PFM_REG_C, "IQ_CTR1", MSR_P4_IQ_PERFCTR1), +/* pmd8 */ PMD_D(PFM_REG_C, "IQ_CTR4", 
MSR_P4_IQ_PERFCTR4), + /* no HT extension */ +/* pmd9 */ PMD_D(PFM_REG_C, "BPU_CTR2", MSR_P4_BPU_PERFCTR2), +/* pmd10 */ PMD_D(PFM_REG_C, "BPU_CTR3", MSR_P4_BPU_PERFCTR3), +/* pmd11 */ PMD_D(PFM_REG_C, "MS_CTR2", MSR_P4_MS_PERFCTR2), +/* pmd12 */ PMD_D(PFM_REG_C, "MS_CTR3", MSR_P4_MS_PERFCTR3), +/* pmd13 */ PMD_D(PFM_REG_C, "FLAME_CTR2", MSR_P4_FLAME_PERFCTR2), +/* pmd14 */ PMD_D(PFM_REG_C, "FLAME_CTR3", MSR_P4_FLAME_PERFCTR3), +/* pmd15 */ PMD_D(PFM_REG_C, "IQ_CTR2", MSR_P4_IQ_PERFCTR1), +/* pmd16 */ PMD_D(PFM_REG_C, "IQ_CTR3", MSR_P4_IQ_PERFCTR3), +/* pmd17 */ PMD_D(PFM_REG_C, "IQ_CTR5", MSR_P4_IQ_PERFCTR5) +}; +#define PFM_P4_NUM_PMDS ARRAY_SIZE(pfm_p4_pmd_desc) + +/* + * Due to hotplug CPU support, threads may not necessarily + * be activated at the time the module is inserted. We need + * to check whether they could be activated by looking at + * the present CPU (present != online). + */ +static int pfm_p4_probe_pmu(void) +{ + unsigned int i; + int ht_enabled; + + /* + * only works on Intel processors + */ + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not running on Intel processor"); + return -1; + } + + if (cpu_data->x86 != 15) { + PFM_INFO("unsupported family=%d", cpu_data->x86); + return -1; + } + + switch(cpu_data->x86_model) { + case 0 ... 2: + break; + case 3 ... 6: + /* + * IQ_ESCR0, IQ_ESCR1 only present on model 1, 2 + */ + pfm_p4_pmc_desc[16].type = PFM_REG_NA; + pfm_p4_pmc_desc[48].type = PFM_REG_NA; + break; + default: + /* + * do not know if they all work the same, so reject + * for now + */ + if (!force) { + PFM_INFO("unsupported model %d", cpu_data->x86_model); + return -1; + } + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = (cpus_weight(cpu_core_map[smp_processor_id()]) + / cpu_data->x86_max_cores) > 1; +#else + ht_enabled = 0; +#endif + if (cpu_has_ht) { + + PFM_INFO("HyperThreading supported, status %s", + ht_enabled ? "on": "off"); + /* + * disable registers not supporting HT + */ + if (ht_enabled) { + PFM_INFO("disabling half the registers for HT"); + for (i = 0; i < PFM_P4_NUM_PMCS; i++) { + if (pfm_p4_pmu_info.pmc_addrs[(i)].reg_type & + PFM_REGT_NOHT) + pfm_p4_pmc_desc[i].type = PFM_REG_NA; + } + for (i = 0; i < PFM_P4_NUM_PMDS; i++) { + if (pfm_p4_pmu_info.pmd_addrs[(i)].reg_type & + PFM_REGT_NOHT) + pfm_p4_pmd_desc[i].type = PFM_REG_NA; + } + } + } + + if (cpu_has_ds) { + PFM_INFO("Data Save Area (DS) supported"); + + pfm_p4_pmu_info.flags = PFM_X86_FL_PMU_DS; + + if (cpu_has_pebs) { + /* + * PEBS does not work with HyperThreading enabled + */ + if (ht_enabled) { + PFM_INFO("PEBS supported, status off (because of HT)"); + } else { + pfm_p4_pmu_info.flags |= PFM_X86_FL_PMU_PEBS; + PFM_INFO("PEBS supported, status on"); + } + } + } + /* + * NMI using PMU? 
+ * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_p4_pmu_info.flags |= PFM_X86_FL_USE_NMI; + return 0; +} + +static struct pfm_pmu_config pfm_p4_pmu_conf={ + .pmu_name = "Intel P4", + .counter_width = 40, + .pmd_desc = pfm_p4_pmd_desc, + .pmc_desc = pfm_p4_pmc_desc, + .num_pmc_entries = PFM_P4_NUM_PMCS, + .num_pmd_entries = PFM_P4_NUM_PMDS, + .probe_pmu = pfm_p4_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_p4_pmu_info +}; + +static int __init pfm_p4_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p4_pmu_conf); +} + +static void __exit pfm_p4_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p4_pmu_conf); +} + +module_init(pfm_p4_pmu_init_module); +module_exit(pfm_p4_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_p6.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_p6.c @@ -0,0 +1,172 @@ +/* + * This file contains the P6 family processor PMU register description tables + * + * This module supports original P6 processors + * (Pentium II, Pentium Pro, Pentium III) and Pentium M. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("P6 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_P6_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_P6_PMC_VAL (1ULL<<20) +#define PFM_P6_NO64 (1ULL<<20) + +struct pfm_arch_pmu_info pfm_p6_pmu_info={ + .pmc_addrs = { + {{MSR_P6_EVNTSEL0, 0}, 0, PFM_REGT_EN}, /* has enable bit */ + {{MSR_P6_EVNTSEL1, 0}, 1, PFM_REGT_OTH} /* no enable bit */ + }, + .pmd_addrs = { + {{MSR_P6_PERFCTR0, 0}, 0, PFM_REGT_CTR}, + {{MSR_P6_PERFCTR1, 0}, 0, PFM_REGT_CTR} + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +static struct pfm_regmap_desc pfm_p6_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFEVTSEL0", PFM_P6_PMC_VAL, PFM_P6_PMC_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFEVTSEL1", PFM_P6_PMC_VAL, PFM_P6_PMC_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL1) +}; +#define PFM_P6_NUM_PMCS ARRAY_SIZE(pfm_p6_pmc_desc) + +static struct pfm_regmap_desc pfm_p6_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C , "PERFCTR0", MSR_P6_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C , "PERFCTR1", MSR_P6_PERFCTR1) +}; +#define PFM_P6_NUM_PMDS ARRAY_SIZE(pfm_p6_pmd_desc) + +static int pfm_p6_probe_pmu(void) +{ + int high, low; + + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + /* + * check for P6 processor family + */ + if (cpu_data->x86 != 6) { + PFM_INFO("unsupported family=%d", cpu_data->x86); + return -1; + } + + switch(cpu_data->x86_model) { + case 3: + case 5: /* Pentium II Deschutes */ + case 7 ... 11: + break; + case 13: + /* for Pentium M, we need to check if PMU exist */ + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (low & (1U << 7)) + break; + default: + PFM_INFO("unsupported CPU model %d", + cpu_data->x86_model); + return -1; + + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic"); + return -1; + } + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * we cannot have perfmon/nmi_watchdog running together as there + * is only one enable bit for both counters. + */ + if (nmi_watchdog == NMI_LOCAL_APIC) { + PFM_INFO("NMI watchdog using performance counters." + "perfmon cannot work correctly, reboot with nmi_watchdog=0"); + return -1; + } + + /* + * force NMI interrupt? + */ + if (force_nmi) + pfm_p6_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 31 bits only. 
+ * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ +static struct pfm_pmu_config pfm_p6_pmu_conf={ + .pmu_name = "Intel P6 processor Family", + .counter_width = 31, + .pmd_desc = pfm_p6_pmd_desc, + .pmc_desc = pfm_p6_pmc_desc, + .num_pmc_entries = PFM_P6_NUM_PMCS, + .num_pmd_entries = PFM_P6_NUM_PMDS, + .probe_pmu = pfm_p6_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_p6_pmu_info +}; + +static int __init pfm_p6_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p6_pmu_conf); +} + +static void __exit pfm_p6_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p6_pmu_conf); +} + +module_init(pfm_p6_pmu_init_module); +module_exit(pfm_p6_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_pebs_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_pebs_smpl.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the Precise Event Based Sampling (PEBS) + * sampling format. It supports the following processors: + * - 32-bit Pentium 4, Xeon, Core-based processors. + * - 64-bit Pentium 4, Xeon, Core-based processors. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Precise Event-Based Sampling (PEBS)"); +MODULE_LICENSE("GPL"); + +#define ALIGN_PEBS(a, order) \ + ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) + +#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ + +static int pfm_pebs_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_pebs_smpl_arg *arg = data; + size_t min_buf_size; + + /* + * need to define at least the size of the buffer + */ + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. 
+ */ + min_buf_size = sizeof(struct pfm_pebs_smpl_hdr) + + sizeof(struct pfm_pebs_smpl_entry) + + (1UL<buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_pebs_fmt_get_size(unsigned int flags, void *data, size_t *size) +{ + struct pfm_pebs_smpl_arg *arg = data; + + /* + * size has been validated in pfm_pebs_fmt_validate() + */ + *size = arg->buf_size + (1UL<ds; + + /* + * align PEBS buffer base + */ + pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); + pebs_end = pebs_start + arg->buf_size + 1; + + hdr->version = PFM_PEBS_SMPL_VERSION; + hdr->buf_size = arg->buf_size; + hdr->overflows = 0; + + /* + * express PEBS buffer base as offset from the end of the header + */ + hdr->start_offs = pebs_start - (unsigned long)(hdr+1); + + /* + * PEBS buffer boundaries + */ + ds->pebs_buf_base = pebs_start; + ds->pebs_abs_max = pebs_end; + + /* + * PEBS starting position + */ + ds->pebs_index = pebs_start; + + /* + * PEBS interrupt threshold + */ + ds->pebs_intr_thres = pebs_start + + arg->intr_thres + * sizeof(struct pfm_pebs_smpl_entry); + + /* + * save counter reset value for PEBS counter + */ + ds->pebs_cnt_reset = arg->cnt_reset; + + /* + * keep track of DS AREA + */ + ctx_arch->ds_area = (unsigned long)ds; + ctx_arch->flags |= PFM_X86_USE_PEBS; + + PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%lx " + "pebs_end=0x%lx ds=%p pebs_thres=0x%lx cnt_reset=0x%llx", + buf, + (unsigned long long)hdr->buf_size, + (unsigned long long)hdr->start_offs, + pebs_start, + pebs_end, + ds, + ds->pebs_intr_thres, + (unsigned long long)ds->pebs_cnt_reset); + + return 0; +} + +static int pfm_pebs_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_pebs_smpl_hdr *hdr; + + hdr = buf; + + PFM_DBG_ovfl("buffer full"); + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
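[Editorial aside, not part of the patch: to make the DS-area programming in pfm_pebs_fmt_init() above concrete, here is the buffer geometry with made-up numbers. The PMU appends one record per sampled event starting at pebs_buf_base and raises the overflow interrupt once pebs_index crosses pebs_intr_thres; the record size, buffer size and threshold below are assumptions of the example.]

    #include <stdio.h>

    int main(void)
    {
            unsigned long entry_size = 40;          /* assumed PEBS record size    */
            unsigned long buf_size   = 4096;        /* arg->buf_size               */
            unsigned long intr_thres = 90;          /* arg->intr_thres, in records */

            unsigned long pebs_start = 0x100000;    /* already 256-byte aligned    */
            unsigned long pebs_end   = pebs_start + buf_size;
            unsigned long thres      = pebs_start + intr_thres * entry_size;

            printf("PEBS area      : 0x%lx-0x%lx (%lu records)\n",
                   pebs_start, pebs_end, buf_size / entry_size);
            printf("interrupt when : pebs_index >= 0x%lx (record %lu)\n",
                   thres, intr_thres);
            printf("headroom       : %lu records past the threshold\n",
                   buf_size / entry_size - intr_thres);
            return 0;
    }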
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY| PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_pebs_fmt_restart(int is_active, u32 *ovfl_ctrl, + void *buf) +{ + struct pfm_pebs_smpl_hdr *hdr = buf; + + /* + * reset index to base of buffer + */ + hdr->ds.pebs_index = hdr->ds.pebs_buf_base; + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_pebs_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt pebs_fmt={ + .fmt_name = PFM_PEBS_SMPL_NAME, + .fmt_version = 0x1, + .fmt_arg_size = sizeof(struct pfm_pebs_smpl_arg), + .fmt_validate = pfm_pebs_fmt_validate, + .fmt_getsize = pfm_pebs_fmt_get_size, + .fmt_init = pfm_pebs_fmt_init, + .fmt_handler = pfm_pebs_fmt_handler, + .fmt_restart = pfm_pebs_fmt_restart, + .fmt_exit = pfm_pebs_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_pebs_fmt_init_module(void) +{ + int ht_enabled; + + if (!cpu_has_pebs) { + PFM_INFO("processor does not have PEBS support"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = cpus_weight(cpu_core_map[smp_processor_id()]) + / cpu_data->x86_max_cores > 1; +#else + ht_enabled = 0; +#endif + if (ht_enabled) { + PFM_INFO("PEBS not available because HyperThreading is on"); + return -1; + } + return pfm_fmt_register(&pebs_fmt); +} + +static void __exit pfm_pebs_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&pebs_fmt); +} + +module_init(pfm_pebs_fmt_init_module); +module_exit(pfm_pebs_fmt_cleanup_module); Index: linux-2.6/arch/ia64/Kconfig =================================================================== --- linux-2.6.orig/arch/ia64/Kconfig +++ linux-2.6/arch/ia64/Kconfig @@ -424,14 +424,6 @@ config COMPAT config IA64_MCA_RECOVERY tristate "MCA recovery from errors other than TLB." -config PERFMON - bool "Performance monitor support" - help - Selects whether support for the IA-64 performance monitor hardware - is included in the kernel. This makes some kernel data-structures a - little bigger and slows down execution a bit, but it is generally - a good idea to turn this on. If you're unsure, say Y. 
- config IA64_PALINFO tristate "/proc/pal support" help @@ -493,6 +485,8 @@ source "drivers/firmware/Kconfig" source "fs/Kconfig.binfmt" +source "arch/ia64/perfmon/Kconfig" + endmenu menu "Power management and ACPI" Index: linux-2.6/arch/ia64/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/Makefile +++ linux-2.6/arch/ia64/Makefile @@ -55,6 +55,7 @@ core-$(CONFIG_IA64_GENERIC) += arch/ia6 core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/ core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ +core-$(CONFIG_PERFMON) += arch/ia64/perfmon/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ Index: linux-2.6/arch/ia64/defconfig =================================================================== --- linux-2.6.orig/arch/ia64/defconfig +++ linux-2.6/arch/ia64/defconfig @@ -162,7 +162,6 @@ CONFIG_HAVE_ARCH_NODEDATA_EXTENSION=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y # CONFIG_MC_ERR_INJECT is not set CONFIG_SGI_SN=y @@ -184,6 +183,16 @@ CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m # +# Hardware Performance Monitoring support +# +CONFIG_PERFMON=y +CONFIG_IA64_PERFMON_COMPAT=y +CONFIG_IA64_PERFMON_GENERIC=m +CONFIG_IA64_PERFMON_ITANIUM=y +CONFIG_IA64_PERFMON_MCKINLEY=y +CONFIG_IA64_PERFMON_MONTECITO=y + +# # Power management and ACPI # CONFIG_PM=y Index: linux-2.6/arch/ia64/kernel/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/kernel/Makefile +++ linux-2.6/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o ptrace.o sal.o \ salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o @@ -23,7 +23,6 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o Index: linux-2.6/arch/ia64/kernel/entry.S =================================================================== --- linux-2.6.orig/arch/ia64/kernel/entry.S +++ linux-2.6/arch/ia64/kernel/entry.S @@ -1588,5 +1588,17 @@ sys_call_table: data8 sys_signalfd data8 sys_timerfd data8 sys_eventfd + data8 sys_pfm_create_context // 1310 + data8 sys_pfm_write_pmcs + data8 sys_pfm_write_pmds + data8 sys_pfm_read_pmds + data8 sys_pfm_load_context + data8 sys_pfm_start // 1305 + data8 sys_pfm_stop + data8 sys_pfm_restart + data8 sys_pfm_create_evtsets + data8 sys_pfm_getinfo_evtsets + data8 sys_pfm_delete_evtsets // 1310 + data8 sys_pfm_unload_context .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls Index: linux-2.6/arch/ia64/kernel/irq_ia64.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/irq_ia64.c +++ linux-2.6/arch/ia64/kernel/irq_ia64.c @@ -40,10 +40,6 @@ #include #include -#ifdef CONFIG_PERFMON -# include -#endif - #define IRQ_DEBUG 0 /* These can be overridden in platform_irq_init */ @@ -320,9 +316,6 @@ init_IRQ (void) register_percpu_irq(IA64_IPI_RESCHEDULE, 
&resched_irqaction); register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction); #endif -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif platform_irq_init(); } Index: linux-2.6/arch/ia64/kernel/perfmon.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon.c +++ /dev/null @@ -1,6879 +0,0 @@ -/* - * This file implements the perfmon-2 subsystem which is used - * to program the IA-64 Performance Monitoring Unit (PMU). - * - * The initial version of perfmon.c was written by - * Ganesh Venkitachalam, IBM Corp. - * - * Then it was modified for perfmon-1.x by Stephane Eranian and - * David Mosberger, Hewlett Packard Co. - * - * Version Perfmon-2.x is a rewrite of perfmon-1.x - * by Stephane Eranian, Hewlett Packard Co. - * - * Copyright (C) 1999-2005 Hewlett Packard Co - * Stephane Eranian - * David Mosberger-Tang - * - * More information about perfmon available at: - * http://www.hpl.hp.com/research/linux/perfmon - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_PERFMON -/* - * perfmon context state - */ -#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ -#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ -#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */ -#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */ - -#define PFM_INVALID_ACTIVATION (~0UL) - -#define PFM_NUM_PMC_REGS 64 /* PMC save area for ctxsw */ -#define PFM_NUM_PMD_REGS 64 /* PMD save area for ctxsw */ - -/* - * depth of message queue - */ -#define PFM_MAX_MSGS 32 -#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail) - -/* - * type of a PMU register (bitmask). 
- * bitmask structure: - * bit0 : register implemented - * bit1 : end marker - * bit2-3 : reserved - * bit4 : pmc has pmc.pm - * bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter - * bit6-7 : register type - * bit8-31: reserved - */ -#define PFM_REG_NOTIMPL 0x0 /* not implemented at all */ -#define PFM_REG_IMPL 0x1 /* register implemented */ -#define PFM_REG_END 0x2 /* end marker */ -#define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */ -#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */ -#define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */ -#define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ -#define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ - -#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END) -#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END) - -#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY) - -/* i assumed unsigned */ -#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL)) -#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL)) - -/* XXX: these assume that register i is implemented */ -#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) -#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) - -#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value -#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask -#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0] -#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0] - -#define PFM_NUM_IBRS IA64_NUM_DBG_REGS -#define PFM_NUM_DBRS IA64_NUM_DBG_REGS - -#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0) -#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling) -#define PFM_CTX_TASK(h) (h)->ctx_task - -#define PMU_PMC_OI 5 /* position of pmc.oi bit */ - -/* XXX: does not support more than 64 PMDs */ -#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask) -#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL) - -#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask) - -#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64) -#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64) -#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1) -#define PFM_CODE_RR 0 /* requesting code range restriction */ -#define PFM_DATA_RR 1 /* requestion data range restriction */ - -#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v) -#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v) -#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info) - -#define RDEP(x) (1UL<<(x)) - -/* - * context protection macros - * in SMP: - * - we need to protect against CPU concurrency (spin_lock) - * - we need to protect against PMU overflow interrupts (local_irq_disable) - * in UP: - * - we need to protect against PMU overflow interrupts (local_irq_disable) - * - * spin_lock_irqsave()/spin_unlock_irqrestore(): - * in SMP: local_irq_disable + spin_lock - * in UP : local_irq_disable - * - * spin_lock()/spin_lock(): - * in UP : removed automatically - * in SMP: protect 
against context accesses from other CPU. interrupts - * are not masked. This is useful for the PMU interrupt handler - * because we know we will not get PMU concurrency in that code. - */ -#define PROTECT_CTX(c, f) \ - do { \ - DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ - spin_lock_irqsave(&(c)->ctx_lock, f); \ - DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ - } while(0) - -#define UNPROTECT_CTX(c, f) \ - do { \ - DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ - spin_unlock_irqrestore(&(c)->ctx_lock, f); \ - } while(0) - -#define PROTECT_CTX_NOPRINT(c, f) \ - do { \ - spin_lock_irqsave(&(c)->ctx_lock, f); \ - } while(0) - - -#define UNPROTECT_CTX_NOPRINT(c, f) \ - do { \ - spin_unlock_irqrestore(&(c)->ctx_lock, f); \ - } while(0) - - -#define PROTECT_CTX_NOIRQ(c) \ - do { \ - spin_lock(&(c)->ctx_lock); \ - } while(0) - -#define UNPROTECT_CTX_NOIRQ(c) \ - do { \ - spin_unlock(&(c)->ctx_lock); \ - } while(0) - - -#ifdef CONFIG_SMP - -#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number) -#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++ -#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION() - -#else /* !CONFIG_SMP */ -#define SET_ACTIVATION(t) do {} while(0) -#define GET_ACTIVATION(t) do {} while(0) -#define INC_ACTIVATION(t) do {} while(0) -#endif /* CONFIG_SMP */ - -#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0) -#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner) -#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx) - -#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g) -#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g) - -#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0) - -/* - * cmp0 must be the value of pmc0 - */ -#define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL) - -#define PFMFS_MAGIC 0xa0b4d889 - -/* - * debugging - */ -#define PFM_DEBUGGING 1 -#ifdef PFM_DEBUGGING -#define DPRINT(a) \ - do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ - } while (0) - -#define DPRINT_ovfl(a) \ - do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ - } while (0) -#endif - -/* - * 64-bit software counter structure - * - * the next_reset_type is applied to the next call to pfm_reset_regs() - */ -typedef struct { - unsigned long val; /* virtual 64bit counter value */ - unsigned long lval; /* last reset value */ - unsigned long long_reset; /* reset value on sampling overflow */ - unsigned long short_reset; /* reset value on overflow */ - unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */ - unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */ - unsigned long seed; /* seed for random-number generator */ - unsigned long mask; /* mask for random-number generator */ - unsigned int flags; /* notify/do not notify */ - unsigned long eventid; /* overflow event identifier */ -} pfm_counter_t; - -/* - * context flags - */ -typedef struct { - unsigned int block:1; /* when 1, task will blocked on user notifications */ - unsigned int system:1; /* do system wide monitoring */ - unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ - unsigned int is_sampling:1; /* true if using a 
custom format */ - unsigned int excl_idle:1; /* exclude idle task in system wide session */ - unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ - unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ - unsigned int no_msg:1; /* no message sent on overflow */ - unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ - unsigned int reserved:22; -} pfm_context_flags_t; - -#define PFM_TRAP_REASON_NONE 0x0 /* default value */ -#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */ -#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */ - - -/* - * perfmon context: encapsulates all the state of a monitoring session - */ - -typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ - - pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ - unsigned int ctx_state; /* state: active/inactive (no bitfield) */ - - struct task_struct *ctx_task; /* task to which context is attached */ - - unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */ - - struct completion ctx_restart_done; /* use for blocking notification mode */ - - unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */ - unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */ - unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */ - - unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */ - unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */ - unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */ - - unsigned long ctx_pmcs[PFM_NUM_PMC_REGS]; /* saved copies of PMC values */ - - unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */ - unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */ - unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */ - unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */ - - pfm_counter_t ctx_pmds[PFM_NUM_PMD_REGS]; /* software state for PMDS */ - - unsigned long th_pmcs[PFM_NUM_PMC_REGS]; /* PMC thread save state */ - unsigned long th_pmds[PFM_NUM_PMD_REGS]; /* PMD thread save state */ - - u64 ctx_saved_psr_up; /* only contains psr.up value */ - - unsigned long ctx_last_activation; /* context last activation number for last_cpu */ - unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ - unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */ - - int ctx_fd; /* file descriptor used my this context */ - pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */ - - pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */ - void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */ - unsigned long ctx_smpl_size; /* size of sampling buffer */ - void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */ - - wait_queue_head_t ctx_msgq_wait; - pfm_msg_t ctx_msgq[PFM_MAX_MSGS]; - int ctx_msgq_head; - int ctx_msgq_tail; - struct fasync_struct *ctx_async_queue; - - wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */ -} pfm_context_t; - -/* - * magic number used to verify that structure is really - * a perfmon context - */ -#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops) - -#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context) - -#ifdef CONFIG_SMP -#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v) -#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu -#else -#define 
SET_LAST_CPU(ctx, v) do {} while(0) -#define GET_LAST_CPU(ctx) do {} while(0) -#endif - - -#define ctx_fl_block ctx_flags.block -#define ctx_fl_system ctx_flags.system -#define ctx_fl_using_dbreg ctx_flags.using_dbreg -#define ctx_fl_is_sampling ctx_flags.is_sampling -#define ctx_fl_excl_idle ctx_flags.excl_idle -#define ctx_fl_going_zombie ctx_flags.going_zombie -#define ctx_fl_trap_reason ctx_flags.trap_reason -#define ctx_fl_no_msg ctx_flags.no_msg -#define ctx_fl_can_restart ctx_flags.can_restart - -#define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); -#define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking - -/* - * global information about all sessions - * mostly used to synchronize between system wide and per-process - */ -typedef struct { - spinlock_t pfs_lock; /* lock the structure */ - - unsigned int pfs_task_sessions; /* number of per task sessions */ - unsigned int pfs_sys_sessions; /* number of per system wide sessions */ - unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */ - unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */ - struct task_struct *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */ -} pfm_session_t; - -/* - * information about a PMC or PMD. - * dep_pmd[]: a bitmask of dependent PMD registers - * dep_pmc[]: a bitmask of dependent PMC registers - */ -typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); -typedef struct { - unsigned int type; - int pm_pos; - unsigned long default_value; /* power-on default value */ - unsigned long reserved_mask; /* bitmask of reserved bits */ - pfm_reg_check_t read_check; - pfm_reg_check_t write_check; - unsigned long dep_pmd[4]; - unsigned long dep_pmc[4]; -} pfm_reg_desc_t; - -/* assume cnum is a valid monitor */ -#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1) - -/* - * This structure is initialized at boot time and contains - * a description of the PMU main characteristics. 
- * - * If the probe function is defined, detection is based - * on its return value: - * - 0 means recognized PMU - * - anything else means not supported - * When the probe function is not defined, then the pmu_family field - * is used and it must match the host CPU family such that: - * - cpu->family & config->pmu_family != 0 - */ -typedef struct { - unsigned long ovfl_val; /* overflow value for counters */ - - pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */ - pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */ - - unsigned int num_pmcs; /* number of PMCS: computed at init time */ - unsigned int num_pmds; /* number of PMDS: computed at init time */ - unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */ - unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */ - - char *pmu_name; /* PMU family name */ - unsigned int pmu_family; /* cpuid family pattern used to identify pmu */ - unsigned int flags; /* pmu specific flags */ - unsigned int num_ibrs; /* number of IBRS: computed at init time */ - unsigned int num_dbrs; /* number of DBRS: computed at init time */ - unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */ - int (*probe)(void); /* customized probe routine */ - unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */ -} pmu_config_t; -/* - * PMU specific flags - */ -#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */ - -/* - * debug register related type definitions - */ -typedef struct { - unsigned long ibr_mask:56; - unsigned long ibr_plm:4; - unsigned long ibr_ig:3; - unsigned long ibr_x:1; -} ibr_mask_reg_t; - -typedef struct { - unsigned long dbr_mask:56; - unsigned long dbr_plm:4; - unsigned long dbr_ig:2; - unsigned long dbr_w:1; - unsigned long dbr_r:1; -} dbr_mask_reg_t; - -typedef union { - unsigned long val; - ibr_mask_reg_t ibr; - dbr_mask_reg_t dbr; -} dbreg_t; - - -/* - * perfmon command descriptions - */ -typedef struct { - int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - char *cmd_name; - int cmd_flags; - unsigned int cmd_narg; - size_t cmd_argsize; - int (*cmd_getsize)(void *arg, size_t *sz); -} pfm_cmd_desc_t; - -#define PFM_CMD_FD 0x01 /* command requires a file descriptor */ -#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */ -#define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */ -#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */ - - -#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name -#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ) -#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW) -#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD) -#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP) - -#define PFM_CMD_ARG_MANY -1 /* cannot be zero */ - -typedef struct { - unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ - unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ - unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ - unsigned long pfm_smpl_handler_calls; - unsigned long 
pfm_smpl_handler_cycles; - char pad[SMP_CACHE_BYTES] ____cacheline_aligned; -} pfm_stats_t; - -/* - * perfmon internal variables - */ -static pfm_stats_t pfm_stats[NR_CPUS]; -static pfm_session_t pfm_sessions; /* global sessions information */ - -static DEFINE_SPINLOCK(pfm_alt_install_check); -static pfm_intr_handler_desc_t *pfm_alt_intr_handler; - -static struct proc_dir_entry *perfmon_dir; -static pfm_uuid_t pfm_null_uuid = {0,}; - -static spinlock_t pfm_buffer_fmt_lock; -static LIST_HEAD(pfm_buffer_fmt_list); - -static pmu_config_t *pmu_conf; - -/* sysctl() controls */ -pfm_sysctl_t pfm_sysctl; -EXPORT_SYMBOL(pfm_sysctl); - -static ctl_table pfm_ctl_table[]={ - { - .ctl_name = CTL_UNNUMBERED, - .procname = "debug", - .data = &pfm_sysctl.debug, - .maxlen = sizeof(int), - .mode = 0666, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "debug_ovfl", - .data = &pfm_sysctl.debug_ovfl, - .maxlen = sizeof(int), - .mode = 0666, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "fastctxsw", - .data = &pfm_sysctl.fastctxsw, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "expert_mode", - .data = &pfm_sysctl.expert_mode, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = &proc_dointvec, - }, - {} -}; -static ctl_table pfm_sysctl_dir[] = { - { - .ctl_name = CTL_UNNUMBERED, - .procname = "perfmon", - .mode = 0755, - .child = pfm_ctl_table, - }, - {} -}; -static ctl_table pfm_sysctl_root[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0755, - .child = pfm_sysctl_dir, - }, - {} -}; -static struct ctl_table_header *pfm_sysctl_header; - -static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - -#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) -#define pfm_get_cpu_data(a,b) per_cpu(a, b) - -static inline void -pfm_put_task(struct task_struct *task) -{ - if (task != current) put_task_struct(task); -} - -static inline void -pfm_set_task_notify(struct task_struct *task) -{ - struct thread_info *info; - - info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE); - set_bit(TIF_NOTIFY_RESUME, &info->flags); -} - -static inline void -pfm_clear_task_notify(void) -{ - clear_thread_flag(TIF_NOTIFY_RESUME); -} - -static inline void -pfm_reserve_page(unsigned long a) -{ - SetPageReserved(vmalloc_to_page((void *)a)); -} -static inline void -pfm_unreserve_page(unsigned long a) -{ - ClearPageReserved(vmalloc_to_page((void*)a)); -} - -static inline unsigned long -pfm_protect_ctx_ctxsw(pfm_context_t *x) -{ - spin_lock(&(x)->ctx_lock); - return 0UL; -} - -static inline void -pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f) -{ - spin_unlock(&(x)->ctx_lock); -} - -static inline unsigned int -pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct) -{ - return do_munmap(mm, addr, len); -} - -static inline unsigned long -pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec) -{ - return get_unmapped_area(file, addr, len, pgoff, flags); -} - - -static int -pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); -} - -static struct file_system_type pfm_fs_type = { - .name = "pfmfs", - .get_sb = pfmfs_get_sb, - .kill_sb = kill_anon_super, -}; - 
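
The soft-counter helpers further down in this file (pfm_read_soft_counter()/pfm_write_soft_counter()) implement the virtual 64-bit counter described by pfm_counter_t above: the hardware PMD only holds the low, implemented bits (masked by pmu_conf->ovfl_val) while the upper bits are kept in software in ctx_pmds[].val. A minimal standalone sketch of that scheme, with hypothetical names and an assumed 47-bit PMD width (not part of the patch), is:

/*
 * Sketch only: virtual 64-bit counter split between a narrow hardware
 * register and a software-maintained upper part. "ovfl_val" plays the
 * role of pmu_conf->ovfl_val (mask of bits the PMD actually implements);
 * hw_pmd stands in for ia64_get_pmd()/ia64_set_pmd().
 */
#include <stdio.h>

static unsigned long soft_val;   /* software part: bits above the PMD width */
static unsigned long hw_pmd;     /* stand-in for the live hardware PMD */
static const unsigned long ovfl_val = (1UL << 47) - 1;  /* assumed width */

static unsigned long read_soft_counter(void)
{
	/* full 64-bit value = software upper part + live hardware low part */
	return soft_val + (hw_pmd & ovfl_val);
}

static void write_soft_counter(unsigned long val)
{
	soft_val = val & ~ovfl_val;  /* keep the upper bits in software     */
	hw_pmd   = val &  ovfl_val;  /* program only the implemented bits   */
}

int main(void)
{
	write_soft_counter(0x0123456789abcdefUL);
	hw_pmd += 5;                 /* pretend the counter ticked 5 times  */
	printf("count=0x%lx\n", read_soft_counter());
	return 0;
}

On overflow the real code adds the wrapped hardware value into ctx_pmds[].val (see pfm_mask_monitoring() below), which is why only the low ovfl_val bits are ever written back to the PMD.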
-DEFINE_PER_CPU(unsigned long, pfm_syst_info); -DEFINE_PER_CPU(struct task_struct *, pmu_owner); -DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); -DEFINE_PER_CPU(unsigned long, pmu_activation_number); -EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info); - - -/* forward declaration */ -static const struct file_operations pfm_file_ops; - -/* - * forward declarations - */ -#ifndef CONFIG_SMP -static void pfm_lazy_save_regs (struct task_struct *ta); -#endif - -void dump_pmu_state(const char *); -static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - -#include "perfmon_itanium.h" -#include "perfmon_mckinley.h" -#include "perfmon_montecito.h" -#include "perfmon_generic.h" - -static pmu_config_t *pmu_confs[]={ - &pmu_conf_mont, - &pmu_conf_mck, - &pmu_conf_ita, - &pmu_conf_gen, /* must be last */ - NULL -}; - - -static int pfm_end_notify_user(pfm_context_t *ctx); - -static inline void -pfm_clear_psr_pp(void) -{ - ia64_rsm(IA64_PSR_PP); - ia64_srlz_i(); -} - -static inline void -pfm_set_psr_pp(void) -{ - ia64_ssm(IA64_PSR_PP); - ia64_srlz_i(); -} - -static inline void -pfm_clear_psr_up(void) -{ - ia64_rsm(IA64_PSR_UP); - ia64_srlz_i(); -} - -static inline void -pfm_set_psr_up(void) -{ - ia64_ssm(IA64_PSR_UP); - ia64_srlz_i(); -} - -static inline unsigned long -pfm_get_psr(void) -{ - unsigned long tmp; - tmp = ia64_getreg(_IA64_REG_PSR); - ia64_srlz_i(); - return tmp; -} - -static inline void -pfm_set_psr_l(unsigned long val) -{ - ia64_setreg(_IA64_REG_PSR_L, val); - ia64_srlz_i(); -} - -static inline void -pfm_freeze_pmu(void) -{ - ia64_set_pmc(0,1UL); - ia64_srlz_d(); -} - -static inline void -pfm_unfreeze_pmu(void) -{ - ia64_set_pmc(0,0UL); - ia64_srlz_d(); -} - -static inline void -pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs) -{ - int i; - - for (i=0; i < nibrs; i++) { - ia64_set_ibr(i, ibrs[i]); - ia64_dv_serialize_instruction(); - } - ia64_srlz_i(); -} - -static inline void -pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs) -{ - int i; - - for (i=0; i < ndbrs; i++) { - ia64_set_dbr(i, dbrs[i]); - ia64_dv_serialize_data(); - } - ia64_srlz_d(); -} - -/* - * PMD[i] must be a counter. no check is made - */ -static inline unsigned long -pfm_read_soft_counter(pfm_context_t *ctx, int i) -{ - return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val); -} - -/* - * PMD[i] must be a counter. 
no check is made - */ -static inline void -pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) -{ - unsigned long ovfl_val = pmu_conf->ovfl_val; - - ctx->ctx_pmds[i].val = val & ~ovfl_val; - /* - * writing to unimplemented part is ignore, so we do not need to - * mask off top part - */ - ia64_set_pmd(i, val & ovfl_val); -} - -static pfm_msg_t * -pfm_get_new_msg(pfm_context_t *ctx) -{ - int idx, next; - - next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS; - - DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - if (next == ctx->ctx_msgq_head) return NULL; - - idx = ctx->ctx_msgq_tail; - ctx->ctx_msgq_tail = next; - - DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx)); - - return ctx->ctx_msgq+idx; -} - -static pfm_msg_t * -pfm_get_next_msg(pfm_context_t *ctx) -{ - pfm_msg_t *msg; - - DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - - if (PFM_CTXQ_EMPTY(ctx)) return NULL; - - /* - * get oldest message - */ - msg = ctx->ctx_msgq+ctx->ctx_msgq_head; - - /* - * and move forward - */ - ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS; - - DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type)); - - return msg; -} - -static void -pfm_reset_msgq(pfm_context_t *ctx) -{ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - DPRINT(("ctx=%p msgq reset\n", ctx)); -} - -static void * -pfm_rvmalloc(unsigned long size) -{ - void *mem; - unsigned long addr; - - size = PAGE_ALIGN(size); - mem = vmalloc(size); - if (mem) { - //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); - memset(mem, 0, size); - addr = (unsigned long)mem; - while (size > 0) { - pfm_reserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - } - return mem; -} - -static void -pfm_rvfree(void *mem, unsigned long size) -{ - unsigned long addr; - - if (mem) { - DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); - addr = (unsigned long) mem; - while ((long) size > 0) { - pfm_unreserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - vfree(mem); - } - return; -} - -static pfm_context_t * -pfm_context_alloc(void) -{ - pfm_context_t *ctx; - - /* - * allocate context descriptor - * must be able to free with interrupts disabled - */ - ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL); - if (ctx) { - DPRINT(("alloc ctx @%p\n", ctx)); - } - return ctx; -} - -static void -pfm_context_free(pfm_context_t *ctx) -{ - if (ctx) { - DPRINT(("free ctx @%p\n", ctx)); - kfree(ctx); - } -} - -static void -pfm_mask_monitoring(struct task_struct *task) -{ - pfm_context_t *ctx = PFM_GET_CTX(task); - unsigned long mask, val, ovfl_mask; - int i; - - DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); - - ovfl_mask = pmu_conf->ovfl_val; - /* - * monitoring can only be masked as a result of a valid - * counter overflow. In UP, it means that the PMU still - * has an owner. Note that the owner can be different - * from the current task. However the PMU state belongs - * to the owner. - * In SMP, a valid overflow only happens when task is - * current. Therefore if we come here, we know that - * the PMU state belongs to the current task, therefore - * we can access the live registers. - * - * So in both cases, the live register contains the owner's - * state. We can ONLY touch the PMU registers and NOT the PSR. 
- * - * As a consequence to this call, the ctx->th_pmds[] array - * contains stale information which must be ignored - * when context is reloaded AND monitoring is active (see - * pfm_restart). - */ - mask = ctx->ctx_used_pmds[0]; - for (i = 0; mask; i++, mask>>=1) { - /* skip non used pmds */ - if ((mask & 0x1) == 0) continue; - val = ia64_get_pmd(i); - - if (PMD_IS_COUNTING(i)) { - /* - * we rebuild the full 64 bit value of the counter - */ - ctx->ctx_pmds[i].val += (val & ovfl_mask); - } else { - ctx->ctx_pmds[i].val = val; - } - DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", - i, - ctx->ctx_pmds[i].val, - val & ovfl_mask)); - } - /* - * mask monitoring by setting the privilege level to 0 - * we cannot use psr.pp/psr.up for this, it is controlled by - * the user - * - * if task is current, modify actual registers, otherwise modify - * thread save state, i.e., what will be restored in pfm_load_regs() - */ - mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; - for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0UL) continue; - ia64_set_pmc(i, ctx->th_pmcs[i] & ~0xfUL); - ctx->th_pmcs[i] &= ~0xfUL; - DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); - } - /* - * make all of this visible - */ - ia64_srlz_d(); -} - -/* - * must always be done with task == current - * - * context must be in MASKED state when calling - */ -static void -pfm_restore_monitoring(struct task_struct *task) -{ - pfm_context_t *ctx = PFM_GET_CTX(task); - unsigned long mask, ovfl_mask; - unsigned long psr, val; - int i, is_system; - - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - - if (task != current) { - printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); - return; - } - if (ctx->ctx_state != PFM_CTX_MASKED) { - printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, - task->pid, current->pid, ctx->ctx_state); - return; - } - psr = pfm_get_psr(); - /* - * monitoring is masked via the PMC. - * As we restore their value, we do not want each counter to - * restart right away. We stop monitoring using the PSR, - * restore the PMC (and PMD) and then re-establish the psr - * as it was. Note that there can be no pending overflow at - * this point, because monitoring was MASKED. 
- * - * system-wide session are pinned and self-monitoring - */ - if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { - /* disable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); - pfm_clear_psr_pp(); - } else { - pfm_clear_psr_up(); - } - /* - * first, we restore the PMD - */ - mask = ctx->ctx_used_pmds[0]; - for (i = 0; mask; i++, mask>>=1) { - /* skip non used pmds */ - if ((mask & 0x1) == 0) continue; - - if (PMD_IS_COUNTING(i)) { - /* - * we split the 64bit value according to - * counter width - */ - val = ctx->ctx_pmds[i].val & ovfl_mask; - ctx->ctx_pmds[i].val &= ~ovfl_mask; - } else { - val = ctx->ctx_pmds[i].val; - } - ia64_set_pmd(i, val); - - DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", - i, - ctx->ctx_pmds[i].val, - val)); - } - /* - * restore the PMCs - */ - mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; - for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0UL) continue; - ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; - ia64_set_pmc(i, ctx->th_pmcs[i]); - DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, ctx->th_pmcs[i])); - } - ia64_srlz_d(); - - /* - * must restore DBR/IBR because could be modified while masked - * XXX: need to optimize - */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - - /* - * now restore PSR - */ - if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { - /* enable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); - ia64_srlz_i(); - } - pfm_set_psr_l(psr); -} - -static inline void -pfm_save_pmds(unsigned long *pmds, unsigned long mask) -{ - int i; - - ia64_srlz_d(); - - for (i=0; mask; i++, mask>>=1) { - if (mask & 0x1) pmds[i] = ia64_get_pmd(i); - } -} - -/* - * reload from thread state (used for ctxw only) - */ -static inline void -pfm_restore_pmds(unsigned long *pmds, unsigned long mask) -{ - int i; - unsigned long val, ovfl_val = pmu_conf->ovfl_val; - - for (i=0; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0) continue; - val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i]; - ia64_set_pmd(i, val); - } - ia64_srlz_d(); -} - -/* - * propagate PMD from context to thread-state - */ -static inline void -pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) -{ - unsigned long ovfl_val = pmu_conf->ovfl_val; - unsigned long mask = ctx->ctx_all_pmds[0]; - unsigned long val; - int i; - - DPRINT(("mask=0x%lx\n", mask)); - - for (i=0; mask; i++, mask>>=1) { - - val = ctx->ctx_pmds[i].val; - - /* - * We break up the 64 bit value into 2 pieces - * the lower bits go to the machine state in the - * thread (will be reloaded on ctxsw in). - * The upper part stays in the soft-counter. 
- */ - if (PMD_IS_COUNTING(i)) { - ctx->ctx_pmds[i].val = val & ~ovfl_val; - val &= ovfl_val; - } - ctx->th_pmds[i] = val; - - DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n", - i, - ctx->th_pmds[i], - ctx->ctx_pmds[i].val)); - } -} - -/* - * propagate PMC from context to thread-state - */ -static inline void -pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) -{ - unsigned long mask = ctx->ctx_all_pmcs[0]; - int i; - - DPRINT(("mask=0x%lx\n", mask)); - - for (i=0; mask; i++, mask>>=1) { - /* masking 0 with ovfl_val yields 0 */ - ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; - DPRINT(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); - } -} - - - -static inline void -pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask) -{ - int i; - - for (i=0; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0) continue; - ia64_set_pmc(i, pmcs[i]); - } - ia64_srlz_d(); -} - -static inline int -pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b) -{ - return memcmp(a, b, sizeof(pfm_uuid_t)); -} - -static inline int -pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs); - return ret; -} - -static inline int -pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size) -{ - int ret = 0; - if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size); - return ret; -} - - -static inline int -pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, - int cpu, void *arg) -{ - int ret = 0; - if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg); - return ret; -} - -static inline int -pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags, - int cpu, void *arg) -{ - int ret = 0; - if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg); - return ret; -} - -static inline int -pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs); - return ret; -} - -static inline int -pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs); - return ret; -} - -static pfm_buffer_fmt_t * -__pfm_find_buffer_fmt(pfm_uuid_t uuid) -{ - struct list_head * pos; - pfm_buffer_fmt_t * entry; - - list_for_each(pos, &pfm_buffer_fmt_list) { - entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) - return entry; - } - return NULL; -} - -/* - * find a buffer format based on its uuid - */ -static pfm_buffer_fmt_t * -pfm_find_buffer_fmt(pfm_uuid_t uuid) -{ - pfm_buffer_fmt_t * fmt; - spin_lock(&pfm_buffer_fmt_lock); - fmt = __pfm_find_buffer_fmt(uuid); - spin_unlock(&pfm_buffer_fmt_lock); - return fmt; -} - -int -pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) -{ - int ret = 0; - - /* some sanity checks */ - if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL; - - /* we need at least a handler */ - if (fmt->fmt_handler == NULL) return -EINVAL; - - /* - * XXX: need check validity of fmt_arg_size - */ - - spin_lock(&pfm_buffer_fmt_lock); - - if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { - printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name); - ret = -EBUSY; - 
goto out; - } - list_add(&fmt->fmt_list, &pfm_buffer_fmt_list); - printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name); - -out: - spin_unlock(&pfm_buffer_fmt_lock); - return ret; -} -EXPORT_SYMBOL(pfm_register_buffer_fmt); - -int -pfm_unregister_buffer_fmt(pfm_uuid_t uuid) -{ - pfm_buffer_fmt_t *fmt; - int ret = 0; - - spin_lock(&pfm_buffer_fmt_lock); - - fmt = __pfm_find_buffer_fmt(uuid); - if (!fmt) { - printk(KERN_ERR "perfmon: cannot unregister format, not found\n"); - ret = -EINVAL; - goto out; - } - list_del_init(&fmt->fmt_list); - printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name); - -out: - spin_unlock(&pfm_buffer_fmt_lock); - return ret; - -} -EXPORT_SYMBOL(pfm_unregister_buffer_fmt); - -extern void update_pal_halt_status(int); - -static int -pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) -{ - unsigned long flags; - /* - * validity checks on cpu_mask have been done upstream - */ - LOCK_PFS(flags); - - DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - if (is_syswide) { - /* - * cannot mix system wide and per-task sessions - */ - if (pfm_sessions.pfs_task_sessions > 0UL) { - DPRINT(("system wide not possible, %u conflicting task_sessions\n", - pfm_sessions.pfs_task_sessions)); - goto abort; - } - - if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict; - - DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id())); - - pfm_sessions.pfs_sys_session[cpu] = task; - - pfm_sessions.pfs_sys_sessions++ ; - - } else { - if (pfm_sessions.pfs_sys_sessions) goto abort; - pfm_sessions.pfs_task_sessions++; - } - - DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - /* - * disable default_idle() to go to PAL_HALT - */ - update_pal_halt_status(0); - - UNLOCK_PFS(flags); - - return 0; - -error_conflict: - DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n", - pfm_sessions.pfs_sys_session[cpu]->pid, - cpu)); -abort: - UNLOCK_PFS(flags); - - return -EBUSY; - -} - -static int -pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) -{ - unsigned long flags; - /* - * validity checks on cpu_mask have been done upstream - */ - LOCK_PFS(flags); - - DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - - if (is_syswide) { - pfm_sessions.pfs_sys_session[cpu] = NULL; - /* - * would not work with perfmon+more than one bit in cpu_mask - */ - if (ctx && ctx->ctx_fl_using_dbreg) { - if (pfm_sessions.pfs_sys_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx); - } else { - pfm_sessions.pfs_sys_use_dbregs--; - } - } - pfm_sessions.pfs_sys_sessions--; - } else { - pfm_sessions.pfs_task_sessions--; - } - DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - /* - * if possible, enable default_idle() to go into PAL_HALT - */ - if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0) - 
update_pal_halt_status(1); - - UNLOCK_PFS(flags); - - return 0; -} - -/* - * removes virtual mapping of the sampling buffer. - * IMPORTANT: cannot be called with interrupts disable, e.g. inside - * a PROTECT_CTX() section. - */ -static int -pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size) -{ - int r; - - /* sanity checks */ - if (task->mm == NULL || size == 0UL || vaddr == NULL) { - printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); - return -EINVAL; - } - - DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size)); - - /* - * does the actual unmapping - */ - down_write(&task->mm->mmap_sem); - - DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size)); - - r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0); - - up_write(&task->mm->mmap_sem); - if (r !=0) { - printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); - } - - DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); - - return 0; -} - -/* - * free actual physical storage used by sampling buffer - */ -#if 0 -static int -pfm_free_smpl_buffer(pfm_context_t *ctx) -{ - pfm_buffer_fmt_t *fmt; - - if (ctx->ctx_smpl_hdr == NULL) goto invalid_free; - - /* - * we won't use the buffer format anymore - */ - fmt = ctx->ctx_buf_fmt; - - DPRINT(("sampling buffer @%p size %lu vaddr=%p\n", - ctx->ctx_smpl_hdr, - ctx->ctx_smpl_size, - ctx->ctx_smpl_vaddr)); - - pfm_buf_fmt_exit(fmt, current, NULL, NULL); - - /* - * free the buffer - */ - pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); - - ctx->ctx_smpl_hdr = NULL; - ctx->ctx_smpl_size = 0UL; - - return 0; - -invalid_free: - printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); - return -EINVAL; -} -#endif - -static inline void -pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt) -{ - if (fmt == NULL) return; - - pfm_buf_fmt_exit(fmt, current, NULL, NULL); - -} - -/* - * pfmfs should _never_ be mounted by userland - too much of security hassle, - * no real gain from having the whole whorehouse mounted. So we don't need - * any operations on the root directory. However, we need a non-trivial - * d_name - pfm: will go nicely and kill the special-casing in procfs. 
- */ -static struct vfsmount *pfmfs_mnt; - -static int __init -init_pfm_fs(void) -{ - int err = register_filesystem(&pfm_fs_type); - if (!err) { - pfmfs_mnt = kern_mount(&pfm_fs_type); - err = PTR_ERR(pfmfs_mnt); - if (IS_ERR(pfmfs_mnt)) - unregister_filesystem(&pfm_fs_type); - else - err = 0; - } - return err; -} - -static void __exit -exit_pfm_fs(void) -{ - unregister_filesystem(&pfm_fs_type); - mntput(pfmfs_mnt); -} - -static ssize_t -pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) -{ - pfm_context_t *ctx; - pfm_msg_t *msg; - ssize_t ret; - unsigned long flags; - DECLARE_WAITQUEUE(wait, current); - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); - return -EINVAL; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); - return -EINVAL; - } - - /* - * check even when there is no message - */ - if (size < sizeof(pfm_msg_t)) { - DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t))); - return -EINVAL; - } - - PROTECT_CTX(ctx, flags); - - /* - * put ourselves on the wait queue - */ - add_wait_queue(&ctx->ctx_msgq_wait, &wait); - - - for(;;) { - /* - * check wait queue - */ - - set_current_state(TASK_INTERRUPTIBLE); - - DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - - ret = 0; - if(PFM_CTXQ_EMPTY(ctx) == 0) break; - - UNPROTECT_CTX(ctx, flags); - - /* - * check non-blocking read - */ - ret = -EAGAIN; - if(filp->f_flags & O_NONBLOCK) break; - - /* - * check pending signals - */ - if(signal_pending(current)) { - ret = -EINTR; - break; - } - /* - * no message, so wait - */ - schedule(); - - PROTECT_CTX(ctx, flags); - } - DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); - set_current_state(TASK_RUNNING); - remove_wait_queue(&ctx->ctx_msgq_wait, &wait); - - if (ret < 0) goto abort; - - ret = -EINVAL; - msg = pfm_get_next_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); - goto abort_locked; - } - - DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type)); - - ret = -EFAULT; - if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t); - -abort_locked: - UNPROTECT_CTX(ctx, flags); -abort: - return ret; -} - -static ssize_t -pfm_write(struct file *file, const char __user *ubuf, - size_t size, loff_t *ppos) -{ - DPRINT(("pfm_write called\n")); - return -EINVAL; -} - -static unsigned int -pfm_poll(struct file *filp, poll_table * wait) -{ - pfm_context_t *ctx; - unsigned long flags; - unsigned int mask = 0; - - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); - return 0; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); - return 0; - } - - - DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd)); - - poll_wait(filp, &ctx->ctx_msgq_wait, wait); - - PROTECT_CTX(ctx, flags); - - if (PFM_CTXQ_EMPTY(ctx) == 0) - mask = POLLIN | POLLRDNORM; - - UNPROTECT_CTX(ctx, flags); - - DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask)); - - return mask; -} - -static int -pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ - DPRINT(("pfm_ioctl called\n")); - return -EINVAL; -} - -/* - * interrupt cannot be masked when coming here - */ -static inline int -pfm_do_fasync(int fd, struct file 
*filp, pfm_context_t *ctx, int on) -{ - int ret; - - ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue); - - DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - current->pid, - fd, - on, - ctx->ctx_async_queue, ret)); - - return ret; -} - -static int -pfm_fasync(int fd, struct file *filp, int on) -{ - pfm_context_t *ctx; - int ret; - - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); - return -EBADF; - } - /* - * we cannot mask interrupts during this call because this may - * may go to sleep if memory is not readily avalaible. - * - * We are protected from the conetxt disappearing by the get_fd()/put_fd() - * done in caller. Serialization of this function is ensured by caller. - */ - ret = pfm_do_fasync(fd, filp, ctx, on); - - - DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - fd, - on, - ctx->ctx_async_queue, ret)); - - return ret; -} - -#ifdef CONFIG_SMP -/* - * this function is exclusively called from pfm_close(). - * The context is not protected at that time, nor are interrupts - * on the remote CPU. That's necessary to avoid deadlocks. - */ -static void -pfm_syswide_force_stop(void *info) -{ - pfm_context_t *ctx = (pfm_context_t *)info; - struct pt_regs *regs = task_pt_regs(current); - struct task_struct *owner; - unsigned long flags; - int ret; - - if (ctx->ctx_cpu != smp_processor_id()) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n", - ctx->ctx_cpu, - smp_processor_id()); - return; - } - owner = GET_PMU_OWNER(); - if (owner != ctx->ctx_task) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n", - smp_processor_id(), - owner->pid, ctx->ctx_task->pid); - return; - } - if (GET_PMU_CTX() != ctx) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n", - smp_processor_id(), - GET_PMU_CTX(), ctx); - return; - } - - DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid)); - /* - * the context is already protected in pfm_close(), we simply - * need to mask interrupts to avoid a PMU interrupt race on - * this CPU - */ - local_irq_save(flags); - - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - DPRINT(("context_unload returned %d\n", ret)); - } - - /* - * unmask interrupts, PMU interrupts are now spurious here - */ - local_irq_restore(flags); -} - -static void -pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) -{ - int ret; - - DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); - ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); - DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); -} -#endif /* CONFIG_SMP */ - -/* - * called for each close(). Partially free resources. - * When caller is self-monitoring, the context is unloaded. 
- */ -static int -pfm_flush(struct file *filp, fl_owner_t id) -{ - pfm_context_t *ctx; - struct task_struct *task; - struct pt_regs *regs; - unsigned long flags; - unsigned long smpl_buf_size = 0UL; - void *smpl_buf_vaddr = NULL; - int state, is_system; - - if (PFM_IS_FILE(filp) == 0) { - DPRINT(("bad magic for\n")); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); - return -EBADF; - } - - /* - * remove our file from the async queue, if we use this mode. - * This can be done without the context being protected. We come - * here when the context has become unreachable by other tasks. - * - * We may still have active monitoring at this point and we may - * end up in pfm_overflow_handler(). However, fasync_helper() - * operates with interrupts disabled and it cleans up the - * queue. If the PMU handler is called prior to entering - * fasync_helper() then it will send a signal. If it is - * invoked after, it will find an empty queue and no - * signal will be sent. In both case, we are safe - */ - if (filp->f_flags & FASYNC) { - DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue)); - pfm_do_fasync (-1, filp, ctx, 0); - } - - PROTECT_CTX(ctx, flags); - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - task = PFM_CTX_TASK(ctx); - regs = task_pt_regs(task); - - DPRINT(("ctx_state=%d is_current=%d\n", - state, - task == current ? 1 : 0)); - - /* - * if state == UNLOADED, then task is NULL - */ - - /* - * we must stop and unload because we are losing access to the context. - */ - if (task == current) { -#ifdef CONFIG_SMP - /* - * the task IS the owner but it migrated to another CPU: that's bad - * but we must handle this cleanly. Unfortunately, the kernel does - * not provide a mechanism to block migration (while the context is loaded). - * - * We need to release the resource on the ORIGINAL cpu. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - /* - * keep context protected but unmask interrupt for IPI - */ - local_irq_restore(flags); - - pfm_syswide_cleanup_other_cpu(ctx); - - /* - * restore interrupt masking - */ - local_irq_save(flags); - - /* - * context is unloaded at this point - */ - } else -#endif /* CONFIG_SMP */ - { - - DPRINT(("forcing unload\n")); - /* - * stop and unload, returning with state UNLOADED - * and session unreserved. - */ - pfm_context_unload(ctx, NULL, 0, regs); - - DPRINT(("ctx_state=%d\n", ctx->ctx_state)); - } - } - - /* - * remove virtual mapping, if any, for the calling task. - * cannot reset ctx field until last user is calling close(). - * - * ctx_smpl_vaddr must never be cleared because it is needed - * by every task with access to the context - * - * When called from do_exit(), the mm context is gone already, therefore - * mm is NULL, i.e., the VMA is already gone and we do not have to - * do anything here - */ - if (ctx->ctx_smpl_vaddr && current->mm) { - smpl_buf_vaddr = ctx->ctx_smpl_vaddr; - smpl_buf_size = ctx->ctx_smpl_size; - } - - UNPROTECT_CTX(ctx, flags); - - /* - * if there was a mapping, then we systematically remove it - * at this point. Cannot be done inside critical section - * because some VM function reenables interrupts. - * - */ - if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size); - - return 0; -} -/* - * called either on explicit close() or from exit_files(). 
- * Only the LAST user of the file gets to this point, i.e., it is - * called only ONCE. - * - * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero - * (fput()),i.e, last task to access the file. Nobody else can access the - * file at this point. - * - * When called from exit_files(), the VMA has been freed because exit_mm() - * is executed before exit_files(). - * - * When called from exit_files(), the current task is not yet ZOMBIE but we - * flush the PMU state to the context. - */ -static int -pfm_close(struct inode *inode, struct file *filp) -{ - pfm_context_t *ctx; - struct task_struct *task; - struct pt_regs *regs; - DECLARE_WAITQUEUE(wait, current); - unsigned long flags; - unsigned long smpl_buf_size = 0UL; - void *smpl_buf_addr = NULL; - int free_possible = 1; - int state, is_system; - - DPRINT(("pfm_close called private=%p\n", filp->private_data)); - - if (PFM_IS_FILE(filp) == 0) { - DPRINT(("bad magic\n")); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); - return -EBADF; - } - - PROTECT_CTX(ctx, flags); - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - task = PFM_CTX_TASK(ctx); - regs = task_pt_regs(task); - - DPRINT(("ctx_state=%d is_current=%d\n", - state, - task == current ? 1 : 0)); - - /* - * if task == current, then pfm_flush() unloaded the context - */ - if (state == PFM_CTX_UNLOADED) goto doit; - - /* - * context is loaded/masked and task != current, we need to - * either force an unload or go zombie - */ - - /* - * The task is currently blocked or will block after an overflow. - * we must force it to wakeup to get out of the - * MASKED state and transition to the unloaded state by itself. - * - * This situation is only possible for per-task mode - */ - if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) { - - /* - * set a "partial" zombie state to be checked - * upon return from down() in pfm_handle_work(). - * - * We cannot use the ZOMBIE state, because it is checked - * by pfm_load_regs() which is called upon wakeup from down(). - * In such case, it would free the context and then we would - * return to pfm_handle_work() which would access the - * stale context. Instead, we set a flag invisible to pfm_load_regs() - * but visible to pfm_handle_work(). - * - * For some window of time, we have a zombie context with - * ctx_state = MASKED and not ZOMBIE - */ - ctx->ctx_fl_going_zombie = 1; - - /* - * force task to wake up from MASKED state - */ - complete(&ctx->ctx_restart_done); - - DPRINT(("waking up ctx_state=%d\n", state)); - - /* - * put ourself to sleep waiting for the other - * task to report completion - * - * the context is protected by mutex, therefore there - * is no risk of being notified of completion before - * begin actually on the waitq. 
- */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ctx->ctx_zombieq, &wait); - - UNPROTECT_CTX(ctx, flags); - - /* - * XXX: check for signals : - * - ok for explicit close - * - not ok when coming from exit_files() - */ - schedule(); - - - PROTECT_CTX(ctx, flags); - - - remove_wait_queue(&ctx->ctx_zombieq, &wait); - set_current_state(TASK_RUNNING); - - /* - * context is unloaded at this point - */ - DPRINT(("after zombie wakeup ctx_state=%d for\n", state)); - } - else if (task != current) { -#ifdef CONFIG_SMP - /* - * switch context to zombie state - */ - ctx->ctx_state = PFM_CTX_ZOMBIE; - - DPRINT(("zombie ctx for [%d]\n", task->pid)); - /* - * cannot free the context on the spot. deferred until - * the task notices the ZOMBIE state - */ - free_possible = 0; -#else - pfm_context_unload(ctx, NULL, 0, regs); -#endif - } - -doit: - /* reload state, may have changed during opening of critical section */ - state = ctx->ctx_state; - - /* - * the context is still attached to a task (possibly current) - * we cannot destroy it right now - */ - - /* - * we must free the sampling buffer right here because - * we cannot rely on it being cleaned up later by the - * monitored task. It is not possible to free vmalloc'ed - * memory in pfm_load_regs(). Instead, we remove the buffer - * now. should there be subsequent PMU overflow originally - * meant for sampling, the will be converted to spurious - * and that's fine because the monitoring tools is gone anyway. - */ - if (ctx->ctx_smpl_hdr) { - smpl_buf_addr = ctx->ctx_smpl_hdr; - smpl_buf_size = ctx->ctx_smpl_size; - /* no more sampling */ - ctx->ctx_smpl_hdr = NULL; - ctx->ctx_fl_is_sampling = 0; - } - - DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n", - state, - free_possible, - smpl_buf_addr, - smpl_buf_size)); - - if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt); - - /* - * UNLOADED that the session has already been unreserved. - */ - if (state == PFM_CTX_ZOMBIE) { - pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu); - } - - /* - * disconnect file descriptor from context must be done - * before we unlock. - */ - filp->private_data = NULL; - - /* - * if we free on the spot, the context is now completely unreachable - * from the callers side. The monitored task side is also cut, so we - * can freely cut. - * - * If we have a deferred free, only the caller side is disconnected. - */ - UNPROTECT_CTX(ctx, flags); - - /* - * All memory free operations (especially for vmalloc'ed memory) - * MUST be done with interrupts ENABLED. 
- */ - if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); - - /* - * return the memory used by the context - */ - if (free_possible) pfm_context_free(ctx); - - return 0; -} - -static int -pfm_no_open(struct inode *irrelevant, struct file *dontcare) -{ - DPRINT(("pfm_no_open called\n")); - return -ENXIO; -} - - - -static const struct file_operations pfm_file_ops = { - .llseek = no_llseek, - .read = pfm_read, - .write = pfm_write, - .poll = pfm_poll, - .ioctl = pfm_ioctl, - .open = pfm_no_open, /* special open code to disallow open via /proc */ - .fasync = pfm_fasync, - .release = pfm_close, - .flush = pfm_flush -}; - -static int -pfmfs_delete_dentry(struct dentry *dentry) -{ - return 1; -} - -static struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, -}; - - -static int -pfm_alloc_fd(struct file **cfile) -{ - int fd, ret = 0; - struct file *file = NULL; - struct inode * inode; - char name[32]; - struct qstr this; - - fd = get_unused_fd(); - if (fd < 0) return -ENFILE; - - ret = -ENFILE; - - file = get_empty_filp(); - if (!file) goto out; - - /* - * allocate a new inode - */ - inode = new_inode(pfmfs_mnt->mnt_sb); - if (!inode) goto out; - - DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); - - inode->i_mode = S_IFCHR|S_IRUGO; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - - sprintf(name, "[%lu]", inode->i_ino); - this.name = name; - this.len = strlen(name); - this.hash = inode->i_ino; - - ret = -ENOMEM; - - /* - * allocate a new dcache entry - */ - file->f_path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!file->f_path.dentry) goto out; - - file->f_path.dentry->d_op = &pfmfs_dentry_operations; - - d_add(file->f_path.dentry, inode); - file->f_path.mnt = mntget(pfmfs_mnt); - file->f_mapping = inode->i_mapping; - - file->f_op = &pfm_file_ops; - file->f_mode = FMODE_READ; - file->f_flags = O_RDONLY; - file->f_pos = 0; - - /* - * may have to delay until context is attached? 
- */ - fd_install(fd, file); - - /* - * the file structure we will use - */ - *cfile = file; - - return fd; -out: - if (file) put_filp(file); - put_unused_fd(fd); - return ret; -} - -static void -pfm_free_fd(int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - - /* - * there ie no fd_uninstall(), so we do it here - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - rcu_assign_pointer(fdt->fd[fd], NULL); - spin_unlock(&files->file_lock); - - if (file) - put_filp(file); - put_unused_fd(fd); -} - -static int -pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size) -{ - DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size)); - - while (size > 0) { - unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT; - - - if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY)) - return -ENOMEM; - - addr += PAGE_SIZE; - buf += PAGE_SIZE; - size -= PAGE_SIZE; - } - return 0; -} - -/* - * allocate a sampling buffer and remaps it into the user address space of the task - */ -static int -pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma = NULL; - unsigned long size; - void *smpl_buf; - - - /* - * the fixed header + requested size and align to page boundary - */ - size = PAGE_ALIGN(rsize); - - DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size)); - - /* - * check requested size to avoid Denial-of-service attacks - * XXX: may have to refine this test - * Check against address space limit. - * - * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) - * return -ENOMEM; - */ - if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) - return -ENOMEM; - - /* - * We do the easy to undo allocations first. - * - * pfm_rvmalloc(), clears the buffer, so there is no leak - */ - smpl_buf = pfm_rvmalloc(size); - if (smpl_buf == NULL) { - DPRINT(("Can't allocate sampling buffer\n")); - return -ENOMEM; - } - - DPRINT(("smpl_buf @%p\n", smpl_buf)); - - /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { - DPRINT(("Cannot allocate vma\n")); - goto error_kmem; - } - - /* - * partially initialize the vma for the sampling buffer - */ - vma->vm_mm = mm; - vma->vm_file = filp; - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; - vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ - - /* - * Now we have everything we need and we can initialize - * and connect all the data structures - */ - - ctx->ctx_smpl_hdr = smpl_buf; - ctx->ctx_smpl_size = size; /* aligned size */ - - /* - * Let's do the difficult operations next. - * - * now we atomically find some area in the address space and - * remap the buffer in it. 
- */ - down_write(&task->mm->mmap_sem); - - /* find some free area in address space, must have mmap sem held */ - vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); - if (vma->vm_start == 0UL) { - DPRINT(("Cannot find unmapped area for size %ld\n", size)); - up_write(&task->mm->mmap_sem); - goto error; - } - vma->vm_end = vma->vm_start + size; - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - - DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start)); - - /* can only be applied to current task, need to have the mm semaphore held when called */ - if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) { - DPRINT(("Can't remap buffer\n")); - up_write(&task->mm->mmap_sem); - goto error; - } - - get_file(filp); - - /* - * now insert the vma in the vm list for the process, must be - * done with mmap lock held - */ - insert_vm_struct(mm, vma); - - mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); - up_write(&task->mm->mmap_sem); - - /* - * keep track of user level virtual address - */ - ctx->ctx_smpl_vaddr = (void *)vma->vm_start; - *(unsigned long *)user_vaddr = vma->vm_start; - - return 0; - -error: - kmem_cache_free(vm_area_cachep, vma); -error_kmem: - pfm_rvfree(smpl_buf, size); - - return -ENOMEM; -} - -/* - * XXX: do something better here - */ -static int -pfm_bad_permissions(struct task_struct *task) -{ - /* inspired by ptrace_attach() */ - DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", - current->uid, - current->gid, - task->euid, - task->suid, - task->uid, - task->egid, - task->sgid)); - - return ((current->uid != task->euid) - || (current->uid != task->suid) - || (current->uid != task->uid) - || (current->gid != task->egid) - || (current->gid != task->sgid) - || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE); -} - -static int -pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx) -{ - int ctx_flags; - - /* valid signal */ - - ctx_flags = pfx->ctx_flags; - - if (ctx_flags & PFM_FL_SYSTEM_WIDE) { - - /* - * cannot block in this mode - */ - if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { - DPRINT(("cannot use blocking mode when in system wide monitoring\n")); - return -EINVAL; - } - } else { - } - /* probably more to add here */ - - return 0; -} - -static int -pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned int ctx_flags, - unsigned int cpu, pfarg_context_t *arg) -{ - pfm_buffer_fmt_t *fmt = NULL; - unsigned long size = 0UL; - void *uaddr = NULL; - void *fmt_arg = NULL; - int ret = 0; -#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) - - /* invoke and lock buffer format, if found */ - fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); - if (fmt == NULL) { - DPRINT(("[%d] cannot find buffer format\n", task->pid)); - return -EINVAL; - } - - /* - * buffer argument MUST be contiguous to pfarg_context_t - */ - if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg); - - ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); - - DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); - - if (ret) goto error; - - /* link buffer format and context */ - ctx->ctx_buf_fmt = fmt; - - /* - * check if buffer format wants to use perfmon buffer allocation/mapping service - */ - ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size); - if (ret) goto error; - - if (size) { - /* - * buffer is always remapped 
into the caller's address space - */ - ret = pfm_smpl_buffer_alloc(current, filp, ctx, size, &uaddr); - if (ret) goto error; - - /* keep track of user address of buffer */ - arg->ctx_smpl_vaddr = uaddr; - } - ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg); - -error: - return ret; -} - -static void -pfm_reset_pmu_state(pfm_context_t *ctx) -{ - int i; - - /* - * install reset values for PMC. - */ - for (i=1; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - ctx->ctx_pmcs[i] = PMC_DFL_VAL(i); - DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i])); - } - /* - * PMD registers are set to 0UL when the context is memset() - */ - - /* - * On context switched restore, we must restore ALL pmc and ALL pmd even - * when they are not actively used by the task. In UP, the incoming process - * may otherwise pick up left over PMC, PMD state from the previous process. - * As opposed to PMD, stale PMC can cause harm to the incoming - * process because they may change what is being measured. - * Therefore, we must systematically reinstall the entire - * PMC state. In SMP, the same thing is possible on the - * same CPU but also between 2 CPUs. - * - * The problem with PMD is information leaking especially - * to user level when psr.sp=0 - * - * There is unfortunately no easy way to avoid this problem - * on either UP or SMP. This definitely slows down the - * pfm_load_regs() function. - */ - - /* - * bitmask of all PMCs accessible to this context - * - * PMC0 is treated differently. - */ - ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; - - /* - * bitmask of all PMDs that are accessible to this context - */ - ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; - - DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0])); - - /* - * useful in case of re-enable after disable - */ - ctx->ctx_used_ibrs[0] = 0UL; - ctx->ctx_used_dbrs[0] = 0UL; -} - -static int -pfm_ctx_getsize(void *arg, size_t *sz) -{ - pfarg_context_t *req = (pfarg_context_t *)arg; - pfm_buffer_fmt_t *fmt; - - *sz = 0; - - if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; - - fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); - if (fmt == NULL) { - DPRINT(("cannot find buffer format\n")); - return -EINVAL; - } - /* get just enough to copy in user parameters */ - *sz = fmt->fmt_arg_size; - DPRINT(("arg_size=%lu\n", *sz)); - - return 0; -} - - - -/* - * cannot attach if : - * - kernel task - * - task not owned by caller - * - task incompatible with context mode - */ -static int -pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) -{ - /* - * no kernel task or task not owned by caller - */ - if (task->mm == NULL) { - DPRINT(("task [%d] has no memory context (kernel thread)\n", task->pid)); - return -EPERM; - } - if (pfm_bad_permissions(task)) { - DPRINT(("no permission to attach to [%d]\n", task->pid)); - return -EPERM; - } - /* - * cannot block in self-monitoring mode - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { - DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); - return -EINVAL; - } - - if (task->exit_state == EXIT_ZOMBIE) { - DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); - return -EBUSY; - } - - /* - * always ok for self - */ - if (task == current) return 0; - - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); - return -EBUSY; - } - /* - * make sure the task
is off any CPU - */ - wait_task_inactive(task); - - /* more to come... */ - - return 0; -} - -static int -pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) -{ - struct task_struct *p = current; - int ret; - - /* XXX: need to add more checks here */ - if (pid < 2) return -EPERM; - - if (pid != current->pid) { - - read_lock(&tasklist_lock); - - p = find_task_by_pid(pid); - - /* make sure task cannot go away while we operate on it */ - if (p) get_task_struct(p); - - read_unlock(&tasklist_lock); - - if (p == NULL) return -ESRCH; - } - - ret = pfm_task_incompatible(ctx, p); - if (ret == 0) { - *task = p; - } else if (p != current) { - pfm_put_task(p); - } - return ret; -} - - - -static int -pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_context_t *req = (pfarg_context_t *)arg; - struct file *filp; - int ctx_flags; - int ret; - - /* let's check the arguments first */ - ret = pfarg_is_sane(current, req); - if (ret < 0) return ret; - - ctx_flags = req->ctx_flags; - - ret = -ENOMEM; - - ctx = pfm_context_alloc(); - if (!ctx) goto error; - - ret = pfm_alloc_fd(&filp); - if (ret < 0) goto error_file; - - req->ctx_fd = ctx->ctx_fd = ret; - - /* - * attach context to file - */ - filp->private_data = ctx; - - /* - * does the user want to sample? - */ - if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { - ret = pfm_setup_buffer_fmt(current, filp, ctx, ctx_flags, 0, req); - if (ret) goto buffer_error; - } - - /* - * init context protection lock - */ - spin_lock_init(&ctx->ctx_lock); - - /* - * context is unloaded - */ - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * initialization of context's flags - */ - ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; - ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; - ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ - ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; - /* - * will move to set properties - * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; - */ - - /* - * init restart semaphore to locked - */ - init_completion(&ctx->ctx_restart_done); - - /* - * activation is used in SMP only - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * initialize notification message queue - */ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - init_waitqueue_head(&ctx->ctx_msgq_wait); - init_waitqueue_head(&ctx->ctx_zombieq); - - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", - ctx, - ctx_flags, - ctx->ctx_fl_system, - ctx->ctx_fl_block, - ctx->ctx_fl_excl_idle, - ctx->ctx_fl_no_msg, - ctx->ctx_fd)); - - /* - * initialize soft PMU state - */ - pfm_reset_pmu_state(ctx); - - return 0; - -buffer_error: - pfm_free_fd(ctx->ctx_fd, filp); - - if (ctx->ctx_buf_fmt) { - pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); - } -error_file: - pfm_context_free(ctx); - -error: - return ret; -} - -static inline unsigned long -pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset) -{ - unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset; - unsigned long new_seed, old_seed = reg->seed, mask = reg->mask; - extern unsigned long carta_random32 (unsigned long seed); - - if (reg->flags & PFM_REGFL_RANDOM) { - new_seed = carta_random32(old_seed); - val -= (old_seed & mask); /* counter values are negative numbers! 
*/ - if ((mask >> 32) != 0) - /* construct a full 64-bit random value: */ - new_seed |= carta_random32(old_seed >> 32) << 32; - reg->seed = new_seed; - } - reg->lval = val; - return val; -} - -static void -pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) -{ - unsigned long mask = ovfl_regs[0]; - unsigned long reset_others = 0UL; - unsigned long val; - int i; - - /* - * now restore reset value on sampling overflowed counters - */ - mask >>= PMU_FIRST_COUNTER; - for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { - - if ((mask & 0x1UL) == 0UL) continue; - - ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); - reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; - - DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); - } - - /* - * Now take care of resetting the other registers - */ - for(i = 0; reset_others; i++, reset_others >>= 1) { - - if ((reset_others & 0x1) == 0) continue; - - ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); - - DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", - is_long_reset ? "long" : "short", i, val)); - } -} - -static void -pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) -{ - unsigned long mask = ovfl_regs[0]; - unsigned long reset_others = 0UL; - unsigned long val; - int i; - - DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); - - if (ctx->ctx_state == PFM_CTX_MASKED) { - pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); - return; - } - - /* - * now restore reset value on sampling overflowed counters - */ - mask >>= PMU_FIRST_COUNTER; - for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { - - if ((mask & 0x1UL) == 0UL) continue; - - val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); - reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; - - DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); - - pfm_write_soft_counter(ctx, i, val); - } - - /* - * Now take care of resetting the other registers - */ - for(i = 0; reset_others; i++, reset_others >>= 1) { - - if ((reset_others & 0x1) == 0) continue; - - val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); - - if (PMD_IS_COUNTING(i)) { - pfm_write_soft_counter(ctx, i, val); - } else { - ia64_set_pmd(i, val); - } - DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", - is_long_reset ? "long" : "short", i, val)); - } - ia64_srlz_d(); -} - -static int -pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, pmc_pm; - unsigned long smpl_pmds, reset_pmds, impl_pmds; - unsigned int cnum, reg_flags, flags, pmc_type; - int i, can_access_pmu = 0, is_loaded, is_system, expert_mode; - int is_monitor, is_counting, state; - int ret = -EINVAL; - pfm_reg_check_t wr_func; -#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z)) - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - task = ctx->ctx_task; - impl_pmds = pmu_conf->impl_pmds[0]; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - if (is_loaded) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. 
- */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - } - expert_mode = pfm_sysctl.expert_mode; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - reg_flags = req->reg_flags; - value = req->reg_value; - smpl_pmds = req->reg_smpl_pmds[0]; - reset_pmds = req->reg_reset_pmds[0]; - flags = 0; - - - if (cnum >= PMU_MAX_PMCS) { - DPRINT(("pmc%u is invalid\n", cnum)); - goto error; - } - - pmc_type = pmu_conf->pmc_desc[cnum].type; - pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1; - is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; - is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0; - - /* - * we reject all non implemented PMC as well - * as attempts to modify PMC[0-3] which are used - * as status registers by the PMU - */ - if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) { - DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); - goto error; - } - wr_func = pmu_conf->pmc_desc[cnum].write_check; - /* - * If the PMC is a monitor, then if the value is not the default: - * - system-wide session: PMCx.pm=1 (privileged monitor) - * - per-task : PMCx.pm=0 (user monitor) - */ - if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) { - DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n", - cnum, - pmc_pm, - is_system)); - goto error; - } - - if (is_counting) { - /* - * enforce generation of overflow interrupt. Necessary on all - * CPUs. - */ - value |= 1 << PMU_PMC_OI; - - if (reg_flags & PFM_REGFL_OVFL_NOTIFY) { - flags |= PFM_REGFL_OVFL_NOTIFY; - } - - if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM; - - /* verify validity of smpl_pmds */ - if ((smpl_pmds & impl_pmds) != smpl_pmds) { - DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum)); - goto error; - } - - /* verify validity of reset_pmds */ - if ((reset_pmds & impl_pmds) != reset_pmds) { - DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum)); - goto error; - } - } else { - if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { - DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum)); - goto error; - } - /* eventid on non-counting monitors are ignored */ - } - - /* - * execute write checker, if any - */ - if (likely(expert_mode == 0 && wr_func)) { - ret = (*wr_func)(task, ctx, cnum, &value, regs); - if (ret) goto error; - ret = -EINVAL; - } - - /* - * no error on this register - */ - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - /* - * Now we commit the changes to the software state - */ - - /* - * update overflow information - */ - if (is_counting) { - /* - * full flag update each time a register is programmed - */ - ctx->ctx_pmds[cnum].flags = flags; - - ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds; - ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds; - ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid; - - /* - * Mark all PMDS to be accessed as used. - * - * We do not keep track of PMC because we have to - * systematically restore ALL of them. - * - * We do not update the used_monitors mask, because - * if we have not programmed them, then will be in - * a quiescent state, therefore we will not need to - * mask/restore then when context is MASKED. 
- */ - CTX_USED_PMD(ctx, reset_pmds); - CTX_USED_PMD(ctx, smpl_pmds); - /* - * make sure we do not try to reset on - * restart because we have established new values - */ - if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; - } - /* - * Needed in case the user does not initialize the equivalent - * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no - * possible leak here. - */ - CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]); - - /* - * keep track of the monitor PMC that we are using. - * we save the value of the pmc in ctx_pmcs[] and if - * the monitoring is not stopped for the context we also - * place it in the saved state area so that it will be - * picked up later by the context switch code. - * - * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). - * - * The value in th_pmcs[] may be modified on overflow, i.e., when - * monitoring needs to be stopped. - */ - if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); - - /* - * update context state - */ - ctx->ctx_pmcs[cnum] = value; - - if (is_loaded) { - /* - * write thread state - */ - if (is_system == 0) ctx->th_pmcs[cnum] = value; - - /* - * write hardware register if we can - */ - if (can_access_pmu) { - ia64_set_pmc(cnum, value); - } -#ifdef CONFIG_SMP - else { - /* - * per-task SMP only here - * - * we are guaranteed that the task is not running on the other CPU, - * we indicate that this PMD will need to be reloaded if the task - * is rescheduled on the CPU it ran last on. - */ - ctx->ctx_reload_pmcs[0] |= 1UL << cnum; - } -#endif - } - - DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", - cnum, - value, - is_loaded, - can_access_pmu, - flags, - ctx->ctx_all_pmcs[0], - ctx->ctx_used_pmds[0], - ctx->ctx_pmds[cnum].eventid, - smpl_pmds, - reset_pmds, - ctx->ctx_reload_pmcs[0], - ctx->ctx_used_monitors[0], - ctx->ctx_ovfl_regs[0])); - } - - /* - * make sure the changes are visible - */ - if (can_access_pmu) ia64_srlz_d(); - - return 0; -error: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -static int -pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, hw_value, ovfl_mask; - unsigned int cnum; - int i, can_access_pmu = 0, state; - int is_counting, is_loaded, is_system, expert_mode; - int ret = -EINVAL; - pfm_reg_check_t wr_func; - - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - task = ctx->ctx_task; - - if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL; - - /* - * on both UP and SMP, we can only write to the PMC when the task is - * the owner of the local PMU. - */ - if (likely(is_loaded)) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; - } - expert_mode = pfm_sysctl.expert_mode; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - value = req->reg_value; - - if (!PMD_IS_IMPL(cnum)) { - DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum)); - goto abort_mission; - } - is_counting = PMD_IS_COUNTING(cnum); - wr_func = pmu_conf->pmd_desc[cnum].write_check; - - /* - * execute write checker, if any - */ - if (unlikely(expert_mode == 0 && wr_func)) { - unsigned long v = value; - - ret = (*wr_func)(task, ctx, cnum, &v, regs); - if (ret) goto abort_mission; - - value = v; - ret = -EINVAL; - } - - /* - * no error on this register - */ - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - /* - * now commit changes to software state - */ - hw_value = value; - - /* - * update virtualized (64bits) counter - */ - if (is_counting) { - /* - * write context state - */ - ctx->ctx_pmds[cnum].lval = value; - - /* - * when context is load we use the split value - */ - if (is_loaded) { - hw_value = value & ovfl_mask; - value = value & ~ovfl_mask; - } - } - /* - * update reset values (not just for counters) - */ - ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; - ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; - - /* - * update randomization parameters (not just for counters) - */ - ctx->ctx_pmds[cnum].seed = req->reg_random_seed; - ctx->ctx_pmds[cnum].mask = req->reg_random_mask; - - /* - * update context value - */ - ctx->ctx_pmds[cnum].val = value; - - /* - * Keep track of what we use - * - * We do not keep track of PMC because we have to - * systematically restore ALL of them. - */ - CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum)); - - /* - * mark this PMD register used as well - */ - CTX_USED_PMD(ctx, RDEP(cnum)); - - /* - * make sure we do not try to reset on - * restart because we have established new values - */ - if (is_counting && state == PFM_CTX_MASKED) { - ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; - } - - if (is_loaded) { - /* - * write thread state - */ - if (is_system == 0) ctx->th_pmds[cnum] = hw_value; - - /* - * write hardware register if we can - */ - if (can_access_pmu) { - ia64_set_pmd(cnum, hw_value); - } else { -#ifdef CONFIG_SMP - /* - * we are guaranteed that the task is not running on the other CPU, - * we indicate that this PMD will need to be reloaded if the task - * is rescheduled on the CPU it ran last on. - */ - ctx->ctx_reload_pmds[0] |= 1UL << cnum; -#endif - } - } - - DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " - "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", - cnum, - value, - is_loaded, - can_access_pmu, - hw_value, - ctx->ctx_pmds[cnum].val, - ctx->ctx_pmds[cnum].short_reset, - ctx->ctx_pmds[cnum].long_reset, - PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N', - ctx->ctx_pmds[cnum].seed, - ctx->ctx_pmds[cnum].mask, - ctx->ctx_used_pmds[0], - ctx->ctx_pmds[cnum].reset_pmds[0], - ctx->ctx_reload_pmds[0], - ctx->ctx_all_pmds[0], - ctx->ctx_ovfl_regs[0])); - } - - /* - * make changes visible - */ - if (can_access_pmu) ia64_srlz_d(); - - return 0; - -abort_mission: - /* - * for now, we have only one possibility for error - */ - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -/* - * By the way of PROTECT_CONTEXT(), interrupts are masked while we are in this function. - * Therefore we know, we do not have to worry about the PMU overflow interrupt. 
If an - * interrupt is delivered during the call, it will be kept pending until we leave, making - * it appears as if it had been generated at the UNPROTECT_CONTEXT(). At least we are - * guaranteed to return consistent data to the user, it may simply be old. It is not - * trivial to treat the overflow while inside the call because you may end up in - * some module sampling buffer code causing deadlocks. - */ -static int -pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - unsigned long val = 0UL, lval, ovfl_mask, sval; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned int cnum, reg_flags = 0; - int i, can_access_pmu = 0, state; - int is_loaded, is_system, is_counting, expert_mode; - int ret = -EINVAL; - pfm_reg_check_t rd_func; - - /* - * access is possible when loaded only for - * self-monitoring tasks or in UP mode - */ - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - task = ctx->ctx_task; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - if (likely(is_loaded)) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - /* - * this can be true when not self-monitoring only in UP - */ - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - - if (can_access_pmu) ia64_srlz_d(); - } - expert_mode = pfm_sysctl.expert_mode; - - DPRINT(("ld=%d apmu=%d ctx_state=%d\n", - is_loaded, - can_access_pmu, - state)); - - /* - * on both UP and SMP, we can only read the PMD from the hardware register when - * the task is the owner of the local PMU. - */ - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - reg_flags = req->reg_flags; - - if (unlikely(!PMD_IS_IMPL(cnum))) goto error; - /* - * we can only read the register that we use. That includes - * the one we explicitly initialize AND the one we want included - * in the sampling buffer (smpl_regs). - * - * Having this restriction allows optimization in the ctxsw routine - * without compromising security (leaks) - */ - if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error; - - sval = ctx->ctx_pmds[cnum].val; - lval = ctx->ctx_pmds[cnum].lval; - is_counting = PMD_IS_COUNTING(cnum); - - /* - * If the task is not the current one, then we check if the - * PMU state is still in the local live register due to lazy ctxsw. - * If true, then we read directly from the registers. - */ - if (can_access_pmu){ - val = ia64_get_pmd(cnum); - } else { - /* - * context has been saved - * if context is zombie, then task does not exist anymore. - * In this case, we use the full value saved in the context (pfm_flush_regs()). - */ - val = is_loaded ? 
ctx->th_pmds[cnum] : 0UL; - } - rd_func = pmu_conf->pmd_desc[cnum].read_check; - - if (is_counting) { - /* - * XXX: need to check for overflow when loaded - */ - val &= ovfl_mask; - val += sval; - } - - /* - * execute read checker, if any - */ - if (unlikely(expert_mode == 0 && rd_func)) { - unsigned long v = val; - ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs); - if (ret) goto error; - val = v; - ret = -EINVAL; - } - - PFM_REG_RETFLAG_SET(reg_flags, 0); - - DPRINT(("pmd[%u]=0x%lx\n", cnum, val)); - - /* - * update register return value, abort all if problem during copy. - * we only modify the reg_flags field. no check mode is fine because - * access has been verified upfront in sys_perfmonctl(). - */ - req->reg_value = val; - req->reg_flags = reg_flags; - req->reg_last_reset_val = lval; - } - - return 0; - -error: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -int -pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_pmcs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_pmcs); - -int -pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_read_pmds(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_read_pmds); - -/* - * Only call this function when a process it trying to - * write the debug registers (reading is always allowed) - */ -int -pfm_use_debug_registers(struct task_struct *task) -{ - pfm_context_t *ctx = task->thread.pfm_context; - unsigned long flags; - int ret = 0; - - if (pmu_conf->use_rr_dbregs == 0) return 0; - - DPRINT(("called for [%d]\n", task->pid)); - - /* - * do it only once - */ - if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0; - - /* - * Even on SMP, we do not need to use an atomic here because - * the only way in is via ptrace() and this is possible only when the - * process is stopped. Even in the case where the ctxsw out is not totally - * completed by the time we come here, there is no way the 'stopped' process - * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine. - * So this is always safe. - */ - if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1; - - LOCK_PFS(flags); - - /* - * We cannot allow setting breakpoints when system wide monitoring - * sessions are using the debug registers. - */ - if (pfm_sessions.pfs_sys_use_dbregs> 0) - ret = -1; - else - pfm_sessions.pfs_ptrace_use_dbregs++; - - DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", - pfm_sessions.pfs_ptrace_use_dbregs, - pfm_sessions.pfs_sys_use_dbregs, - task->pid, ret)); - - UNLOCK_PFS(flags); - - return ret; -} - -/* - * This function is called for every task that exits with the - * IA64_THREAD_DBG_VALID set. This indicates a task which was - * able to use the debug registers for debugging purposes via - * ptrace(). 
Therefore we know it was not using them for - performance monitoring, so we only decrement the number - of "ptraced" debug register users to keep the count up to date - */ -int -pfm_release_debug_registers(struct task_struct *task) -{ - unsigned long flags; - int ret; - - if (pmu_conf->use_rr_dbregs == 0) return 0; - - LOCK_PFS(flags); - if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); - ret = -1; - } else { - pfm_sessions.pfs_ptrace_use_dbregs--; - ret = 0; - } - UNLOCK_PFS(flags); - - return ret; -} - -static int -pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfm_buffer_fmt_t *fmt; - pfm_ovfl_ctrl_t rst_ctrl; - int state, is_system; - int ret = 0; - - state = ctx->ctx_state; - fmt = ctx->ctx_buf_fmt; - is_system = ctx->ctx_fl_system; - task = PFM_CTX_TASK(ctx); - - switch(state) { - case PFM_CTX_MASKED: - break; - case PFM_CTX_LOADED: - if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break; - /* fall through */ - case PFM_CTX_UNLOADED: - case PFM_CTX_ZOMBIE: - DPRINT(("invalid state=%d\n", state)); - return -EBUSY; - default: - DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state)); - return -EINVAL; - } - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - - /* sanity check */ - if (unlikely(task == NULL)) { - printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); - return -EINVAL; - } - - if (task == current || is_system) { - - fmt = ctx->ctx_buf_fmt; - - DPRINT(("restarting self %d ovfl=0x%lx\n", - task->pid, - ctx->ctx_ovfl_regs[0])); - - if (CTX_HAS_SMPL(ctx)) { - - prefetch(ctx->ctx_smpl_hdr); - - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 0; - - if (state == PFM_CTX_LOADED) - ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - else - ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - } else { - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; - } - - if (ret == 0) { - if (rst_ctrl.bits.reset_ovfl_pmds) - pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); - - if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring for [%d]\n", task->pid)); - - if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); - } else { - DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); - - // cannot use pfm_stop_monitoring(task, regs); - } - } - /* - * clear overflowed PMD mask to remove any stale information - */ - ctx->ctx_ovfl_regs[0] = 0UL; - - /* - * back to LOADED state - */ - ctx->ctx_state = PFM_CTX_LOADED; - - /* - * XXX: not really useful for self monitoring - */ - ctx->ctx_fl_can_restart = 0; - - return 0; - } - - /* - * restart another task - */ - - /* - * When PFM_CTX_MASKED, we cannot issue a restart before the previous - * one is seen by the task. - */ - if (state == PFM_CTX_MASKED) { - if (ctx->ctx_fl_can_restart == 0) return -EINVAL; - /* - * will prevent subsequent restart before this one is - * seen by the other task - */ - ctx->ctx_fl_can_restart = 0; - } - - /* - * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e.
- * the task is blocked or on its way to block. That's the normal - * restart path. If the monitoring is not masked, then the task - * can be actively monitoring and we cannot directly intervene. - * Therefore we use the trap mechanism to catch the task and - * force it to reset the buffer/reset PMDs. - * - * if non-blocking, then we ensure that the task will go into - * pfm_handle_work() before returning to user mode. - * - * We cannot explicitly reset another task, it MUST always - * be done by the task itself. This works for system wide because - * the tool that is controlling the session is logically doing - * "self-monitoring". - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { - DPRINT(("unblocking [%d] \n", task->pid)); - complete(&ctx->ctx_restart_done); - } else { - DPRINT(("[%d] armed exit trap\n", task->pid)); - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; - - PFM_SET_WORK_PENDING(task, 1); - - pfm_set_task_notify(task); - - /* - * XXX: send reschedule if task runs on another CPU - */ - } - return 0; -} - -static int -pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - unsigned int m = *(unsigned int *)arg; - - pfm_sysctl.debug = m == 0 ? 0 : 1; - - printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off"); - - if (m == 0) { - memset(pfm_stats, 0, sizeof(pfm_stats)); - for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL; - } - return 0; -} - -/* - * arg can be NULL and count can be zero for this function - */ -static int -pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct thread_struct *thread = NULL; - struct task_struct *task; - pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; - unsigned long flags; - dbreg_t dbreg; - unsigned int rnum; - int first_time; - int ret = 0, state; - int i, can_access_pmu = 0; - int is_system, is_loaded; - - if (pmu_conf->use_rr_dbregs == 0) return -EINVAL; - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - task = ctx->ctx_task; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - /* - * on both UP and SMP, we can only write to the PMC when the task is - * the owner of the local PMU. - */ - if (is_loaded) { - thread = &task->thread; - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - } - - /* - * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w - * ensuring that no real breakpoint can be installed via this call. 
- * - * IMPORTANT: regs can be NULL in this function - */ - - first_time = ctx->ctx_fl_using_dbreg == 0; - - /* - * don't bother if we are loaded and task is being debugged - */ - if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { - DPRINT(("debug registers already in use for [%d]\n", task->pid)); - return -EBUSY; - } - - /* - * check for debug registers in system wide mode - * - * Even though a check is done in pfm_context_load(), - * we must repeat it here, in case the registers are - * written after the context is loaded - */ - if (is_loaded) { - LOCK_PFS(flags); - - if (first_time && is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) - ret = -EBUSY; - else - pfm_sessions.pfs_sys_use_dbregs++; - } - UNLOCK_PFS(flags); - } - - if (ret != 0) return ret; - - /* - * mark ourselves as a user of the debug registers for - * perfmon purposes. - */ - ctx->ctx_fl_using_dbreg = 1; - - /* - * clear hardware registers to make sure we don't - * pick up stale state. - * - * for a system wide session, we do not use - * thread.dbr, thread.ibr because this process - * never leaves the current CPU and the state - * is shared by all processes running on it - */ - if (first_time && can_access_pmu) { - DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); - for (i=0; i < pmu_conf->num_ibrs; i++) { - ia64_set_ibr(i, 0UL); - ia64_dv_serialize_instruction(); - } - ia64_srlz_i(); - for (i=0; i < pmu_conf->num_dbrs; i++) { - ia64_set_dbr(i, 0UL); - ia64_dv_serialize_data(); - } - ia64_srlz_d(); - } - - /* - * Now install the values into the registers - */ - for (i = 0; i < count; i++, req++) { - - rnum = req->dbreg_num; - dbreg.val = req->dbreg_value; - - ret = -EINVAL; - - if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) { - DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n", - rnum, dbreg.val, mode, i, count)); - - goto abort_mission; - } - - /* - * make sure we do not install an enabled breakpoint - */ - if (rnum & 0x1) { - if (mode == PFM_CODE_RR) - dbreg.ibr.ibr_x = 0; - else - dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0; - } - - PFM_REG_RETFLAG_SET(req->dbreg_flags, 0); - - /* - * Debug registers, just like PMC, can only be modified - * by a kernel call. Moreover, perfmon() accesses to those - * registers are centralized in this routine. The hardware - * does not modify the value of these registers, therefore, - * if we save them as they are written, we can avoid having - * to save them on context switch out. This is made possible - * by the fact that when perfmon uses debug registers, ptrace() - * won't be able to modify them concurrently.
- */ - if (mode == PFM_CODE_RR) { - CTX_USED_IBR(ctx, rnum); - - if (can_access_pmu) { - ia64_set_ibr(rnum, dbreg.val); - ia64_dv_serialize_instruction(); - } - - ctx->ctx_ibrs[rnum] = dbreg.val; - - DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n", - rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu)); - } else { - CTX_USED_DBR(ctx, rnum); - - if (can_access_pmu) { - ia64_set_dbr(rnum, dbreg.val); - ia64_dv_serialize_data(); - } - ctx->ctx_dbrs[rnum] = dbreg.val; - - DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n", - rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu)); - } - } - - return 0; - -abort_mission: - /* - * in case it was our first attempt, we undo the global modifications - */ - if (first_time) { - LOCK_PFS(flags); - if (ctx->ctx_fl_system) { - pfm_sessions.pfs_sys_use_dbregs--; - } - UNLOCK_PFS(flags); - ctx->ctx_fl_using_dbreg = 0; - } - /* - * install error return flag - */ - PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL); - - return ret; -} - -static int -pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs); -} - -static int -pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs); -} - -int -pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_ibrs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_ibrs); - -int -pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_dbrs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_dbrs); - - -static int -pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_features_t *req = (pfarg_features_t *)arg; - - req->ft_version = PFM_VERSION; - return 0; -} - -static int -pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct pt_regs *tregs; - struct task_struct *task = PFM_CTX_TASK(ctx); - int state, is_system; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - /* - * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE) - */ - if (state == PFM_CTX_UNLOADED) return -EINVAL; - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. 
- */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - DPRINT(("task [%d] ctx_state=%d is_system=%d\n", - PFM_CTX_TASK(ctx)->pid, - state, - is_system)); - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. - */ - if (is_system) { - /* - * Update local PMU first - * - * disable dcr pp - */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); - ia64_srlz_i(); - - /* - * update local cpuinfo - */ - PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); - - /* - * stop monitoring, does srlz.i - */ - pfm_clear_psr_pp(); - - /* - * stop monitoring in the caller - */ - ia64_psr(regs)->pp = 0; - - return 0; - } - /* - * per-task mode - */ - - if (task == current) { - /* stop monitoring at kernel level */ - pfm_clear_psr_up(); - - /* - * stop monitoring at the user level - */ - ia64_psr(regs)->up = 0; - } else { - tregs = task_pt_regs(task); - - /* - * stop monitoring at the user level - */ - ia64_psr(tregs)->up = 0; - - /* - * monitoring disabled in kernel at next reschedule - */ - ctx->ctx_saved_psr_up = 0; - DPRINT(("task=[%d]\n", task->pid)); - } - return 0; -} - - -static int -pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct pt_regs *tregs; - int state, is_system; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - if (state != PFM_CTX_LOADED) return -EINVAL; - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. 
- */ - if (is_system) { - - /* - * set user level psr.pp for the caller - */ - ia64_psr(regs)->pp = 1; - - /* - * now update the local PMU and cpuinfo - */ - PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP); - - /* - * start monitoring at kernel level - */ - pfm_set_psr_pp(); - - /* enable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); - ia64_srlz_i(); - - return 0; - } - - /* - * per-process mode - */ - - if (ctx->ctx_task == current) { - - /* start monitoring at kernel level */ - pfm_set_psr_up(); - - /* - * activate monitoring at user level - */ - ia64_psr(regs)->up = 1; - - } else { - tregs = task_pt_regs(ctx->ctx_task); - - /* - * start monitoring at the kernel level the next - * time the task is scheduled - */ - ctx->ctx_saved_psr_up = IA64_PSR_UP; - - /* - * activate monitoring at user level - */ - ia64_psr(tregs)->up = 1; - } - return 0; -} - -static int -pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned int cnum; - int i; - int ret = -EINVAL; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - - if (!PMC_IS_IMPL(cnum)) goto abort_mission; - - req->reg_value = PMC_DFL_VAL(cnum); - - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value)); - } - return 0; - -abort_mission: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -static int -pfm_check_task_exist(pfm_context_t *ctx) -{ - struct task_struct *g, *t; - int ret = -ESRCH; - - read_lock(&tasklist_lock); - - do_each_thread (g, t) { - if (t->thread.pfm_context == ctx) { - ret = 0; - break; - } - } while_each_thread (g, t); - - read_unlock(&tasklist_lock); - - DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); - - return ret; -} - -static int -pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - struct thread_struct *thread; - struct pfm_context_t *old; - unsigned long flags; -#ifndef CONFIG_SMP - struct task_struct *owner_task = NULL; -#endif - pfarg_load_t *req = (pfarg_load_t *)arg; - unsigned long *pmcs_source, *pmds_source; - int the_cpu; - int ret = 0; - int state, is_system, set_dbregs = 0; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - /* - * can only load from unloaded or terminated state - */ - if (state != PFM_CTX_UNLOADED) { - DPRINT(("cannot load to [%d], invalid ctx_state=%d\n", - req->load_pid, - ctx->ctx_state)); - return -EBUSY; - } - - DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); - - if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { - DPRINT(("cannot use blocking mode on self\n")); - return -EINVAL; - } - - ret = pfm_get_task(ctx, req->load_pid, &task); - if (ret) { - DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret)); - return ret; - } - - ret = -EINVAL; - - /* - * system wide is self monitoring only - */ - if (is_system && task != current) { - DPRINT(("system wide is self monitoring only load_pid=%d\n", - req->load_pid)); - goto error; - } - - thread = &task->thread; - - ret = 0; - /* - * cannot load a context which is using range restrictions, - * into a task that is being debugged. 
- */ - if (ctx->ctx_fl_using_dbreg) { - if (thread->flags & IA64_THREAD_DBG_VALID) { - ret = -EBUSY; - DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); - goto error; - } - LOCK_PFS(flags); - - if (is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) { - DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); - ret = -EBUSY; - } else { - pfm_sessions.pfs_sys_use_dbregs++; - DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); - set_dbregs = 1; - } - } - - UNLOCK_PFS(flags); - - if (ret) goto error; - } - - /* - * SMP system-wide monitoring implies self-monitoring. - * - * The programming model expects the task to - * be pinned on a CPU throughout the session. - * Here we take note of the current CPU at the - * time the context is loaded. No call from - * another CPU will be allowed. - * - * The pinning via shed_setaffinity() - * must be done by the calling task prior - * to this call. - * - * systemwide: keep track of CPU this session is supposed to run on - */ - the_cpu = ctx->ctx_cpu = smp_processor_id(); - - ret = -EBUSY; - /* - * now reserve the session - */ - ret = pfm_reserve_session(current, is_system, the_cpu); - if (ret) goto error; - - /* - * task is necessarily stopped at this point. - * - * If the previous context was zombie, then it got removed in - * pfm_save_regs(). Therefore we should not see it here. - * If we see a context, then this is an active context - * - * XXX: needs to be atomic - */ - DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n", - thread->pfm_context, ctx)); - - ret = -EBUSY; - old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *)); - if (old != NULL) { - DPRINT(("load_pid [%d] already has a context\n", req->load_pid)); - goto error_unres; - } - - pfm_reset_msgq(ctx); - - ctx->ctx_state = PFM_CTX_LOADED; - - /* - * link context to task - */ - ctx->ctx_task = task; - - if (is_system) { - /* - * we load as stopped - */ - PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE); - PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); - - if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE); - } else { - thread->flags |= IA64_THREAD_PM_VALID; - } - - /* - * propagate into thread-state - */ - pfm_copy_pmds(task, ctx); - pfm_copy_pmcs(task, ctx); - - pmcs_source = ctx->th_pmcs; - pmds_source = ctx->th_pmds; - - /* - * always the case for system-wide - */ - if (task == current) { - - if (is_system == 0) { - - /* allow user level control */ - ia64_psr(regs)->sp = 0; - DPRINT(("clearing psr.sp for [%d]\n", task->pid)); - - SET_LAST_CPU(ctx, smp_processor_id()); - INC_ACTIVATION(); - SET_ACTIVATION(ctx); -#ifndef CONFIG_SMP - /* - * push the other task out, if any - */ - owner_task = GET_PMU_OWNER(); - if (owner_task) pfm_lazy_save_regs(owner_task); -#endif - } - /* - * load all PMD from ctx to PMU (as opposed to thread state) - * restore all PMC from ctx to PMU - */ - pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]); - pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]); - - ctx->ctx_reload_pmcs[0] = 0UL; - ctx->ctx_reload_pmds[0] = 0UL; - - /* - * guaranteed safe by earlier check against DBG_VALID - */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - /* - * set new ownership - */ - SET_PMU_OWNER(task, ctx); - - DPRINT(("context loaded on PMU for [%d]\n", task->pid)); - } else { - /* - * when not current, task MUST be stopped, so this is safe - */ - regs = 
task_pt_regs(task); - - /* force a full reload */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* initial saved psr (stopped) */ - ctx->ctx_saved_psr_up = 0UL; - ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; - } - - ret = 0; - -error_unres: - if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); -error: - /* - * we must undo the dbregs setting (for system-wide) - */ - if (ret && set_dbregs) { - LOCK_PFS(flags); - pfm_sessions.pfs_sys_use_dbregs--; - UNLOCK_PFS(flags); - } - /* - * release task, there is now a link with the context - */ - if (is_system == 0 && task != current) { - pfm_put_task(task); - - if (ret == 0) { - ret = pfm_check_task_exist(ctx); - if (ret) { - ctx->ctx_state = PFM_CTX_UNLOADED; - ctx->ctx_task = NULL; - } - } - } - return ret; -} - -/* - * in this function, we do not need to increase the use count - * for the task via get_task_struct(), because we hold the - * context lock. If the task were to disappear while having - * a context attached, it would go through pfm_exit_thread() - * which also grabs the context lock and would therefore be blocked - * until we are here. - */ -static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx); - -static int -pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task = PFM_CTX_TASK(ctx); - struct pt_regs *tregs; - int prev_state, is_system; - int ret; - - DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); - - prev_state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - /* - * unload only when necessary - */ - if (prev_state == PFM_CTX_UNLOADED) { - DPRINT(("ctx_state=%d, nothing to do\n", prev_state)); - return 0; - } - - /* - * clear psr and dcr bits - */ - ret = pfm_stop(ctx, NULL, 0, regs); - if (ret) return ret; - - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. - */ - if (is_system) { - - /* - * Update cpuinfo - * - * local PMU is taken care of in pfm_stop() - */ - PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE); - PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE); - - /* - * save PMDs in context - * release ownership - */ - pfm_flush_pmds(current, ctx); - - /* - * at this point we are done with the PMU - * so we can unreserve the resource. - */ - if (prev_state != PFM_CTX_ZOMBIE) - pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu); - - /* - * disconnect context from task - */ - task->thread.pfm_context = NULL; - /* - * disconnect task from context - */ - ctx->ctx_task = NULL; - - /* - * There is nothing more to cleanup here. - */ - return 0; - } - - /* - * per-task mode - */ - tregs = task == current ? regs : task_pt_regs(task); - - if (task == current) { - /* - * cancel user level control - */ - ia64_psr(regs)->sp = 1; - - DPRINT(("setting psr.sp for [%d]\n", task->pid)); - } - /* - * save PMDs to context - * release ownership - */ - pfm_flush_pmds(task, ctx); - - /* - * at this point we are done with the PMU - * so we can unreserve the resource. - * - * when state was ZOMBIE, we have already unreserved. 
- */ - if (prev_state != PFM_CTX_ZOMBIE) - pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu); - - /* - * reset activation counter and psr - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * PMU state will not be restored - */ - task->thread.flags &= ~IA64_THREAD_PM_VALID; - - /* - * break links between context and task - */ - task->thread.pfm_context = NULL; - ctx->ctx_task = NULL; - - PFM_SET_WORK_PENDING(task, 0); - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - ctx->ctx_fl_can_restart = 0; - ctx->ctx_fl_going_zombie = 0; - - DPRINT(("disconnected [%d] from context\n", task->pid)); - - return 0; -} - - -/* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) - */ -void -pfm_exit_thread(struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - struct pt_regs *regs = task_pt_regs(task); - int ret, state; - int free_ok = 0; - - ctx = PFM_GET_CTX(task); - - PROTECT_CTX(ctx, flags); - - DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); - - state = ctx->ctx_state; - switch(state) { - case PFM_CTX_UNLOADED: - /* - * only comes to this function if pfm_context is not NULL, i.e., cannot - * be in unloaded state - */ - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); - break; - case PFM_CTX_LOADED: - case PFM_CTX_MASKED: - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); - } - DPRINT(("ctx unloaded for current state was %d\n", state)); - - pfm_end_notify_user(ctx); - break; - case PFM_CTX_ZOMBIE: - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); - } - free_ok = 1; - break; - default: - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); - break; - } - UNPROTECT_CTX(ctx, flags); - - { u64 psr = pfm_get_psr(); - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(GET_PMU_OWNER()); - BUG_ON(ia64_psr(regs)->up); - BUG_ON(ia64_psr(regs)->pp); - } - - /* - * All memory free operations (especially for vmalloc'ed memory) - * MUST be done with interrupts ENABLED. 
- */ - if (free_ok) pfm_context_free(ctx); -} - -/* - * functions MUST be listed in the increasing order of their index (see perfmon.h) - */ -#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz } -#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL } -#define PFM_CMD_PCLRWS (PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP) -#define PFM_CMD_PCLRW (PFM_CMD_FD|PFM_CMD_ARG_RW) -#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL} - -static pfm_cmd_desc_t pfm_cmd_tab[]={ -/* 0 */PFM_CMD_NONE, -/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS), -/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS), -/* 6 */PFM_CMD_NONE, -/* 7 */PFM_CMD_NONE, -/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize), -/* 9 */PFM_CMD_NONE, -/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW), -/* 11 */PFM_CMD_NONE, -/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL), -/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL), -/* 14 */PFM_CMD_NONE, -/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL), -/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS), -/* 18 */PFM_CMD_NONE, -/* 19 */PFM_CMD_NONE, -/* 20 */PFM_CMD_NONE, -/* 21 */PFM_CMD_NONE, -/* 22 */PFM_CMD_NONE, -/* 23 */PFM_CMD_NONE, -/* 24 */PFM_CMD_NONE, -/* 25 */PFM_CMD_NONE, -/* 26 */PFM_CMD_NONE, -/* 27 */PFM_CMD_NONE, -/* 28 */PFM_CMD_NONE, -/* 29 */PFM_CMD_NONE, -/* 30 */PFM_CMD_NONE, -/* 31 */PFM_CMD_NONE, -/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL), -/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL) -}; -#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t)) - -static int -pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) -{ - struct task_struct *task; - int state, old_state; - -recheck: - state = ctx->ctx_state; - task = ctx->ctx_task; - - if (task == NULL) { - DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state)); - return 0; - } - - DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", - ctx->ctx_fd, - state, - task->pid, - task->state, PFM_CMD_STOPPED(cmd))); - - /* - * self-monitoring always ok. - * - * for system-wide the caller can either be the creator of the - * context (the one to which the context is attached) OR - * a task running on the same CPU as the session. - */ - if (task == current || ctx->ctx_fl_system) return 0; - - /* - * we are monitoring another thread - */ - switch(state) { - case PFM_CTX_UNLOADED: - /* - * if context is UNLOADED we are safe to go - */ - return 0; - case PFM_CTX_ZOMBIE: - /* - * no command can operate on a zombie context - */ - DPRINT(("cmd %d state zombie cannot operate on context\n", cmd)); - return -EINVAL; - case PFM_CTX_MASKED: - /* - * PMU state has been saved to software even though - * the thread may still be running. - */ - if (cmd != PFM_UNLOAD_CONTEXT) return 0; - } - - /* - * context is LOADED or MASKED. Some commands may need to have - * the task stopped.
- * - * We could lift this restriction for UP but it would mean that - * the user has no guarantee the task would not run between - * two successive calls to perfmonctl(). That's probably OK. - * If this user wants to ensure the task does not run, then - * the task must be stopped. - */ - if (PFM_CMD_STOPPED(cmd)) { - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("[%d] task not in stopped state\n", task->pid)); - return -EBUSY; - } - /* - * task is now stopped, wait for ctxsw out - * - * This is an interesting point in the code. - * We need to unprotect the context because - * the pfm_save_regs() routines needs to grab - * the same lock. There are danger in doing - * this because it leaves a window open for - * another task to get access to the context - * and possibly change its state. The one thing - * that is not possible is for the context to disappear - * because we are protected by the VFS layer, i.e., - * get_fd()/put_fd(). - */ - old_state = state; - - UNPROTECT_CTX(ctx, flags); - - wait_task_inactive(task); - - PROTECT_CTX(ctx, flags); - - /* - * we must recheck to verify if state has changed - */ - if (ctx->ctx_state != old_state) { - DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state)); - goto recheck; - } - } - return 0; -} - -/* - * system-call entry point (must return long) - */ -asmlinkage long -sys_perfmonctl (int fd, int cmd, void __user *arg, int count) -{ - struct file *file = NULL; - pfm_context_t *ctx = NULL; - unsigned long flags = 0UL; - void *args_k = NULL; - long ret; /* will expand int return types */ - size_t base_sz, sz, xtra_sz = 0; - int narg, completed_args = 0, call_made = 0, cmd_flags; - int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - int (*getsize)(void *arg, size_t *sz); -#define PFM_MAX_ARGSIZE 4096 - - /* - * reject any call if perfmon was disabled at initialization - */ - if (unlikely(pmu_conf == NULL)) return -ENOSYS; - - if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { - DPRINT(("invalid cmd=%d\n", cmd)); - return -EINVAL; - } - - func = pfm_cmd_tab[cmd].cmd_func; - narg = pfm_cmd_tab[cmd].cmd_narg; - base_sz = pfm_cmd_tab[cmd].cmd_argsize; - getsize = pfm_cmd_tab[cmd].cmd_getsize; - cmd_flags = pfm_cmd_tab[cmd].cmd_flags; - - if (unlikely(func == NULL)) { - DPRINT(("invalid cmd=%d\n", cmd)); - return -EINVAL; - } - - DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n", - PFM_CMD_NAME(cmd), - cmd, - narg, - base_sz, - count)); - - /* - * check if number of arguments matches what the command expects - */ - if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count))) - return -EINVAL; - -restart_args: - sz = xtra_sz + base_sz*count; - /* - * limit abuse to min page size - */ - if (unlikely(sz > PFM_MAX_ARGSIZE)) { - printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); - return -E2BIG; - } - - /* - * allocate default-sized argument buffer - */ - if (likely(count && args_k == NULL)) { - args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL); - if (args_k == NULL) return -ENOMEM; - } - - ret = -EFAULT; - - /* - * copy arguments - * - * assume sz = 0 for command without parameters - */ - if (sz && copy_from_user(args_k, arg, sz)) { - DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg)); - goto error_args; - } - - /* - * check if command supports extra parameters - */ - if (completed_args == 0 && getsize) { - /* - * get extra parameters size (based on main argument) - */ - ret = (*getsize)(args_k, &xtra_sz); - if (ret) goto 
error_args; - - completed_args = 1; - - DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz)); - - /* retry if necessary */ - if (likely(xtra_sz)) goto restart_args; - } - - if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd; - - ret = -EBADF; - - file = fget(fd); - if (unlikely(file == NULL)) { - DPRINT(("invalid fd %d\n", fd)); - goto error_args; - } - if (unlikely(PFM_IS_FILE(file) == 0)) { - DPRINT(("fd %d not related to perfmon\n", fd)); - goto error_args; - } - - ctx = (pfm_context_t *)file->private_data; - if (unlikely(ctx == NULL)) { - DPRINT(("no context for fd %d\n", fd)); - goto error_args; - } - prefetch(&ctx->ctx_state); - - PROTECT_CTX(ctx, flags); - - /* - * check task is stopped - */ - ret = pfm_check_task_state(ctx, cmd, flags); - if (unlikely(ret)) goto abort_locked; - -skip_fd: - ret = (*func)(ctx, args_k, count, task_pt_regs(current)); - - call_made = 1; - -abort_locked: - if (likely(ctx)) { - DPRINT(("context unlocked\n")); - UNPROTECT_CTX(ctx, flags); - } - - /* copy argument back to user, if needed */ - if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; - -error_args: - if (file) - fput(file); - - kfree(args_k); - - DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); - - return ret; -} - -static void -pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs) -{ - pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; - pfm_ovfl_ctrl_t rst_ctrl; - int state; - int ret = 0; - - state = ctx->ctx_state; - /* - * Unlock sampling buffer and reset index atomically - * XXX: not really needed when blocking - */ - if (CTX_HAS_SMPL(ctx)) { - - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 0; - - if (state == PFM_CTX_LOADED) - ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - else - ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - } else { - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; - } - - if (ret == 0) { - if (rst_ctrl.bits.reset_ovfl_pmds) { - pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET); - } - if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring\n")); - if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current); - } else { - DPRINT(("stopping monitoring\n")); - //pfm_stop_monitoring(current, regs); - } - ctx->ctx_state = PFM_CTX_LOADED; - } -} - -/* - * context MUST BE LOCKED when calling - * can only be called for current - */ -static void -pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) -{ - int ret; - - DPRINT(("entering for [%d]\n", current->pid)); - - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); - } - - /* - * and wakeup controlling task, indicating we are now disconnected - */ - wake_up_interruptible(&ctx->ctx_zombieq); - - /* - * given that context is still locked, the controlling - * task will only get access when we return from - * pfm_handle_work(). - */ -} - -static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds); - /* - * pfm_handle_work() can be called with interrupts enabled - * (TIF_NEED_RESCHED) or disabled. The down_interruptible - * call may sleep, therefore we must re-enable interrupts - * to avoid deadlocks. 
It is safe to do so because this function - * is called ONLY when returning to user level (PUStk=1), in which case - * there is no risk of kernel stack overflow due to deep - * interrupt nesting. - */ -void -pfm_handle_work(void) -{ - pfm_context_t *ctx; - struct pt_regs *regs; - unsigned long flags, dummy_flags; - unsigned long ovfl_regs; - unsigned int reason; - int ret; - - ctx = PFM_GET_CTX(current); - if (ctx == NULL) { - printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid); - return; - } - - PROTECT_CTX(ctx, flags); - - PFM_SET_WORK_PENDING(current, 0); - - pfm_clear_task_notify(); - - regs = task_pt_regs(current); - - /* - * extract reason for being here and clear - */ - reason = ctx->ctx_fl_trap_reason; - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - ovfl_regs = ctx->ctx_ovfl_regs[0]; - - DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state)); - - /* - * must be done before we check for simple-reset mode - */ - if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie; - - - //if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking; - if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking; - - /* - * restore interrupt mask to what it was on entry. - * Could be enabled/diasbled. - */ - UNPROTECT_CTX(ctx, flags); - - /* - * force interrupt enable because of down_interruptible() - */ - local_irq_enable(); - - DPRINT(("before block sleeping\n")); - - /* - * may go through without blocking on SMP systems - * if restart has been received already by the time we call down() - */ - ret = wait_for_completion_interruptible(&ctx->ctx_restart_done); - - DPRINT(("after block sleeping ret=%d\n", ret)); - - /* - * lock context and mask interrupts again - * We save flags into a dummy because we may have - * altered interrupts mask compared to entry in this - * function. - */ - PROTECT_CTX(ctx, dummy_flags); - - /* - * we need to read the ovfl_regs only after wake-up - * because we may have had pfm_write_pmds() in between - * and that can changed PMD values and therefore - * ovfl_regs is reset for these new PMD values. 
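
pfm_handle_work() above is the monitored thread's side of the blocking hand-shake: it sleeps on ctx_restart_done until the controlling tool replies with PFM_RESTART (command 10 in the table). The tool side, sketched below, just reads pfm_msg_t messages from the context file descriptor; the message fields match what pfm_ovfl_notify_user() and pfm_end_notify_user() further down fill in, and the perfmonctl() wrapper is the same assumption as in the earlier sketch.

    /*
     * Hedged sketch of the controlling tool's notification loop.  The
     * pfm_msg_t layout is assumed to come from the perfmon user headers;
     * ctx_fd is a context created as in the previous example.
     */
    #include <unistd.h>
    #include <perfmon/perfmon.h>

    void notification_loop(int ctx_fd)
    {
        pfm_msg_t msg;

        for (;;) {
            if (read(ctx_fd, &msg, sizeof(msg)) != sizeof(msg))
                break;                          /* fd closed or error          */

            switch (msg.pfm_ovfl_msg.msg_type) {
            case PFM_MSG_OVFL:
                /* process the sampling buffer here, then let the blocked
                 * thread continue (this wakes ctx_restart_done above) */
                perfmonctl(ctx_fd, PFM_RESTART, NULL, 0);
                break;
            case PFM_MSG_END:
                /* monitored thread exited: collect final counts and stop */
                return;
            }
        }
    }
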
- */ - ovfl_regs = ctx->ctx_ovfl_regs[0]; - - if (ctx->ctx_fl_going_zombie) { -do_zombie: - DPRINT(("context is zombie, bailing out\n")); - pfm_context_force_terminate(ctx, regs); - goto nothing_to_do; - } - /* - * in case of interruption of down() we don't restart anything - */ - if (ret < 0) goto nothing_to_do; - -skip_blocking: - pfm_resume_after_ovfl(ctx, ovfl_regs, regs); - ctx->ctx_ovfl_regs[0] = 0UL; - -nothing_to_do: - /* - * restore flags as they were upon entry - */ - UNPROTECT_CTX(ctx, flags); -} - -static int -pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg) -{ - if (ctx->ctx_state == PFM_CTX_ZOMBIE) { - DPRINT(("ignoring overflow notification, owner is zombie\n")); - return 0; - } - - DPRINT(("waking up somebody\n")); - - if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait); - - /* - * safe, we are not in intr handler, nor in ctxsw when - * we come here - */ - kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN); - - return 0; -} - -static int -pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds) -{ - pfm_msg_t *msg = NULL; - - if (ctx->ctx_fl_no_msg == 0) { - msg = pfm_get_new_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n"); - return -1; - } - - msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; - msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd; - msg->pfm_ovfl_msg.msg_active_set = 0; - msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds; - msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL; - msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL; - msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL; - msg->pfm_ovfl_msg.msg_tstamp = 0UL; - } - - DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n", - msg, - ctx->ctx_fl_no_msg, - ctx->ctx_fd, - ovfl_pmds)); - - return pfm_notify_user(ctx, msg); -} - -static int -pfm_end_notify_user(pfm_context_t *ctx) -{ - pfm_msg_t *msg; - - msg = pfm_get_new_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n"); - return -1; - } - /* no leak */ - memset(msg, 0, sizeof(*msg)); - - msg->pfm_end_msg.msg_type = PFM_MSG_END; - msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd; - msg->pfm_ovfl_msg.msg_tstamp = 0UL; - - DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n", - msg, - ctx->ctx_fl_no_msg, - ctx->ctx_fd)); - - return pfm_notify_user(ctx, msg); -} - -/* - * main overflow processing routine. - * it can be called from the interrupt path or explicitly during the context switch code - */ -static void -pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) -{ - pfm_ovfl_arg_t *ovfl_arg; - unsigned long mask; - unsigned long old_val, ovfl_val, new_val; - unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; - unsigned long tstamp; - pfm_ovfl_ctrl_t ovfl_ctrl; - unsigned int i, has_smpl; - int must_notify = 0; - - if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring; - - /* - * sanity test. Should never happen - */ - if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check; - - tstamp = ia64_get_itc(); - mask = pmc0 >> PMU_FIRST_COUNTER; - ovfl_val = pmu_conf->ovfl_val; - has_smpl = CTX_HAS_SMPL(ctx); - - DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " - "used_pmds=0x%lx\n", - pmc0, - task ? task->pid: -1, - (regs ? regs->cr_iip : 0), - CTX_OVFL_NOBLOCK(ctx) ? 
"nonblocking" : "blocking", - ctx->ctx_used_pmds[0])); - - - /* - * first we update the virtual counters - * assume there was a prior ia64_srlz_d() issued - */ - for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) { - - /* skip pmd which did not overflow */ - if ((mask & 0x1) == 0) continue; - - /* - * Note that the pmd is not necessarily 0 at this point as qualified events - * may have happened before the PMU was frozen. The residual count is not - * taken into consideration here but will be with any read of the pmd via - * pfm_read_pmds(). - */ - old_val = new_val = ctx->ctx_pmds[i].val; - new_val += 1 + ovfl_val; - ctx->ctx_pmds[i].val = new_val; - - /* - * check for overflow condition - */ - if (likely(old_val > new_val)) { - ovfl_pmds |= 1UL << i; - if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i; - } - - DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", - i, - new_val, - old_val, - ia64_get_pmd(i) & ovfl_val, - ovfl_pmds, - ovfl_notify)); - } - - /* - * there was no 64-bit overflow, nothing else to do - */ - if (ovfl_pmds == 0UL) return; - - /* - * reset all control bits - */ - ovfl_ctrl.val = 0; - reset_pmds = 0UL; - - /* - * if a sampling format module exists, then we "cache" the overflow by - * calling the module's handler() routine. - */ - if (has_smpl) { - unsigned long start_cycles, end_cycles; - unsigned long pmd_mask; - int j, k, ret = 0; - int this_cpu = smp_processor_id(); - - pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; - ovfl_arg = &ctx->ctx_ovfl_arg; - - prefetch(ctx->ctx_smpl_hdr); - - for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) { - - mask = 1UL << i; - - if ((pmd_mask & 0x1) == 0) continue; - - ovfl_arg->ovfl_pmd = (unsigned char )i; - ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0; - ovfl_arg->active_set = 0; - ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */ - ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; - - ovfl_arg->pmd_value = ctx->ctx_pmds[i].val; - ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval; - ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid; - - /* - * copy values of pmds of interest. Sampling format may copy them - * into sampling buffer. - */ - if (smpl_pmds) { - for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { - if ((smpl_pmds & 0x1) == 0) continue; - ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); - DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1])); - } - } - - pfm_stats[this_cpu].pfm_smpl_handler_calls++; - - start_cycles = ia64_get_itc(); - - /* - * call custom buffer format record (handler) routine - */ - ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp); - - end_cycles = ia64_get_itc(); - - /* - * For those controls, we take the union because they have - * an all or nothing behavior. 
- */ - ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user; - ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task; - ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring; - /* - * build the bitmask of pmds to reset now - */ - if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; - - pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; - } - /* - * when the module cannot handle the rest of the overflows, we abort right here - */ - if (ret && pmd_mask) { - DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n", - pmd_mask<ctx_ovfl_regs[0] = ovfl_pmds; - - /* - * check for blocking context - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) { - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK; - - /* - * set the perfmon specific checking pending work for the task - */ - PFM_SET_WORK_PENDING(task, 1); - - /* - * when coming from ctxsw, current still points to the - * previous task, therefore we must work with task and not current. - */ - pfm_set_task_notify(task); - } - /* - * defer until state is changed (shorten spin window). the context is locked - * anyway, so the signal receiver would come spin for nothing. - */ - must_notify = 1; - } - - DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", - GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, - PFM_GET_WORK_PENDING(task), - ctx->ctx_fl_trap_reason, - ovfl_pmds, - ovfl_notify, - ovfl_ctrl.bits.mask_monitoring ? 1 : 0)); - /* - * in case monitoring must be stopped, we toggle the psr bits - */ - if (ovfl_ctrl.bits.mask_monitoring) { - pfm_mask_monitoring(task); - ctx->ctx_state = PFM_CTX_MASKED; - ctx->ctx_fl_can_restart = 1; - } - - /* - * send notification now - */ - if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify); - - return; - -sanity_check: - printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", - smp_processor_id(), - task ? task->pid : -1, - pmc0); - return; - -stop_monitoring: - /* - * in SMP, zombie context is never restored but reclaimed in pfm_load_regs(). - * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can - * come here as zombie only if the task is the current task. In which case, we - * can access the PMU hardware directly. - * - * Note that zombies do have PM_VALID set. So here we do the minimal. - * - * In case the context was zombified it could not be reclaimed at the time - * the monitoring program exited. At this point, the PMU reservation has been - * returned, the sampiing buffer has been freed. We must convert this call - * into a spurious interrupt. However, we must also avoid infinite overflows - * by stopping monitoring for this task. We can only come here for a per-task - * context. All we need to do is to stop monitoring using the psr bits which - * are always task private. By re-enabling secure montioring, we ensure that - * the monitored task will not be able to re-activate monitoring. - * The task will eventually be context switched out, at which point the context - * will be reclaimed (that includes releasing ownership of the PMU). - * - * So there might be a window of time where the number of per-task session is zero - * yet one PMU might have a owner and get at most one overflow interrupt for a zombie - * context. This is safe because if a per-task session comes in, it will push this one - * out and by the virtue on pfm_save_regs(), this one will disappear. 
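
The increment by 1 + ovfl_val in the update loop above is how a narrow hardware counter is promoted to a 64-bit virtual one: only the implemented low bits live in the PMD register, everything above them accumulates in ctx_pmds[].val, and a 64-bit wrap of the combined value is what raises the overflow notification. The stand-alone model below arms a sampling period and detects that wrap; the 47-bit width and the period are made-up values, the kernel takes the real mask from pmu_conf->ovfl_val.

    /* Minimal stand-alone model of the 64-bit "virtual counter" arithmetic. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t ovfl_val = (1ULL << 47) - 1;   /* mask of HW-implemented bits */
        const uint64_t period   = 100000;             /* desired sampling period     */

        /* arm the counter: the full 64-bit value wraps after `period` events;
         * only the low bits are written to the hardware PMD */
        uint64_t val    = 0 - period;
        uint64_t hw_pmd = val &  ovfl_val;            /* goes into the PMD register  */
        uint64_t soft   = val & ~ovfl_val;            /* kept in ctx_pmds[].val      */

        /* a hardware overflow interrupt arrives: fold it into the soft copy,
         * exactly like "new_val += 1 + ovfl_val" in the handler above */
        uint64_t old_val = soft, new_val = soft + 1 + ovfl_val;

        printf("hw_pmd=0x%llx 64-bit overflow=%s\n",
               (unsigned long long)hw_pmd,
               old_val > new_val ? "yes (notify)" : "no");
        return 0;
    }
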
If a system wide - * session is force on that CPU, given that we use task pinning, pfm_save_regs() will - * also push our zombie context out. - * - * Overall pretty hairy stuff.... - */ - DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1)); - pfm_clear_psr_up(); - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; - return; -} - -static int -pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) -{ - struct task_struct *task; - pfm_context_t *ctx; - unsigned long flags; - u64 pmc0; - int this_cpu = smp_processor_id(); - int retval = 0; - - pfm_stats[this_cpu].pfm_ovfl_intr_count++; - - /* - * srlz.d done before arriving here - */ - pmc0 = ia64_get_pmc(0); - - task = GET_PMU_OWNER(); - ctx = GET_PMU_CTX(); - - /* - * if we have some pending bits set - * assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1 - */ - if (PMC0_HAS_OVFL(pmc0) && task) { - /* - * we assume that pmc0.fr is always set here - */ - - /* sanity check */ - if (!ctx) goto report_spurious1; - - if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) - goto report_spurious2; - - PROTECT_CTX_NOPRINT(ctx, flags); - - pfm_overflow_handler(task, ctx, pmc0, regs); - - UNPROTECT_CTX_NOPRINT(ctx, flags); - - } else { - pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++; - retval = -1; - } - /* - * keep it unfrozen at all times - */ - pfm_unfreeze_pmu(); - - return retval; - -report_spurious1: - printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", - this_cpu, task->pid); - pfm_unfreeze_pmu(); - return -1; -report_spurious2: - printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", - this_cpu, - task->pid); - pfm_unfreeze_pmu(); - return -1; -} - -static irqreturn_t -pfm_interrupt_handler(int irq, void *arg) -{ - unsigned long start_cycles, total_cycles; - unsigned long min, max; - int this_cpu; - int ret; - struct pt_regs *regs = get_irq_regs(); - - this_cpu = get_cpu(); - if (likely(!pfm_alt_intr_handler)) { - min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min; - max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max; - - start_cycles = ia64_get_itc(); - - ret = pfm_do_interrupt_handler(irq, arg, regs); - - total_cycles = ia64_get_itc(); - - /* - * don't measure spurious interrupts - */ - if (likely(ret == 0)) { - total_cycles -= start_cycles; - - if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles; - if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles; - - pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles; - } - } - else { - (*pfm_alt_intr_handler->handler)(irq, arg, regs); - } - - put_cpu_no_resched(); - return IRQ_HANDLED; -} - -/* - * /proc/perfmon interface, for debug only - */ - -#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) - -static void * -pfm_proc_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) { - return PFM_PROC_SHOW_HEADER; - } - - while (*pos <= NR_CPUS) { - if (cpu_online(*pos - 1)) { - return (void *)*pos; - } - ++*pos; - } - return NULL; -} - -static void * -pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return pfm_proc_start(m, pos); -} - -static void -pfm_proc_stop(struct seq_file *m, void *v) -{ -} - -static void -pfm_proc_show_header(struct seq_file *m) -{ - struct list_head * pos; - pfm_buffer_fmt_t * entry; - unsigned long flags; - - seq_printf(m, - "perfmon version : %u.%u\n" - "model : %s\n" - "fastctxsw : %s\n" - "expert mode : %s\n" - "ovfl_mask : 
0x%lx\n" - "PMU flags : 0x%x\n", - PFM_VERSION_MAJ, PFM_VERSION_MIN, - pmu_conf->pmu_name, - pfm_sysctl.fastctxsw > 0 ? "Yes": "No", - pfm_sysctl.expert_mode > 0 ? "Yes": "No", - pmu_conf->ovfl_val, - pmu_conf->flags); - - LOCK_PFS(flags); - - seq_printf(m, - "proc_sessions : %u\n" - "sys_sessions : %u\n" - "sys_use_dbregs : %u\n" - "ptrace_use_dbregs : %u\n", - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_sys_use_dbregs, - pfm_sessions.pfs_ptrace_use_dbregs); - - UNLOCK_PFS(flags); - - spin_lock(&pfm_buffer_fmt_lock); - - list_for_each(pos, &pfm_buffer_fmt_list) { - entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - entry->fmt_uuid[0], - entry->fmt_uuid[1], - entry->fmt_uuid[2], - entry->fmt_uuid[3], - entry->fmt_uuid[4], - entry->fmt_uuid[5], - entry->fmt_uuid[6], - entry->fmt_uuid[7], - entry->fmt_uuid[8], - entry->fmt_uuid[9], - entry->fmt_uuid[10], - entry->fmt_uuid[11], - entry->fmt_uuid[12], - entry->fmt_uuid[13], - entry->fmt_uuid[14], - entry->fmt_uuid[15], - entry->fmt_name); - } - spin_unlock(&pfm_buffer_fmt_lock); - -} - -static int -pfm_proc_show(struct seq_file *m, void *v) -{ - unsigned long psr; - unsigned int i; - int cpu; - - if (v == PFM_PROC_SHOW_HEADER) { - pfm_proc_show_header(m); - return 0; - } - - /* show info for CPU (v - 1) */ - - cpu = (long)v - 1; - seq_printf(m, - "CPU%-2d overflow intrs : %lu\n" - "CPU%-2d overflow cycles : %lu\n" - "CPU%-2d overflow min : %lu\n" - "CPU%-2d overflow max : %lu\n" - "CPU%-2d smpl handler calls : %lu\n" - "CPU%-2d smpl handler cycles : %lu\n" - "CPU%-2d spurious intrs : %lu\n" - "CPU%-2d replay intrs : %lu\n" - "CPU%-2d syst_wide : %d\n" - "CPU%-2d dcr_pp : %d\n" - "CPU%-2d exclude idle : %d\n" - "CPU%-2d owner : %d\n" - "CPU%-2d context : %p\n" - "CPU%-2d activations : %lu\n", - cpu, pfm_stats[cpu].pfm_ovfl_intr_count, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max, - cpu, pfm_stats[cpu].pfm_smpl_handler_calls, - cpu, pfm_stats[cpu].pfm_smpl_handler_cycles, - cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count, - cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0, - cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1, - cpu, pfm_get_cpu_data(pmu_ctx, cpu), - cpu, pfm_get_cpu_data(pmu_activation_number, cpu)); - - if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) { - - psr = pfm_get_psr(); - - ia64_srlz_d(); - - seq_printf(m, - "CPU%-2d psr : 0x%lx\n" - "CPU%-2d pmc0 : 0x%lx\n", - cpu, psr, - cpu, ia64_get_pmc(0)); - - for (i=0; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_COUNTING(i) == 0) continue; - seq_printf(m, - "CPU%-2d pmc%u : 0x%lx\n" - "CPU%-2d pmd%u : 0x%lx\n", - cpu, i, ia64_get_pmc(i), - cpu, i, ia64_get_pmd(i)); - } - } - return 0; -} - -struct seq_operations pfm_seq_ops = { - .start = pfm_proc_start, - .next = pfm_proc_next, - .stop = pfm_proc_stop, - .show = pfm_proc_show -}; - -static int -pfm_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &pfm_seq_ops); -} - - -/* - * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens - * during pfm_enable() hence before pfm_start(). 
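
All of the DPRINT()/DPRINT_ovfl() output in this file is gated by pfm_sysctl.debug and pfm_sysctl.debug_ovfl, the same knobs /proc/perfmon reports above; pfm_init() further down exposes them through register_sysctl_table(). Assuming they surface under /proc/sys/kernel/perfmon/ as on ia64 kernels of this era, a tool can flip them like this:

    /* Turn on perfmon debug logging; the sysctl path is an assumption,
     * check the kernel's sysctl table if it differs. */
    #include <stdio.h>

    static int set_sysctl(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");
        if (!f)
            return -1;
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        set_sysctl("/proc/sys/kernel/perfmon/debug", "1");
        set_sysctl("/proc/sys/kernel/perfmon/debug_ovfl", "1");
        return 0;
    }
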
We cannot assume monitoring - * is active or inactive based on mode. We must rely on the value in - * local_cpu_data->pfm_syst_info - */ -void -pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) -{ - struct pt_regs *regs; - unsigned long dcr; - unsigned long dcr_pp; - - dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0; - - /* - * pid 0 is guaranteed to be the idle task. There is one such task with pid 0 - * on every CPU, so we can rely on the pid to identify the idle task. - */ - if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) { - regs = task_pt_regs(task); - ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0; - return; - } - /* - * if monitoring has started - */ - if (dcr_pp) { - dcr = ia64_getreg(_IA64_REG_CR_DCR); - /* - * context switching in? - */ - if (is_ctxswin) { - /* mask monitoring for the idle task */ - ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); - pfm_clear_psr_pp(); - ia64_srlz_i(); - return; - } - /* - * context switching out - * restore monitoring for next task - * - * Due to inlining this odd if-then-else construction generates - * better code. - */ - ia64_setreg(_IA64_REG_CR_DCR, dcr |IA64_DCR_PP); - pfm_set_psr_pp(); - ia64_srlz_i(); - } -} - -#ifdef CONFIG_SMP - -static void -pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) -{ - struct task_struct *task = ctx->ctx_task; - - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; - - if (GET_PMU_OWNER() == task) { - DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); - SET_PMU_OWNER(NULL, NULL); - } - - /* - * disconnect the task from the context and vice-versa - */ - PFM_SET_WORK_PENDING(task, 0); - - task->thread.pfm_context = NULL; - task->thread.flags &= ~IA64_THREAD_PM_VALID; - - DPRINT(("force cleanup for [%d]\n", task->pid)); -} - - -/* - * in 2.6, interrupts are masked when we come here and the runqueue lock is held - */ -void -pfm_save_regs(struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - u64 psr; - - - ctx = PFM_GET_CTX(task); - if (ctx == NULL) return; - - /* - * we always come here with interrupts ALREADY disabled by - * the scheduler. So we simply need to protect against concurrent - * access, not CPU concurrency. - */ - flags = pfm_protect_ctx_ctxsw(ctx); - - if (ctx->ctx_state == PFM_CTX_ZOMBIE) { - struct pt_regs *regs = task_pt_regs(task); - - pfm_clear_psr_up(); - - pfm_force_cleanup(ctx, regs); - - BUG_ON(ctx->ctx_smpl_hdr); - - pfm_unprotect_ctx_ctxsw(ctx, flags); - - pfm_context_free(ctx); - return; - } - - /* - * save current PSR: needed because we modify it - */ - ia64_srlz_d(); - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_I)); - - /* - * stop monitoring: - * This is the last instruction which may generate an overflow - * - * We do not need to set psr.sp because, it is irrelevant in kernel. - * It will be restored from ipsr when going back to user level - */ - pfm_clear_psr_up(); - - /* - * keep a copy of psr.up (for reload) - */ - ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; - - /* - * release ownership of this PMU. - * PM interrupts are masked, so nothing - * can happen. - */ - SET_PMU_OWNER(NULL, NULL); - - /* - * we systematically save the PMD as we have no - * guarantee we will be schedule at that same - * CPU again. - */ - pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); - - /* - * save pmc0 ia64_srlz_d() done in pfm_save_pmds() - * we will need it on the restore path to check - * for pending overflow. 
- */ - ctx->th_pmcs[0] = ia64_get_pmc(0); - - /* - * unfreeze PMU if had pending overflows - */ - if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); - - /* - * finally, allow context access. - * interrupts will still be masked after this call. - */ - pfm_unprotect_ctx_ctxsw(ctx, flags); -} - -#else /* !CONFIG_SMP */ -void -pfm_save_regs(struct task_struct *task) -{ - pfm_context_t *ctx; - u64 psr; - - ctx = PFM_GET_CTX(task); - if (ctx == NULL) return; - - /* - * save current PSR: needed because we modify it - */ - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_I)); - - /* - * stop monitoring: - * This is the last instruction which may generate an overflow - * - * We do not need to set psr.sp because, it is irrelevant in kernel. - * It will be restored from ipsr when going back to user level - */ - pfm_clear_psr_up(); - - /* - * keep a copy of psr.up (for reload) - */ - ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; -} - -static void -pfm_lazy_save_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - - { u64 psr = pfm_get_psr(); - BUG_ON(psr & IA64_PSR_UP); - } - - ctx = PFM_GET_CTX(task); - - /* - * we need to mask PMU overflow here to - * make sure that we maintain pmc0 until - * we save it. overflow interrupts are - * treated as spurious if there is no - * owner. - * - * XXX: I don't think this is necessary - */ - PROTECT_CTX(ctx,flags); - - /* - * release ownership of this PMU. - * must be done before we save the registers. - * - * after this call any PMU interrupt is treated - * as spurious. - */ - SET_PMU_OWNER(NULL, NULL); - - /* - * save all the pmds we use - */ - pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); - - /* - * save pmc0 ia64_srlz_d() done in pfm_save_pmds() - * it is needed to check for pended overflow - * on the restore path - */ - ctx->th_pmcs[0] = ia64_get_pmc(0); - - /* - * unfreeze PMU if had pending overflows - */ - if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); - - /* - * now get can unmask PMU interrupts, they will - * be treated as purely spurious and we will not - * lose any information - */ - UNPROTECT_CTX(ctx,flags); -} -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_SMP -/* - * in 2.6, interrupts are masked when we come here and the runqueue lock is held - */ -void -pfm_load_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long pmc_mask = 0UL, pmd_mask = 0UL; - unsigned long flags; - u64 psr, psr_up; - int need_irq_resend; - - ctx = PFM_GET_CTX(task); - if (unlikely(ctx == NULL)) return; - - BUG_ON(GET_PMU_OWNER()); - - /* - * possible on unload - */ - if (unlikely((task->thread.flags & IA64_THREAD_PM_VALID) == 0)) return; - - /* - * we always come here with interrupts ALREADY disabled by - * the scheduler. So we simply need to protect against concurrent - * access, not CPU concurrency. - */ - flags = pfm_protect_ctx_ctxsw(ctx); - psr = pfm_get_psr(); - - need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; - - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(psr & IA64_PSR_I); - - if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { - struct pt_regs *regs = task_pt_regs(task); - - BUG_ON(ctx->ctx_smpl_hdr); - - pfm_force_cleanup(ctx, regs); - - pfm_unprotect_ctx_ctxsw(ctx, flags); - - /* - * this one (kmalloc'ed) is fine with interrupts disabled - */ - pfm_context_free(ctx); - - return; - } - - /* - * we restore ALL the debug registers to avoid picking up - * stale state. 
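
On UP kernels the spill is lazy: pfm_save_regs() above only clears psr.up and the PMD values stay live in the hardware until some other context needs the PMU, at which point pfm_lazy_save_regs() pushes the previous owner out. The toy model below restates that ownership discipline with made-up types standing in for pfm_context_t and the register file; the short path it takes when the owner is unchanged is the same one the UP pfm_load_regs() further down takes.

    /* Toy model of lazy PMU ownership (UP case); types are illustrative only. */
    #include <stdio.h>
    #include <string.h>

    #define NREGS 8

    struct toy_ctx {
        unsigned long saved[NREGS];   /* software copy, like ctx->th_pmds[]   */
        int valid;                    /* do we hold a spilled copy?           */
    };

    static unsigned long pmu_regs[NREGS];   /* the one shared hardware PMU    */
    static struct toy_ctx *pmu_owner;       /* like GET_PMU_OWNER()           */

    static void lazy_save(struct toy_ctx *old)
    {
        memcpy(old->saved, pmu_regs, sizeof(pmu_regs));   /* spill only now   */
        old->valid = 1;
        pmu_owner = NULL;                                 /* release ownership */
    }

    static void load_regs(struct toy_ctx *ctx)
    {
        if (pmu_owner == ctx)
            return;                      /* short path: state is still live   */
        if (pmu_owner)
            lazy_save(pmu_owner);        /* push the previous owner out       */
        if (ctx->valid)
            memcpy(pmu_regs, ctx->saved, sizeof(pmu_regs));
        pmu_owner = ctx;                 /* like SET_PMU_OWNER()              */
    }

    int main(void)
    {
        struct toy_ctx a = { {0}, 0 }, b = { {0}, 0 };

        load_regs(&a);   /* A owns the PMU                              */
        load_regs(&a);   /* fast path: nothing to reload                */
        load_regs(&b);   /* A is lazily spilled only now, then B loads  */
        printf("owner is %s\n", pmu_owner == &b ? "b" : "a");
        return 0;
    }
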
- */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - /* - * retrieve saved psr.up - */ - psr_up = ctx->ctx_saved_psr_up; - - /* - * if we were the last user of the PMU on that CPU, - * then nothing to do except restore psr - */ - if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) { - - /* - * retrieve partial reload masks (due to user modifications) - */ - pmc_mask = ctx->ctx_reload_pmcs[0]; - pmd_mask = ctx->ctx_reload_pmds[0]; - - } else { - /* - * To avoid leaking information to the user level when psr.sp=0, - * we must reload ALL implemented pmds (even the ones we don't use). - * In the kernel we only allow PFM_READ_PMDS on registers which - * we initialized or requested (sampling) so there is no risk there. - */ - pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; - - /* - * ALL accessible PMCs are systematically reloaded, unused registers - * get their default (from pfm_reset_pmu_state()) values to avoid picking - * up stale configuration. - * - * PMC0 is never in the mask. It is always restored separately. - */ - pmc_mask = ctx->ctx_all_pmcs[0]; - } - /* - * when context is MASKED, we will restore PMC with plm=0 - * and PMD with stale information, but that's ok, nothing - * will be captured. - * - * XXX: optimize here - */ - if (pmd_mask) pfm_restore_pmds(ctx->th_pmds, pmd_mask); - if (pmc_mask) pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); - - /* - * check for pending overflow at the time the state - * was saved. - */ - if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { - /* - * reload pmc0 with the overflow information - * On McKinley PMU, this will trigger a PMU interrupt - */ - ia64_set_pmc(0, ctx->th_pmcs[0]); - ia64_srlz_d(); - ctx->th_pmcs[0] = 0UL; - - /* - * will replay the PMU interrupt - */ - if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); - - pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; - } - - /* - * we just did a reload, so we reset the partial reload fields - */ - ctx->ctx_reload_pmcs[0] = 0UL; - ctx->ctx_reload_pmds[0] = 0UL; - - SET_LAST_CPU(ctx, smp_processor_id()); - - /* - * dump activation value for this PMU - */ - INC_ACTIVATION(); - /* - * record current activation for this context - */ - SET_ACTIVATION(ctx); - - /* - * establish new ownership. - */ - SET_PMU_OWNER(task, ctx); - - /* - * restore the psr.up bit. measurement - * is active again. - * no PMU interrupt can happen at this point - * because we still have interrupts disabled. - */ - if (likely(psr_up)) pfm_set_psr_up(); - - /* - * allow concurrent access to context - */ - pfm_unprotect_ctx_ctxsw(ctx, flags); -} -#else /* !CONFIG_SMP */ -/* - * reload PMU state for UP kernels - * in 2.5 we come here with interrupts disabled - */ -void -pfm_load_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - struct task_struct *owner; - unsigned long pmd_mask, pmc_mask; - u64 psr, psr_up; - int need_irq_resend; - - owner = GET_PMU_OWNER(); - ctx = PFM_GET_CTX(task); - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(psr & IA64_PSR_I); - - /* - * we restore ALL the debug registers to avoid picking up - * stale state. - * - * This must be done even when the task is still the owner - * as the registers may have been modified via ptrace() - * (not perfmon) by the previous task. 
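
On SMP the equivalent short path is keyed on an activation number: each CPU counts PMU activations, and a context remembers the CPU and the activation at which it last owned the hardware (SET_LAST_CPU()/SET_ACTIVATION() above). If both still match on switch-in, only the registers modified in between, tracked by the reload masks, have to be rewritten. A compressed model of that test, with hypothetical names:

    /* Toy model of the GET_LAST_CPU()/GET_ACTIVATION() short path. */
    #include <stdio.h>

    struct toy_ctx {
        int last_cpu;                 /* like SET_LAST_CPU(ctx, ...)          */
        unsigned long last_act;       /* like ctx->ctx_last_activation        */
    };

    static unsigned long cpu_activation[2];   /* per-CPU activation counter   */

    static int needs_full_reload(struct toy_ctx *ctx, int cpu)
    {
        /* state survived iff nobody else activated the PMU on this CPU
         * since we last ran here */
        return !(ctx->last_cpu == cpu &&
                 ctx->last_act == cpu_activation[cpu]);
    }

    static void load(struct toy_ctx *ctx, int cpu)
    {
        printf("%s reload\n", needs_full_reload(ctx, cpu) ? "full" : "partial");
        cpu_activation[cpu]++;                /* like INC_ACTIVATION()        */
        ctx->last_cpu = cpu;
        ctx->last_act = cpu_activation[cpu];  /* like SET_ACTIVATION(ctx)     */
    }

    int main(void)
    {
        struct toy_ctx a = { -1, 0 }, b = { -1, 0 };

        load(&a, 0);   /* full: first time on CPU 0             */
        load(&a, 0);   /* partial: still the last activation    */
        load(&b, 0);   /* full: b never ran here                */
        load(&a, 0);   /* full: b bumped the activation count   */
        return 0;
    }
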
- */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - - /* - * retrieved saved psr.up - */ - psr_up = ctx->ctx_saved_psr_up; - need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; - - /* - * short path, our state is still there, just - * need to restore psr and we go - * - * we do not touch either PMC nor PMD. the psr is not touched - * by the overflow_handler. So we are safe w.r.t. to interrupt - * concurrency even without interrupt masking. - */ - if (likely(owner == task)) { - if (likely(psr_up)) pfm_set_psr_up(); - return; - } - - /* - * someone else is still using the PMU, first push it out and - * then we'll be able to install our stuff ! - * - * Upon return, there will be no owner for the current PMU - */ - if (owner) pfm_lazy_save_regs(owner); - - /* - * To avoid leaking information to the user level when psr.sp=0, - * we must reload ALL implemented pmds (even the ones we don't use). - * In the kernel we only allow PFM_READ_PMDS on registers which - * we initialized or requested (sampling) so there is no risk there. - */ - pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; - - /* - * ALL accessible PMCs are systematically reloaded, unused registers - * get their default (from pfm_reset_pmu_state()) values to avoid picking - * up stale configuration. - * - * PMC0 is never in the mask. It is always restored separately - */ - pmc_mask = ctx->ctx_all_pmcs[0]; - - pfm_restore_pmds(ctx->th_pmds, pmd_mask); - pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); - - /* - * check for pending overflow at the time the state - * was saved. - */ - if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { - /* - * reload pmc0 with the overflow information - * On McKinley PMU, this will trigger a PMU interrupt - */ - ia64_set_pmc(0, ctx->th_pmcs[0]); - ia64_srlz_d(); - - ctx->th_pmcs[0] = 0UL; - - /* - * will replay the PMU interrupt - */ - if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); - - pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; - } - - /* - * establish new ownership. - */ - SET_PMU_OWNER(task, ctx); - - /* - * restore the psr.up bit. measurement - * is active again. - * no PMU interrupt can happen at this point - * because we still have interrupts disabled. - */ - if (likely(psr_up)) pfm_set_psr_up(); -} -#endif /* CONFIG_SMP */ - -/* - * this function assumes monitoring is stopped - */ -static void -pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) -{ - u64 pmc0; - unsigned long mask2, val, pmd_val, ovfl_val; - int i, can_access_pmu = 0; - int is_self; - - /* - * is the caller the task being monitored (or which initiated the - * session for system wide measurements) - */ - is_self = ctx->ctx_task == task ? 1 : 0; - - /* - * can access PMU is task is the owner of the PMU state on the current CPU - * or if we are running on the CPU bound to the context in system-wide mode - * (that is not necessarily the task the context is attached to in this mode). - * In system-wide we always have can_access_pmu true because a task running on an - * invalid processor is flagged earlier in the call stack (see pfm_stop). 
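
pfm_flush_pmds() below folds the live hardware bits back into the software copy when a context is detached, and the same formula is what PFM_READ_PMDS ultimately reports: the soft upper bits plus the masked hardware value, plus one extra wrap if pmc0 says the counter overflowed before it could be read. A short worked example, reusing the made-up 47-bit width from earlier:

    /* Reconstructing the 64-bit counter value on read-out (47-bit HW width assumed). */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t ovfl_val = (1ULL << 47) - 1;   /* pmu_conf->ovfl_val          */
        uint64_t soft    = 3 * (ovfl_val + 1);        /* upper bits, ctx_pmds[].val  */
        uint64_t hw_pmd  = 0x1234;                    /* live low bits in the PMD    */
        int pending_ovfl = 1;                         /* pmc0 bit for this counter   */

        uint64_t val = soft + (hw_pmd & ovfl_val);    /* fold hardware into soft     */
        if (pending_ovfl)
            val += 1 + ovfl_val;                      /* overflow taken "inline"     */

        printf("counter = 0x%llx\n", (unsigned long long)val);
        return 0;
    }
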
- */ - can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id()); - if (can_access_pmu) { - /* - * Mark the PMU as not owned - * This will cause the interrupt handler to do nothing in case an overflow - * interrupt was in-flight - * This also guarantees that pmc0 will contain the final state - * It virtually gives us full control on overflow processing from that point - * on. - */ - SET_PMU_OWNER(NULL, NULL); - DPRINT(("releasing ownership\n")); - - /* - * read current overflow status: - * - * we are guaranteed to read the final stable state - */ - ia64_srlz_d(); - pmc0 = ia64_get_pmc(0); /* slow */ - - /* - * reset freeze bit, overflow status information destroyed - */ - pfm_unfreeze_pmu(); - } else { - pmc0 = ctx->th_pmcs[0]; - /* - * clear whatever overflow status bits there were - */ - ctx->th_pmcs[0] = 0; - } - ovfl_val = pmu_conf->ovfl_val; - /* - * we save all the used pmds - * we take care of overflows for counting PMDs - * - * XXX: sampling situation is not taken into account here - */ - mask2 = ctx->ctx_used_pmds[0]; - - DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2)); - - for (i = 0; mask2; i++, mask2>>=1) { - - /* skip non used pmds */ - if ((mask2 & 0x1) == 0) continue; - - /* - * can access PMU always true in system wide mode - */ - val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : ctx->th_pmds[i]; - - if (PMD_IS_COUNTING(i)) { - DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", - task->pid, - i, - ctx->ctx_pmds[i].val, - val & ovfl_val)); - - /* - * we rebuild the full 64 bit value of the counter - */ - val = ctx->ctx_pmds[i].val + (val & ovfl_val); - - /* - * now everything is in ctx_pmds[] and we need - * to clear the saved context from save_regs() such that - * pfm_read_pmds() gets the correct value - */ - pmd_val = 0UL; - - /* - * take care of overflow inline - */ - if (pmc0 & (1UL << i)) { - val += 1 + ovfl_val; - DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); - } - } - - DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); - - if (is_self) ctx->th_pmds[i] = pmd_val; - - ctx->ctx_pmds[i].val = val; - } -} - -static struct irqaction perfmon_irqaction = { - .handler = pfm_interrupt_handler, - .flags = IRQF_DISABLED, - .name = "perfmon" -}; - -static void -pfm_alt_save_pmu_state(void *data) -{ - struct pt_regs *regs; - - regs = task_pt_regs(current); - - DPRINT(("called\n")); - - /* - * should not be necessary but - * let's take not risk - */ - pfm_clear_psr_up(); - pfm_clear_psr_pp(); - ia64_psr(regs)->pp = 0; - - /* - * This call is required - * May cause a spurious interrupt on some processors - */ - pfm_freeze_pmu(); - - ia64_srlz_d(); -} - -void -pfm_alt_restore_pmu_state(void *data) -{ - struct pt_regs *regs; - - regs = task_pt_regs(current); - - DPRINT(("called\n")); - - /* - * put PMU back in state expected - * by perfmon - */ - pfm_clear_psr_up(); - pfm_clear_psr_pp(); - ia64_psr(regs)->pp = 0; - - /* - * perfmon runs with PMU unfrozen at all times - */ - pfm_unfreeze_pmu(); - - ia64_srlz_d(); -} - -int -pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) -{ - int ret, i; - int reserve_cpu; - - /* some sanity checks */ - if (hdl == NULL || hdl->handler == NULL) return -EINVAL; - - /* do the easy test first */ - if (pfm_alt_intr_handler) return -EBUSY; - - /* one at a time in the install or remove, just fail the others */ - if (!spin_trylock(&pfm_alt_install_check)) { - return -EBUSY; - } - - /* reserve our session */ - 
for_each_online_cpu(reserve_cpu) { - ret = pfm_reserve_session(NULL, 1, reserve_cpu); - if (ret) goto cleanup_reserve; - } - - /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - goto cleanup_reserve; - } - - /* officially change to the alternate interrupt handler */ - pfm_alt_intr_handler = hdl; - - spin_unlock(&pfm_alt_install_check); - - return 0; - -cleanup_reserve: - for_each_online_cpu(i) { - /* don't unreserve more than we reserved */ - if (i >= reserve_cpu) break; - - pfm_unreserve_session(NULL, 1, i); - } - - spin_unlock(&pfm_alt_install_check); - - return ret; -} -EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt); - -int -pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) -{ - int i; - int ret; - - if (hdl == NULL) return -EINVAL; - - /* cannot remove someone else's handler! */ - if (pfm_alt_intr_handler != hdl) return -EINVAL; - - /* one at a time in the install or remove, just fail the others */ - if (!spin_trylock(&pfm_alt_install_check)) { - return -EBUSY; - } - - pfm_alt_intr_handler = NULL; - - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - } - - for_each_online_cpu(i) { - pfm_unreserve_session(NULL, 1, i); - } - - spin_unlock(&pfm_alt_install_check); - - return 0; -} -EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt); - -/* - * perfmon initialization routine, called from the initcall() table - */ -static int init_pfm_fs(void); - -static int __init -pfm_probe_pmu(void) -{ - pmu_config_t **p; - int family; - - family = local_cpu_data->family; - p = pmu_confs; - - while(*p) { - if ((*p)->probe) { - if ((*p)->probe() == 0) goto found; - } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) { - goto found; - } - p++; - } - return -1; -found: - pmu_conf = *p; - return 0; -} - -static const struct file_operations pfm_proc_fops = { - .open = pfm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -int __init -pfm_init(void) -{ - unsigned int n, n_counters, i; - - printk("perfmon: version %u.%u IRQ %u\n", - PFM_VERSION_MAJ, - PFM_VERSION_MIN, - IA64_PERFMON_VECTOR); - - if (pfm_probe_pmu()) { - printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", - local_cpu_data->family); - return -ENODEV; - } - - /* - * compute the number of implemented PMD/PMC from the - * description tables - */ - n = 0; - for (i=0; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63); - n++; - } - pmu_conf->num_pmcs = n; - - n = 0; n_counters = 0; - for (i=0; PMD_IS_LAST(i) == 0; i++) { - if (PMD_IS_IMPL(i) == 0) continue; - pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63); - n++; - if (PMD_IS_COUNTING(i)) n_counters++; - } - pmu_conf->num_pmds = n; - pmu_conf->num_counters = n_counters; - - /* - * sanity checks on the number of debug registers - */ - if (pmu_conf->use_rr_dbregs) { - if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs); - pmu_conf = NULL; - return -1; - } - if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs); - pmu_conf = NULL; - return -1; - } - } - - printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n", - pmu_conf->pmu_name, - pmu_conf->num_pmcs, - 
pmu_conf->num_pmds, - pmu_conf->num_counters, - ffz(pmu_conf->ovfl_val)); - - /* sanity check */ - if (pmu_conf->num_pmds >= PFM_NUM_PMD_REGS || pmu_conf->num_pmcs >= PFM_NUM_PMC_REGS) { - printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); - pmu_conf = NULL; - return -1; - } - - /* - * create /proc/perfmon (mostly for debugging purposes) - */ - perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); - if (perfmon_dir == NULL) { - printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n"); - pmu_conf = NULL; - return -1; - } - /* - * install customized file operations for /proc/perfmon entry - */ - perfmon_dir->proc_fops = &pfm_proc_fops; - - /* - * create /proc/sys/kernel/perfmon (for debugging purposes) - */ - pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root); - - /* - * initialize all our spinlocks - */ - spin_lock_init(&pfm_sessions.pfs_lock); - spin_lock_init(&pfm_buffer_fmt_lock); - - init_pfm_fs(); - - for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL; - - return 0; -} - -__initcall(pfm_init); - -/* - * this function is called before pfm_init() - */ -void -pfm_init_percpu (void) -{ - static int first_time=1; - /* - * make sure no measurement is active - * (may inherit programmed PMCs from EFI). - */ - pfm_clear_psr_pp(); - pfm_clear_psr_up(); - - /* - * we run with the PMU not frozen at all times - */ - pfm_unfreeze_pmu(); - - if (first_time) { - register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); - first_time=0; - } - - ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); - ia64_srlz_d(); -} - -/* - * used for debug purposes only - */ -void -dump_pmu_state(const char *from) -{ - struct task_struct *task; - struct pt_regs *regs; - pfm_context_t *ctx; - unsigned long psr, dcr, info, flags; - int i, this_cpu; - - local_irq_save(flags); - - this_cpu = smp_processor_id(); - regs = task_pt_regs(current); - info = PFM_CPUINFO_GET(); - dcr = ia64_getreg(_IA64_REG_CR_DCR); - - if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) { - local_irq_restore(flags); - return; - } - - printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", - this_cpu, - from, - current->pid, - regs->cr_iip, - current->comm); - - task = GET_PMU_OWNER(); - ctx = GET_PMU_CTX(); - - printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); - - psr = pfm_get_psr(); - - printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", - this_cpu, - ia64_get_pmc(0), - psr & IA64_PSR_PP ? 1 : 0, - psr & IA64_PSR_UP ? 1 : 0, - dcr & IA64_DCR_PP ? 1 : 0, - info, - ia64_psr(regs)->up, - ia64_psr(regs)->pp); - - ia64_psr(regs)->up = 0; - ia64_psr(regs)->pp = 0; - - for (i=1; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, ctx->th_pmcs[i]); - } - - for (i=1; PMD_IS_LAST(i) == 0; i++) { - if (PMD_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, ctx->th_pmds[i]); - } - - if (ctx) { - printk("->CPU%d ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n", - this_cpu, - ctx->ctx_state, - ctx->ctx_smpl_vaddr, - ctx->ctx_smpl_hdr, - ctx->ctx_msgq_head, - ctx->ctx_msgq_tail, - ctx->ctx_saved_psr_up); - } - local_irq_restore(flags); -} - -/* - * called from process.c:copy_thread(). task is new child. 
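
pfm_init() above derives num_pmcs/num_pmds by walking the description tables and setting bits in the impl_pmcs/impl_pmds arrays, and the counter width it prints is simply the index of the first zero bit of ovfl_val. The fragment below restates that bit arithmetic in isolation, using the same i>>6 / i&63 word split; it assumes a 64-bit target, as ia64 is.

    /* The impl-register bitmap arithmetic from pfm_init(), in isolation. */
    #include <stdio.h>

    #define MAX_REGS 256

    static unsigned long impl[MAX_REGS / 64];

    static void set_impl(unsigned int i) { impl[i >> 6] |= 1UL << (i & 63); }
    static int  is_impl(unsigned int i)  { return (impl[i >> 6] >> (i & 63)) & 0x1; }

    /* index of the first zero bit, i.e. the counter width when applied to an
     * all-ones overflow mask such as 2^47 - 1 (what ffz(ovfl_val) computes) */
    static unsigned int first_zero_bit(unsigned long x)
    {
        unsigned int n = 0;
        while (x & 1UL) { x >>= 1; n++; }
        return n;
    }

    int main(void)
    {
        set_impl(4);
        set_impl(70);
        printf("pmc4 impl=%d pmc5 impl=%d pmc70 impl=%d\n",
               is_impl(4), is_impl(5), is_impl(70));
        printf("counter width = %u bits\n", first_zero_bit((1UL << 47) - 1));
        return 0;
    }
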
- */ -void -pfm_inherit(struct task_struct *task, struct pt_regs *regs) -{ - struct thread_struct *thread; - - DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); - - thread = &task->thread; - - /* - * cut links inherited from parent (current) - */ - thread->pfm_context = NULL; - - PFM_SET_WORK_PENDING(task, 0); - - /* - * the psr bits are already set properly in copy_threads() - */ -} -#else /* !CONFIG_PERFMON */ -asmlinkage long -sys_perfmonctl (int fd, int cmd, void *arg, int count) -{ - return -ENOSYS; -} -#endif /* CONFIG_PERFMON */ Index: linux-2.6/arch/ia64/kernel/perfmon_default_smpl.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_default_smpl.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (C) 2002-2003 Hewlett-Packard Co - * Stephane Eranian - * - * This file implements the default sampling buffer format - * for the Linux/ia64 perfmon-2 subsystem. - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -MODULE_AUTHOR("Stephane Eranian "); -MODULE_DESCRIPTION("perfmon default sampling format"); -MODULE_LICENSE("GPL"); - -#define DEFAULT_DEBUG 1 - -#ifdef DEFAULT_DEBUG -#define DPRINT(a) \ - do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#define DPRINT_ovfl(a) \ - do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#else -#define DPRINT(a) -#define DPRINT_ovfl(a) -#endif - -static int -default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; - int ret = 0; - - if (data == NULL) { - DPRINT(("[%d] no argument passed\n", task->pid)); - return -EINVAL; - } - - DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); - - /* - * must hold at least the buffer header + one minimally sized entry - */ - if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; - - DPRINT(("buf_size=%lu\n", arg->buf_size)); - - return ret; -} - -static int -default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - /* - * size has been validated in default_validate - */ - *size = arg->buf_size; - - return 0; -} - -static int -default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; - hdr->hdr_buf_size = arg->buf_size; - hdr->hdr_cur_offs = sizeof(*hdr); - hdr->hdr_overflows = 0UL; - hdr->hdr_count = 0UL; - - DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", - task->pid, - buf, - hdr->hdr_buf_size, - sizeof(*hdr), - hdr->hdr_version, - hdr->hdr_cur_offs)); - - return 0; -} - -static int -default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_entry_t *ent; - void *cur, *last; - unsigned long *e, entry_size; - unsigned int npmds, i; - unsigned char ovfl_pmd; - unsigned char ovfl_notify; - - if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) { - DPRINT(("[%d] 
invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); - return -EINVAL; - } - - hdr = (pfm_default_smpl_hdr_t *)buf; - cur = buf+hdr->hdr_cur_offs; - last = buf+hdr->hdr_buf_size; - ovfl_pmd = arg->ovfl_pmd; - ovfl_notify = arg->ovfl_notify; - - /* - * precheck for sanity - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - npmds = hweight64(arg->smpl_pmds[0]); - - ent = (pfm_default_smpl_entry_t *)cur; - - prefetch(arg->smpl_pmds_values); - - entry_size = sizeof(*ent) + (npmds << 3); - - /* position for first pmd */ - e = (unsigned long *)(ent+1); - - hdr->hdr_count++; - - DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", - task->pid, - hdr->hdr_count, - cur, last, - last-cur, - ovfl_pmd, - ovfl_notify, npmds)); - - /* - * current = task running at the time of the overflow. - * - * per-task mode: - * - this is ususally the task being monitored. - * Under certain conditions, it might be a different task - * - * system-wide: - * - this is not necessarily the task controlling the session - */ - ent->pid = current->pid; - ent->ovfl_pmd = ovfl_pmd; - ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val; - - /* - * where did the fault happen (includes slot number) - */ - ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); - - ent->tstamp = stamp; - ent->cpu = smp_processor_id(); - ent->set = arg->active_set; - ent->tgid = current->tgid; - - /* - * selectively store PMDs in increasing index number - */ - if (npmds) { - unsigned long *val = arg->smpl_pmds_values; - for(i=0; i < npmds; i++) { - *e++ = *val++; - } - } - - /* - * update position for next entry - */ - hdr->hdr_cur_offs += entry_size; - cur += entry_size; - - /* - * post check to avoid losing the last sample - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - /* - * keep same ovfl_pmds, ovfl_notify - */ - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 0; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ - - return 0; -full: - DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); - - /* - * increment number of buffer overflow. - * important to detect duplicate set of samples. 
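
Each call to default_handler() above appends one pfm_default_smpl_entry_t followed by the recorded PMD values (npmds * 8 bytes) and advances hdr_cur_offs, so a consumer has to walk the buffer with the same stride. The sketch below does that from user space; the structure and field names come from the default-format user header, the include path is an assumption, and npmds must match the number of bits the tool set in smpl_pmds[0] when it programmed the counter (the buffer mapping itself is obtained at context creation and is not shown).

    /*
     * Hedged sketch: walking a default-format sampling buffer in user space.
     * The header path and exact field types are assumptions taken from the
     * old default-sampling user header.
     */
    #include <stdio.h>
    #include <perfmon/perfmon_default_smpl.h>

    void walk_buffer(void *buf, unsigned int npmds)
    {
        pfm_default_smpl_hdr_t *hdr = buf;
        unsigned char          *pos = (unsigned char *)(hdr + 1);
        unsigned long i, j;

        printf("%lu samples, %lu buffer overflows\n",
               (unsigned long)hdr->hdr_count,
               (unsigned long)hdr->hdr_overflows);

        for (i = 0; i < hdr->hdr_count; i++) {
            pfm_default_smpl_entry_t *ent  = (pfm_default_smpl_entry_t *)pos;
            unsigned long            *pmds = (unsigned long *)(ent + 1);

            printf("pid=%d cpu=%u ip=0x%lx",
                   (int)ent->pid, (unsigned)ent->cpu, (unsigned long)ent->ip);
            for (j = 0; j < npmds; j++)
                printf(" pmd[%lu]=0x%lx", j, pmds[j]);
            printf("\n");

            pos += sizeof(*ent) + (npmds << 3);   /* same stride as the handler */
        }
    }
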
- */ - hdr->hdr_overflows++; - - /* - * if no notification requested, then we saturate the buffer - */ - if (ovfl_notify == 0) { - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; - } else { - arg->ovfl_ctrl.bits.notify_user = 1; - arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ - } - return -1; /* we are full, sorry */ -} - -static int -default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - pfm_default_smpl_hdr_t *hdr; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_count = 0UL; - hdr->hdr_cur_offs = sizeof(*hdr); - - ctrl->bits.mask_monitoring = 0; - ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ - - return 0; -} - -static int -default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) -{ - DPRINT(("[%d] exit(%p)\n", task->pid, buf)); - return 0; -} - -static pfm_buffer_fmt_t default_fmt={ - .fmt_name = "default_format", - .fmt_uuid = PFM_DEFAULT_SMPL_UUID, - .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), - .fmt_validate = default_validate, - .fmt_getsize = default_get_size, - .fmt_init = default_init, - .fmt_handler = default_handler, - .fmt_restart = default_restart, - .fmt_restart_active = default_restart, - .fmt_exit = default_exit, -}; - -static int __init -pfm_default_smpl_init_module(void) -{ - int ret; - - ret = pfm_register_buffer_fmt(&default_fmt); - if (ret == 0) { - printk("perfmon_default_smpl: %s v%u.%u registered\n", - default_fmt.fmt_name, - PFM_DEFAULT_SMPL_VERSION_MAJ, - PFM_DEFAULT_SMPL_VERSION_MIN); - } else { - printk("perfmon_default_smpl: %s cannot register ret=%d\n", - default_fmt.fmt_name, - ret); - } - - return ret; -} - -static void __exit -pfm_default_smpl_cleanup_module(void) -{ - int ret; - ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); - - printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); -} - -module_init(pfm_default_smpl_init_module); -module_exit(pfm_default_smpl_cleanup_module); - Index: linux-2.6/arch/ia64/kernel/perfmon_generic.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_generic.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file contains the generic PMU register description tables - * and pmc checker used by perfmon.c. 
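
A context is bound to this format at creation time: the tool copies the format UUID into the pfarg_context_t and appends the format-specific pfm_default_smpl_arg_t, which is what default_validate() and default_get_size() above inspect, directly behind it. That appended argument is exactly the variable-size case sys_perfmonctl() handles with its getsize/restart_args pass. The field and header names below follow the old user headers and should be read as assumptions rather than a definitive API:

    /*
     * Hedged sketch: creating a context bound to the default sampling format.
     * Layout assumption: the pfm_default_smpl_arg_t immediately follows the
     * pfarg_context_t in the buffer handed to PFM_CREATE_CONTEXT.
     */
    #include <string.h>
    #include <perfmon/perfmon.h>
    #include <perfmon/perfmon_default_smpl.h>

    static struct {
        pfarg_context_t        ctx;
        pfm_default_smpl_arg_t smpl;
    } req;

    int create_sampling_context(void)
    {
        pfm_uuid_t uuid = PFM_DEFAULT_SMPL_UUID;

        memset(&req, 0, sizeof(req));
        memcpy(req.ctx.ctx_smpl_buf_id, uuid, sizeof(uuid));  /* pick the format */
        req.smpl.buf_size = 64 * 1024;                        /* sampling buffer */

        if (perfmonctl(0, PFM_CREATE_CONTEXT, &req, 1) < 0)
            return -1;
        return req.ctx.ctx_fd;        /* fd of the new context */
    }
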
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ - -static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_gen={ - .pmu_name = "Generic", - .pmu_family = 0xff, /* any */ - .ovfl_val = (1UL << 32) - 1, - .num_ibrs = 0, /* does not use */ - .num_dbrs = 0, /* does not use */ - .pmd_desc = pfm_gen_pmd_desc, - .pmc_desc = pfm_gen_pmc_desc -}; - Index: linux-2.6/arch/ia64/kernel/perfmon_itanium.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_itanium.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * This file contains the Itanium PMU register description tables - * and pmc checker used by perfmon.c. 
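In these register description tables the two trailing arrays record the dependencies between control and data registers: for a PMC, which PMDs it gates, and for a PMD, which PMC drives it. RDEP(n) is simply bit n of such a mask; in the old perfmon.c it is defined as (1UL << (n)), so in the generic table pmc4 pairs with pmd4, pmc5 with pmd5, and so on. A small sketch of building and reading one of these masks follows, with that macro definition taken as an assumption rather than quoted from a header.

#include <stdint.h>
#include <stdio.h>

#define RDEP(x) (1UL << (x))    /* assumed to match the old perfmon.c definition */

int main(void)
{
        /* e.g. a monitor PMC that gates the event-address PMDs 8..16 */
        uint64_t dep_pmd = RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|
                           RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16);
        unsigned int i;

        for (i = 0; i < 64; i++)
                if ((dep_pmd >> i) & 1)
                        printf("pmd%u is gated by this pmc\n", i);
        return 0;
}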
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, 
{RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static int -pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret; - int is_loaded; - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the (instruction) debug registers if pmc13.ta bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs); - if (ret) return ret; - } - - /* - * we must clear the (data) debug registers if pmc11.pt bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs); - if (ret) return ret; - } - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_ita={ - .pmu_name = "Itanium", - .pmu_family = 0x7, - .ovfl_val = (1UL << 32) - 1, - .pmd_desc = pfm_ita_pmd_desc, - .pmc_desc = pfm_ita_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1, /* debug register are use for range retrictions */ -}; - - Index: linux-2.6/arch/ia64/kernel/perfmon_mckinley.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_mckinley.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * This file contains the McKinley PMU register description tables - * and pmc checker used by perfmon.c. 
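The Itanium checker deleted above only inspects two single-bit fields: pmc13.ta in bit 0 and pmc11.pt in bit 28. When either bit is written as 0 on a loaded context that has not yet claimed the debug registers, the corresponding instruction or data break-point registers are cleared first so no stale state can leak into the range restrictions. A tiny stand-alone sketch of those two bit tests:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* on Itanium (Merced): pmc13.ta is bit 0, pmc11.pt is bit 28 */
static bool pmc13_requires_ibr_clear(uint64_t val) { return (val & 0x1) == 0; }
static bool pmc11_requires_dbr_clear(uint64_t val) { return ((val >> 28) & 0x1) == 0; }

int main(void)
{
        uint64_t pmc11 = 0x0000000010000000UL;  /* pt set: data break-points untouched */
        uint64_t pmc13 = 0x0003ffff00000000UL;  /* ta cleared: ibrs must be cleared */

        printf("clear ibrs=%d clear dbrs=%d\n",
               pmc13_requires_ibr_clear(pmc13), pmc11_requires_dbr_clear(pmc11));
        return 0;
}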
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 
0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0, check_case1 = 0; - unsigned long val8 = 0, val14 = 0, val13 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mck_reserved(cnum, val, regs); - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc13 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC13 is "active" if: - * one of the pmc13.cfg_dbrpXX field is different from 0x3 - * AND - * at the corresponding pmc13.ena_dbrpXX is set. 
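pfm_mck_reserved() above is the usual reserved-field merge: the caller's value survives only in the writable positions given by PMC_RSVD_MASK(), and every other bit is forced back to the power-up default from PMC_DFL_VAL(). A self-contained sketch of the same merge, with the mask and default passed in directly:

#include <stdint.h>
#include <stdio.h>

/* keep user bits where writable, restore the power-up default in reserved positions */
static uint64_t merge_reserved(uint64_t user_val, uint64_t writable_mask, uint64_t dfl_val)
{
        return (user_val & writable_mask) | (dfl_val & ~writable_mask);
}

int main(void)
{
        /* pmc15 in the McKinley table: default 0xfffffff0, only the low 4 bits writable */
        uint64_t val = merge_reserved(0x7, 0xfUL, 0x00000000fffffff0UL);

        printf("written value: 0x%llx\n", (unsigned long long)val);  /* 0xfffffff7 */
        return 0;
}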
- */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 13 && is_loaded - && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled - * before they are (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - - switch(cnum) { - case 4: *val |= 1UL << 23; /* force power enable bit */ - break; - case 8: val8 = *val; - val13 = ctx->ctx_pmcs[13]; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 13: val8 = ctx->ctx_pmcs[8]; - val13 = *val; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 14: val8 = ctx->ctx_pmcs[8]; - val13 = ctx->ctx_pmcs[13]; - val14 = *val; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = ((val13 >> 45) & 0xf) == 0 - && ((val8 & 0x1) == 0) - && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) - ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); - - if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); - } - - return ret ? -EINVAL : 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_mck={ - .pmu_name = "Itanium 2", - .pmu_family = 0x1f, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mck_pmd_desc, - .pmc_desc = pfm_mck_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range restrictions */ -}; - - Index: linux-2.6/arch/ia64/kernel/perfmon_montecito.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_montecito.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * This file contains the Montecito PMU register description tables - * and pmc checker used by perfmon.c. - * - * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
- * Contributed by Stephane Eranian - */ -static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\ - RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63)) -#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36)) -#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35)) - -static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}}, -/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}}, -/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}}, -/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}}, -/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}}, -/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}}, -/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}}, -/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}}, -/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}}, -/* pmc16 */ { PFM_REG_NOTIMPL, }, -/* pmc17 */ { PFM_REG_NOTIMPL, }, -/* pmc18 */ { PFM_REG_NOTIMPL, }, -/* pmc19 */ { PFM_REG_NOTIMPL, }, -/* pmc20 */ { PFM_REG_NOTIMPL, }, -/* pmc21 */ { PFM_REG_NOTIMPL, }, -/* pmc22 */ { PFM_REG_NOTIMPL, }, -/* pmc23 */ { PFM_REG_NOTIMPL, }, -/* pmc24 */ { PFM_REG_NOTIMPL, }, -/* pmc25 */ { PFM_REG_NOTIMPL, }, -/* pmc26 */ { PFM_REG_NOTIMPL, }, -/* pmc27 */ { PFM_REG_NOTIMPL, }, -/* pmc28 */ { PFM_REG_NOTIMPL, }, -/* pmc29 */ { PFM_REG_NOTIMPL, }, -/* pmc30 */ { PFM_REG_NOTIMPL, }, -/* pmc31 */ { PFM_REG_NOTIMPL, }, -/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}}, -/* pmc38 */ { 
PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, -/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}}, -/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL, }, -/* pmd1 */ { PFM_REG_NOTIMPL, }, -/* pmd2 */ { PFM_REG_NOTIMPL, }, -/* pmd3 */ { PFM_REG_NOTIMPL, }, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}}, -/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, -/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}}, -/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}}, -/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}}, -/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}}, -/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}}, -/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}}, -/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}}, -/* pmd16 */ { PFM_REG_NOTIMPL, }, -/* pmd17 */ { PFM_REG_NOTIMPL, }, -/* pmd18 */ { PFM_REG_NOTIMPL, }, -/* pmd19 */ { PFM_REG_NOTIMPL, }, -/* pmd20 */ { PFM_REG_NOTIMPL, }, -/* pmd21 */ { PFM_REG_NOTIMPL, }, -/* pmd22 */ { PFM_REG_NOTIMPL, }, -/* pmd23 */ { PFM_REG_NOTIMPL, }, -/* pmd24 */ { PFM_REG_NOTIMPL, }, -/* pmd25 */ { PFM_REG_NOTIMPL, }, -/* pmd26 */ { PFM_REG_NOTIMPL, }, -/* pmd27 */ { PFM_REG_NOTIMPL, }, -/* pmd28 */ { PFM_REG_NOTIMPL, }, -/* pmd29 */ { PFM_REG_NOTIMPL, }, -/* pmd30 */ { PFM_REG_NOTIMPL, }, -/* pmd31 */ { PFM_REG_NOTIMPL, }, -/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd37 */ { PFM_REG_NOTIMPL, }, -/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd40 */ { PFM_REG_NOTIMPL, }, -/* pmd41 */ { PFM_REG_NOTIMPL, }, -/* pmd42 */ { PFM_REG_NOTIMPL, }, -/* pmd43 */ { PFM_REG_NOTIMPL, }, -/* pmd44 */ { PFM_REG_NOTIMPL, }, -/* pmd45 */ { PFM_REG_NOTIMPL, }, -/* pmd46 */ { PFM_REG_NOTIMPL, }, -/* pmd47 */ { PFM_REG_NOTIMPL, }, -/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, 
{RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0; - unsigned long val32 = 0, val38 = 0, val41 = 0; - unsigned long tmpval; - int check_case1 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mont_reserved(cnum, val, regs); - - tmpval = *val; - - /* sanity check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc41 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC41 is "active" if: - * one of the pmc41.cfg_dtagXX field is different from 0x3 - * AND - * at the corresponding pmc41.en_dbrpXX is set. 
- * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 41 && is_loaded - && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers if: - * AND - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if: - * pmc38.ig_ibrpX is 0 (enabled) - * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - switch(cnum) { - case 32: val32 = *val; - val38 = ctx->ctx_pmcs[38]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 38: val38 = *val; - val32 = ctx->ctx_pmcs[32]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 41: val41 = *val; - val32 = ctx->ctx_pmcs[32]; - val38 = ctx->ctx_pmcs[38]; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) - && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) - || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); - if (ret) { - DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32)); - return -EINVAL; - } - } - *val = tmpval; - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
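The instruction-side test above for Montecito looks at pmc38: the ig_ibrpX bits sit at positions 1, 4, 7 and 10 (mask 0x492), and an ig bit written as 0 means the corresponding instruction range constraint is enabled, so the instruction break-point registers must be cleared first unless the context already owns them. A small sketch of that test:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* pmc38.ig_ibrpX bits at positions 1, 4, 7, 10; all set means ibr constraints ignored */
#define MONT_IG_IBRP_MASK 0x492UL

static bool pmc38_requires_ibr_clear(uint64_t pmc38)
{
        return (pmc38 & MONT_IG_IBRP_MASK) != MONT_IG_IBRP_MASK;
}

int main(void)
{
        printf("%d\n", pmc38_requires_ibr_clear(0xdb6));  /* table default: no clear needed */
        printf("%d\n", pmc38_requires_ibr_clear(0xdb4));  /* ig_ibrp0 cleared: clear ibrs */
        return 0;
}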
- */ -static pmu_config_t pmu_conf_mont={ - .pmu_name = "Montecito", - .pmu_family = 0x20, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mont_pmd_desc, - .pmc_desc = pfm_mont_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range retrictions */ -}; Index: linux-2.6/arch/ia64/kernel/process.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/process.c +++ linux-2.6/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -44,10 +45,6 @@ #include "entry.h" -#ifdef CONFIG_PERFMON -# include -#endif - #include "sigframe.h" void (*ia64_mark_idle)(int); @@ -164,32 +161,23 @@ do_notify_resume_user (sigset_t *unused, return; } -#ifdef CONFIG_PERFMON - if (current->thread.pfm_needs_checking) - pfm_handle_work(); -#endif + if (test_thread_flag(TIF_PERFMON_WORK)) + pfm_handle_work(task_pt_regs(current)); /* deal with pending signal delivery */ if (test_thread_flag(TIF_SIGPENDING)||test_thread_flag(TIF_RESTORE_SIGMASK)) ia64_do_signal(scr, in_syscall); } -static int pal_halt = 1; static int can_do_pal_halt = 1; static int __init nohalt_setup(char * str) { - pal_halt = can_do_pal_halt = 0; + can_do_pal_halt = 0; return 1; } __setup("nohalt", nohalt_setup); -void -update_pal_halt_status(int status) -{ - can_do_pal_halt = pal_halt && status; -} - /* * We use this if we don't have any better idle routine.. */ @@ -198,10 +186,34 @@ default_idle (void) { local_irq_enable(); while (!need_resched()) { +#ifdef CONFIG_PERFMON + u64 psr = 0; + /* + * If requested, we stop the PMU to avoid + * measuring across the core idle loop. + * + * dcr.pp is not modified on purpose + * it is used when coming out of + * safe_halt() via interrupt + */ + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + psr = ia64_getreg(_IA64_REG_PSR); + if (psr & IA64_PSR_PP) + ia64_rsm(IA64_PSR_PP); + } +#endif + if (can_do_pal_halt) safe_halt(); else cpu_relax(); +#ifdef CONFIG_PERFMON + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + if (psr & IA64_PSR_PP) + ia64_ssm(IA64_PSR_PP); + } +#endif + } } @@ -315,22 +327,9 @@ cpu_idle (void) void ia64_save_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_save_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_save_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 0); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_save_state(task); @@ -340,22 +339,9 @@ ia64_save_extra (struct task_struct *tas void ia64_load_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_load_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_load_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 1); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_load_state(task); @@ -481,8 +467,7 @@ copy_thread (int nr, unsigned long clone * call behavior where scratch registers are preserved across * system calls (unless used by the system call itself). 
*/ -# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ - | IA64_THREAD_PM_VALID) +# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID) # define THREAD_FLAGS_TO_SET 0 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | THREAD_FLAGS_TO_SET); @@ -503,10 +488,8 @@ copy_thread (int nr, unsigned long clone } #endif -#ifdef CONFIG_PERFMON - if (current->thread.pfm_context) - pfm_inherit(p, child_ptregs); -#endif + pfm_copy_thread(p); + return retval; } @@ -743,15 +726,13 @@ exit_thread (void) { ia64_drop_fpu(current); -#ifdef CONFIG_PERFMON - /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); - - /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); -#endif + + /* if needed, stop monitoring and flush state to perfmon context */ + pfm_exit_thread(current); + + /* free debug register resources */ + pfm_release_dbregs(current); + if (IS_IA32_PROCESS(task_pt_regs(current))) ia32_drop_partial_page_list(current); } Index: linux-2.6/arch/ia64/kernel/ptrace.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/ptrace.c +++ linux-2.6/arch/ia64/kernel/ptrace.c @@ -25,9 +25,6 @@ #include #include #include -#ifdef CONFIG_PERFMON -#include -#endif #include "entry.h" Index: linux-2.6/arch/ia64/kernel/setup.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/setup.c +++ linux-2.6/arch/ia64/kernel/setup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -988,6 +989,8 @@ cpu_init (void) } platform_cpu_init(); pm_idle = default_idle; + + pfm_init_percpu(); } /* Index: linux-2.6/arch/ia64/kernel/smpboot.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/smpboot.c +++ linux-2.6/arch/ia64/kernel/smpboot.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -378,10 +379,6 @@ smp_callin (void) extern void ia64_init_itm(void); extern volatile int time_keeper_id; -#ifdef CONFIG_PERFMON - extern void pfm_init_percpu(void); -#endif - cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); itc_master = time_keeper_id; @@ -403,10 +400,6 @@ smp_callin (void) ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif - local_irq_enable(); if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { @@ -738,6 +731,7 @@ int __cpu_disable(void) fixup_irqs(); local_flush_tlb_all(); cpu_clear(cpu, cpu_callin_map); + pfm_cpu_disable(); return 0; } Index: linux-2.6/arch/ia64/kernel/sys_ia64.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/sys_ia64.c +++ linux-2.6/arch/ia64/kernel/sys_ia64.c @@ -284,3 +284,10 @@ sys_pciconfig_write (unsigned long bus, } #endif /* CONFIG_PCI */ + +#ifndef CONFIG_PERFMON +asmlinkage long sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + return -ENOSYS; +} +#endif Index: linux-2.6/arch/ia64/kernel/time.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/time.c +++ linux-2.6/arch/ia64/kernel/time.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,8 @@ timer_interrupt (int irq, void *dev_id) profile_tick(CPU_PROFILING); + pfm_handle_switch_timeout(); 
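Note how the CONFIG_PERFMON conditionals disappear from exit_thread() and copy_thread() above: pfm_exit_thread(), pfm_release_dbregs() and pfm_copy_thread() are now called unconditionally, which presumes the perfmon header supplies empty inline stubs when the option is disabled. Below is a sketch of that conventional header pattern; the stub names follow the calls in the patch, but their exact placement in the headers is an assumption here.

/* header-style sketch: the hooks compile away when CONFIG_PERFMON is not set */
struct task_struct;                     /* opaque for this sketch */

#ifdef CONFIG_PERFMON
void pfm_exit_thread(struct task_struct *task);
void pfm_copy_thread(struct task_struct *task);
void pfm_release_dbregs(struct task_struct *task);
#else
static inline void pfm_exit_thread(struct task_struct *task)     { (void)task; }
static inline void pfm_copy_thread(struct task_struct *task)     { (void)task; }
static inline void pfm_release_dbregs(struct task_struct *task)  { (void)task; }
#endif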
+ while (1) { update_process_times(user_mode(get_irq_regs())); Index: linux-2.6/arch/ia64/lib/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/lib/Makefile +++ linux-2.6/arch/ia64/lib/Makefile @@ -13,7 +13,6 @@ lib-y := __divsi3.o __udivsi3.o __modsi3 lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o -lib-$(CONFIG_PERFMON) += carta_random.o AFLAGS___divdi3.o = AFLAGS___udivdi3.o = -DUNSIGNED Index: linux-2.6/arch/ia64/oprofile/init.c =================================================================== --- linux-2.6.orig/arch/ia64/oprofile/init.c +++ linux-2.6/arch/ia64/oprofile/init.c @@ -12,8 +12,8 @@ #include #include -extern int perfmon_init(struct oprofile_operations * ops); -extern void perfmon_exit(void); +extern int op_perfmon_init(struct oprofile_operations * ops); +extern void op_perfmon_exit(void); extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth); int __init oprofile_arch_init(struct oprofile_operations * ops) @@ -22,7 +22,7 @@ int __init oprofile_arch_init(struct opr #ifdef CONFIG_PERFMON /* perfmon_init() can fail, but we have no way to report it */ - ret = perfmon_init(ops); + ret = op_perfmon_init(ops); #endif ops->backtrace = ia64_backtrace; @@ -33,6 +33,6 @@ int __init oprofile_arch_init(struct opr void oprofile_arch_exit(void) { #ifdef CONFIG_PERFMON - perfmon_exit(); + op_perfmon_exit(); #endif } Index: linux-2.6/arch/ia64/oprofile/perfmon.c =================================================================== --- linux-2.6.orig/arch/ia64/oprofile/perfmon.c +++ linux-2.6/arch/ia64/oprofile/perfmon.c @@ -10,19 +10,21 @@ #include #include #include -#include +#include +#include #include #include static int allow_ints; static int -perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, - struct pt_regs *regs, unsigned long stamp) +perfmon_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data) { + struct pt_regs *regs = data; int event = arg->pmd_eventid; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; /* the owner of the oprofile event buffer may have exited * without perfmon being shutdown (e.g. 
SIGSEGV) @@ -45,17 +47,13 @@ static void perfmon_stop(void) allow_ints = 0; } - -#define OPROFILE_FMT_UUID { \ - 0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c } - -static pfm_buffer_fmt_t oprofile_fmt = { - .fmt_name = "oprofile_format", - .fmt_uuid = OPROFILE_FMT_UUID, - .fmt_handler = perfmon_handler, +static struct pfm_smpl_fmt oprofile_fmt = { + .fmt_name = "OProfile", + .fmt_handler = perfmon_handler, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE }; - static char * get_cpu_type(void) { __u8 family = local_cpu_data->family; @@ -75,25 +73,26 @@ static char * get_cpu_type(void) static int using_perfmon; -int perfmon_init(struct oprofile_operations * ops) +int __init op_perfmon_init(struct oprofile_operations * ops) { - int ret = pfm_register_buffer_fmt(&oprofile_fmt); + int ret = pfm_fmt_register(&oprofile_fmt); if (ret) return -ENODEV; ops->cpu_type = get_cpu_type(); ops->start = perfmon_start; ops->stop = perfmon_stop; + ops->implementation = "perfmon2"; using_perfmon = 1; printk(KERN_INFO "oprofile: using perfmon.\n"); return 0; } -void perfmon_exit(void) +void __exit op_perfmon_exit(void) { if (!using_perfmon) return; - pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid); + pfm_fmt_unregister(&oprofile_fmt); } Index: linux-2.6/arch/ia64/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/Kconfig @@ -0,0 +1,58 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config IA64_PERFMON_COMPAT + bool "Enable old perfmon-2 compatbility mode" + default n + depends PERFMON + help + Enable this option to allow performance tools which used the old + perfmon-2 interface to continue to work. Old tools are those using + the obsolete commands and arguments. Check your programs and look + in include/asm-ia64/perfmon_compat.h for more information. + +config IA64_PERFMON_GENERIC + tristate "Generic IA-64 PMU support" + depends PERFMON + default n + help + Enables generic IA-64 PMU support. + The generic PMU is defined by the IA-64 architecture document. + This option should only be necessary when running with a PMU that + is not yet explicitely supported. Even then, there is no guarantee + that this support will work. + +config IA64_PERFMON_ITANIUM + tristate "Itanium (Merced) Performance Monitoring support" + depends PERFMON + default n + help + Enables Itanium (Merced) PMU support. + +config IA64_PERFMON_MCKINLEY + tristate "Itanium 2 (McKinley) Performance Monitoring support" + depends PERFMON + default n + help + Enables Itanium 2 (McKinley, Madison, Deerfield) PMU support. + +config IA64_PERFMON_MONTECITO + tristate "Itanium 2 9000 (Montecito) Performance Monitoring support" + depends PERFMON + default n + help + Enables support for Itanium 2 9000 (Montecito) PMU. +endmenu Index: linux-2.6/arch/ia64/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_IA64_PERFMON_COMPAT) += perfmon_default_smpl.o \ + perfmon_compat.o +obj-$(CONFIG_IA64_PERFMON_GENERIC) += perfmon_generic.o +obj-$(CONFIG_IA64_PERFMON_ITANIUM) += perfmon_itanium.o +obj-$(CONFIG_IA64_PERFMON_MCKINLEY) += perfmon_mckinley.o +obj-$(CONFIG_IA64_PERFMON_MONTECITO) += perfmon_montecito.o Index: linux-2.6/arch/ia64/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon.c @@ -0,0 +1,951 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +struct pfm_arch_session { + u32 pfs_sys_use_dbr; /* syswide session uses dbr */ + u32 pfs_ptrace_use_dbr; /* a thread uses dbr via ptrace()*/ +}; + +static struct pfm_arch_session pfm_arch_sessions; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_arch_sessions_lock); + +static inline void pfm_clear_psr_pp(void) +{ + ia64_rsm(IA64_PSR_PP); +} + +static inline void pfm_set_psr_pp(void) +{ + ia64_ssm(IA64_PSR_PP); +} + +static inline void pfm_clear_psr_up(void) +{ + ia64_rsm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_up(void) +{ + ia64_ssm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_l(u64 val) +{ + ia64_setreg(_IA64_REG_PSR_L, val); +} + +static inline void pfm_restore_ibrs(u64 *ibrs, unsigned int nibrs) +{ + unsigned int i; + + for (i = 0; i < nibrs; i++) { + ia64_set_ibr(i, ibrs[i]); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); +} + +static inline void pfm_restore_dbrs(u64 *dbrs, unsigned int ndbrs) +{ + unsigned int i; + + for (i = 0; i < ndbrs; i++) { + ia64_set_dbr(i, dbrs[i]); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); +} + +irqreturn_t pmu_interrupt_handler(int irq, void *arg) +{ + struct pt_regs *regs; + regs = get_irq_regs(); + irq_enter(); + pfm_interrupt_handler(instruction_pointer(regs), regs); + irq_exit(); + return IRQ_HANDLED; +} +static struct irqaction perfmon_irqaction = { + .handler = pmu_interrupt_handler, + .flags = IRQF_DISABLED, /* means keep interrupts masked */ + .name = "perfmon" +}; + +void pfm_arch_quiesce_pmu_percpu(void) +{ + u64 dcr; + /* + * make sure no measurement is active + * (may inherit programmed PMCs from EFI). 
+ */ + pfm_clear_psr_pp(); + pfm_clear_psr_up(); + + /* + * ensure dcr.pp is cleared + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + /* + * we run with the PMU not frozen at all times + */ + ia64_set_pmc(0, 0); + ia64_srlz_d(); +} + +void pfm_arch_init_percpu(void) +{ + pfm_arch_quiesce_pmu_percpu(); + /* + * program PMU interrupt vector + */ + ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); + ia64_srlz_d(); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->flags.use_dbr = 0; + ctx_arch->flags.insecure = (ctx_flags & PFM_ITA_FL_INSECURE) ? 1: 0; + + PFM_DBG("insecure=%d", ctx_arch->flags.insecure); + + return 0; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 psr, tmp; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * save current PSR: needed because we modify it + */ + ia64_srlz_d(); + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * stop monitoring: + * This is the last instruction which may generate an overflow + * + * we do not clear ipsr.up + */ + pfm_clear_psr_up(); + ia64_srlz_d(); + + /* + * extract overflow status bits + */ + tmp = ia64_get_pmc(0) & ~0xf; + + /* + * keep a copy of psr.up (for reload) + */ + ctx_arch->ctx_saved_psr_up = psr & IA64_PSR_UP; + + /* + * save overflow status bits + */ + set->povfl_pmds[0] = tmp; + + /* + * record how many pending overflows + * XXX: assume identity mapping for counters + */ + set->npend_ovfls = ia64_popcnt(tmp); + + /* + * make sure the PMU is unfrozen for the next task + */ + if (set->npend_ovfls) { + ia64_set_pmc(0, 0); + ia64_srlz_d(); + } + return 1; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * Caller has already restored all PMD and PMC registers. 
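On switch-out the code above first stops counting through psr.up, then reads PMC0, masks off the four low non-counter bits, records the result as the pending-overflow bitmask for the set and counts the set bits with ia64_popcnt(). A portable sketch of that bookkeeping step:

#include <stdint.h>
#include <stdio.h>

struct saved_set {
        uint64_t povfl_pmds;            /* counters that overflowed while running */
        unsigned int npend_ovfls;       /* how many of them */
};

/* pmc0 carries one overflow bit per counter; bits 0-3 are not counter bits */
static void record_pending_overflows(struct saved_set *set, uint64_t pmc0)
{
        uint64_t ovfl = pmc0 & ~0xfUL;

        set->povfl_pmds = ovfl;
        set->npend_ovfls = __builtin_popcountll(ovfl);  /* ia64_popcnt() equivalent */
}

int main(void)
{
        struct saved_set set;

        record_pending_overflows(&set, (1UL << 4) | (1UL << 7));  /* pmd4 and pmd7 overflowed */
        printf("pending=%u mask=0x%llx\n", set.npend_ovfls,
               (unsigned long long)set.povfl_pmds);
        return 0;
}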
+ * + * must reactivate monitoring + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * when monitoring is not explicitly started + * then psr_up = 0, in which case we do not + * need to restore + */ + if (likely(ctx_arch->ctx_saved_psr_up)) { + pfm_set_psr_up(); + ia64_srlz_d(); + } +} + +int pfm_arch_reserve_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) { + PFM_DBG("syswide context uses dbregs"); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + } + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +void pfm_arch_release_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) { + pfm_arch_sessions.pfs_sys_use_dbr--; + } + spin_unlock(&pfm_arch_sessions_lock); +} + +/* + * function called from pfm_load_context_*(). Task is not guaranteed to be + * current task. If not then other task is guaranteed stopped and off any CPU. + * context is locked and interrupts are masked. + * + * On PFM_LOAD_CONTEXT, the interface guarantees monitoring is stopped. + * + * For system-wide task is NULL + */ +int pfm_arch_load_context(struct pfm_context *ctx, struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * cannot load a context which is using range restrictions, + * into a thread that is being debugged. + * + * if one set out of several is using the debug registers, then + * we assume the context as whole is using them. + */ + if (ctx_arch->flags.use_dbr) { + if (ctx->flags.system) { + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + PFM_DBG("pfs_sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); + } + spin_unlock(&pfm_arch_sessions_lock); + + } else if (task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("load_pid [%d] thread is debugged, cannot " + "use range restrictions", task->pid); + ret = -EBUSY; + } + if (ret) + return ret; + } + + /* + * We need to intervene on context switch to toggle the + * psr.pp bit in system-wide. 
As such, we set the TIF + * flag so that pfm_arch_ctxswout_sys() and the + * pfm_arch_ctxswin_sys() functions get called + * from pfm_ctxsw_sys(); + */ + if (ctx->flags.system) { + set_thread_flag(TIF_PERFMON_CTXSW); + PFM_DBG("[%d] set TIF", current->pid); + return 0; + } + + regs = task_pt_regs(task); + + /* + * self-monitoring systematically allows user level control + */ + if (task != current) { + /* + * when not current, task is stopped, so this is safe + */ + ctx_arch->ctx_saved_psr_up = 0; + ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; + } else + ctx_arch->flags.insecure = 1; + + /* + * allow user level control (start/stop/read pmd) if: + * - self-monitoring + * - requested at context creation (PFM_IA64_FL_INSECURE) + * + * There is not security hole with PFM_IA64_FL_INSECURE because + * when not self-monitored, the caller must have permissions to + * attached to the task. + */ + if (ctx_arch->flags.insecure) { + ia64_psr(regs)->sp = 0; + PFM_DBG("clearing psr.sp for [%d]", task->pid); + } + return 0; +} + +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) +#define PFM_ITA_SETFL_BOTH_INTR (PFM_ITA_SETFL_INTR_ONLY|\ + PFM_ITA_SETFL_EXCL_INTR) + +/* exclude return value field */ +#define PFM_SETFL_ALL_MASK ( PFM_ITA_SETFL_BOTH_INTR \ + | PFM_SETFL_BOTH_SWITCH \ + | PFM_ITA_SETFL_IDLE_EXCL) + + if ((flags & ~PFM_SETFL_ALL_MASK)) { + PFM_DBG("invalid flags=0x%x", flags); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_BOTH_INTR) == PFM_ITA_SETFL_BOTH_INTR) { + PFM_DBG("both excl intr and ontr only are set"); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_IDLE_EXCL) && !ctx->flags.system) { + PFM_DBG("idle exclude flag only for system-wide context"); + return -EINVAL; + } + return 0; +} + +/* + * function called from pfm_unload_context_*(). Context is locked. + * interrupts are masked. task is not guaranteed to be current task. + * Access to PMU is not guaranteed. + * + * function must do whatever arch-specific action is required on unload + * of a context. + * + * called for both system-wide and per-thread. task is NULL for ssytem-wide + */ +int pfm_arch_unload_context(struct pfm_context *ctx, struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx->flags.system) { + /* + * disable context switch hook + */ + clear_thread_flag(TIF_PERFMON_CTXSW); + + if (ctx_arch->flags.use_dbr) { + spin_lock(&pfm_arch_sessions_lock); + pfm_arch_sessions.pfs_sys_use_dbr--; + PFM_DBG("sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); + spin_unlock(&pfm_arch_sessions_lock); + } + return 0; + } + + regs = task_pt_regs(task); + + /* + * cancel user level control for per-task context + */ + ia64_psr(regs)->sp = 1; + PFM_DBG("setting psr.sp for [%d]", task->pid); + return 0; +} + +/* + * mask monitoring by setting the privilege level to 0 + * we cannot use psr.pp/psr.up for this, it is controlled by + * the user + */ +void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + unsigned long mask; + unsigned int i; + + /* + * as an optimization we look at the first 64 PMC + * registers only starting at PMC4. 
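pfm_arch_setfl_sane() above accepts only the known flag bits, refuses the combination of interrupt-only and exclude-interrupt for the same event set, and restricts the idle-exclude flag to system-wide contexts. A stand-alone sketch of that style of validation; the bit values below are stand-ins, the real PFM_SETFL_* and PFM_ITA_SETFL_* constants come from the perfmon headers.

#include <stdio.h>

/* stand-in bit values; the real constants come from the perfmon headers */
#define SETFL_OVFL_SWITCH   0x01
#define SETFL_TIME_SWITCH   0x02
#define SETFL_INTR_ONLY     0x04
#define SETFL_EXCL_INTR     0x08
#define SETFL_IDLE_EXCL     0x10

#define SETFL_BOTH_INTR     (SETFL_INTR_ONLY | SETFL_EXCL_INTR)
#define SETFL_ALL_MASK      (SETFL_BOTH_INTR | SETFL_OVFL_SWITCH | \
                             SETFL_TIME_SWITCH | SETFL_IDLE_EXCL)

static int setfl_sane(unsigned int flags, int system_wide)
{
        if (flags & ~SETFL_ALL_MASK)
                return -1;              /* unknown bits */
        if ((flags & SETFL_BOTH_INTR) == SETFL_BOTH_INTR)
                return -1;              /* intr-only and excl-intr conflict */
        if ((flags & SETFL_IDLE_EXCL) && !system_wide)
                return -1;              /* idle-exclude is system-wide only */
        return 0;
}

int main(void)
{
        printf("%d\n", setfl_sane(SETFL_INTR_ONLY | SETFL_EXCL_INTR, 1));  /* -1 */
        printf("%d\n", setfl_sane(SETFL_TIME_SWITCH, 0));                  /*  0 */
        return 0;
}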
+ */ + mask = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + for(i = PFM_ITA_FCNTR; mask; i++, mask >>=1) { + if (likely(mask & 0x1)) + ia64_set_pmc(i, set->pmcs[i] & ~0xfUL); + } + /* + * make changes visisble + */ + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + unsigned long *mask; + u16 i, num; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.insecure) { + num = pfm_pmu_conf->regs.num_rw_pmd; + mask = pfm_pmu_conf->regs.rw_pmds; + } else { + num = set->nused_pmds; + mask = set->used_pmds; + } + PFM_DBG("num=%u mask=0x%lx", num, mask[0]); + /* + * must restore all implemented read-write PMDS to avoid leaking + * information especially when PFM_IA64_FL_INSECURE is set. + * + * XXX: should check PFM_IA64_FL_INSECURE==0 and use used_pmd instead + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, mask))) { + pfm_arch_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set if needed + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 mask2 = 0, val, plm; + unsigned long impl_mask, mask_pmcs; + unsigned int i; + + /* + * as an optimization we only look at the first 64 + * PMC registers. In fact, we should never scan the + * entire impl_pmcs because ibr/dbr are implemented + * separately. + * + * always skip PMC0-PMC3. PMC0 taken care of when saving + * state. PMC1-PMC3 not used until we get counters in + * the 60 and above index range. + */ + impl_mask = pfm_pmu_conf->regs.pmcs[0] >> PFM_ITA_FCNTR; + mask_pmcs = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + plm = ctx->state == PFM_CTX_MASKED ? ~0xf : ~0x0; + + for (i = PFM_ITA_FCNTR; + impl_mask; + i++, impl_mask >>=1, mask_pmcs >>=1) { + if (likely(impl_mask & 0x1)) { + mask2 = mask_pmcs & 0x1 ? plm : ~0; + val = set->pmcs[i] & mask2; + ia64_set_pmc(i, val); + PFM_DBG_ovfl("pmc%u=0x%lx", i, val); + } + } + /* + * restore DBR/IBR + */ + if (set->priv_flags & PFM_ITA_SETFL_USE_DBR) { + pfm_restore_ibrs(set->pmcs+256, 8); + pfm_restore_dbrs(set->pmcs+264, 8); + } + ia64_srlz_d(); +} + +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 psr; + int is_system; + + is_system = ctx->flags.system; + + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * monitoring is masked via the PMC.plm + * + * As we restore their value, we do not want each counter to + * restart right away. We stop monitoring using the PSR, + * restore the PMC (and PMD) and then re-establish the psr + * as it was. Note that there can be no pending overflow at + * this point, because monitoring is still MASKED. + * + * Because interrupts are masked we can avoid changing + * DCR.pp. 
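While the context sits in the MASKED state, pfm_arch_restore_pmcs() above strips the low four privilege-level (plm) bits from every PMC that arch_info->mask_pmcs marks as maskable, so the registers are reloaded without letting the counters run until monitoring is unmasked again. A sketch of the value that actually gets written back:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* value written to a PMC on restore: plm bits (0-3) are cleared while masked */
static uint64_t pmc_restore_value(uint64_t saved, bool ctx_masked, bool maskable)
{
        uint64_t plm_mask = ctx_masked ? ~0xfUL : ~0x0UL;
        uint64_t mask = maskable ? plm_mask : ~0x0UL;

        return saved & mask;
}

int main(void)
{
        uint64_t pmc4 = 0x2000000UL | 0x5UL;   /* event selection plus plm bits in the low nibble */

        printf("running: 0x%llx  masked: 0x%llx\n",
               (unsigned long long)pmc_restore_value(pmc4, false, true),
               (unsigned long long)pmc_restore_value(pmc4, true, true));
        return 0;
}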
+ */ + if (is_system) + pfm_clear_psr_pp(); + else + pfm_clear_psr_up(); + + ia64_srlz_d(); + + pfm_arch_restore_pmcs(ctx, set); + + /* + * restore psr + * + * monitoring may start right now but interrupts + * are still masked + */ + pfm_set_psr_l(psr); + ia64_srlz_d(); +} + +/* + * Called from pfm_stop() + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, psr; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + + if (!ctx->flags.system) { + /* + * in ZOMBIE state we always have task == current due to + * pfm_exit_thread() + */ + ia64_psr(regs)->up = 0; + ctx_arch->ctx_saved_psr_up = 0; + + /* + * in case of ZOMBIE state, there is no unload to clear + * insecure monitoring, so we do it in stop instead. + */ + if (ctx->state == PFM_CTX_ZOMBIE) + ia64_psr(regs)->sp = 1; + + if (task == current) { + pfm_clear_psr_up(); + ia64_srlz_d(); + } + } else if (ctx->flags.started) { /* do not stop twice */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + psr = ia64_getreg(_IA64_REG_PSR); + + ia64_psr(regs)->pp = 0; + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + pfm_clear_psr_pp(); + ia64_srlz_d(); + + if (set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("disabling idle exclude"); + __get_cpu_var(pfm_syst_info) &= ~PFM_ITA_CPUINFO_IDLE_EXCL; + } + } +} + +/* + * called from pfm_start() + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
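+ *
+ * [Editor's summary of the system-wide cases handled below:]
+ *
+ *	set flag                     dcr.pp  psr.pp  counting happens ...
+ *	PFM_ITA_SETFL_INTR_ONLY        1       0     only while handling interruptions
+ *	PFM_ITA_SETFL_EXCL_INTR        0       1     everywhere except interruption handlers
+ *	(neither flag)                 1       1     everywhere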
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, dcr_pp, psr_pp; + u32 flags; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + flags = set->flags; + + /* + * take care of per-thread mode + */ + if (!ctx->flags.system) { + + ia64_psr(regs)->up = 1; + + if (task == current) { + pfm_set_psr_up(); + ia64_srlz_d(); + } else { + /* + * start monitoring at the kernel level the next + * time the task is scheduled + */ + ctx_arch->ctx_saved_psr_up = IA64_PSR_UP; + } + return; + } + + /* + * take care of system-wide mode + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + if (flags & PFM_ITA_SETFL_INTR_ONLY) { + dcr_pp = 1; + psr_pp = 0; + } else if (flags & PFM_ITA_SETFL_EXCL_INTR) { + dcr_pp = 0; + psr_pp = 1; + } else { + dcr_pp = psr_pp = 1; + } + PFM_DBG("dcr_pp=%lu psr_pp=%lu", dcr_pp, psr_pp); + + /* + * update dcr_pp and psr_pp + */ + if (dcr_pp) + ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP); + else + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + if (psr_pp) { + pfm_set_psr_pp(); + ia64_psr(regs)->pp = 1; + } else { + pfm_clear_psr_pp(); + ia64_psr(regs)->pp = 0; + } + ia64_srlz_d(); + + if (set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("enable idle exclude"); + __get_cpu_var(pfm_syst_info) |= PFM_ITA_CPUINFO_IDLE_EXCL; + } +} + +/* + * Only call this function when a process is trying to + * write the debug registers (reading is always allowed) + * called from arch/ia64/kernel/ptrace.c:access_uarea() + */ +int __pfm_use_dbregs(struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_context *ctx; + unsigned long flags; + int ret = 0; + + PFM_DBG("called for [%d]", task->pid); + + ctx = task->pfm_context; + + /* + * do it only once + */ + if (task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("IA64_THREAD_DBG_VALID already set"); + return 0; + } + if (ctx) { + spin_lock_irqsave(&ctx->lock, flags); + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr == 1) { + PFM_DBG("PMU using dbregs already, no ptrace access"); + ret = -1; + } + spin_unlock_irqrestore(&ctx->lock, flags); + if (ret) + return ret; + } + + spin_lock(&pfm_arch_sessions_lock); + + /* + * We cannot allow setting breakpoints when system wide monitoring + * sessions are using the debug registers. + */ + if (!pfm_arch_sessions.pfs_sys_use_dbr) + pfm_arch_sessions.pfs_ptrace_use_dbr++; + else + ret = -1; + + PFM_DBG("ptrace_use_dbr=%u sys_use_dbr=%u by [%d] ret = %d", + pfm_arch_sessions.pfs_ptrace_use_dbr, + pfm_arch_sessions.pfs_sys_use_dbr, + task->pid, ret); + + spin_unlock(&pfm_arch_sessions_lock); + if (ret) + return ret; +#ifndef CONFIG_SMP + /* + * in UP, we need to check whether the current + * owner of the PMU is not using the debug registers + * for monitoring. Because we are using a lazy + * save on ctxswout, we must force a save in this + * case because the debug registers are being + * modified by another task. We save the current + * PMD registers, and clear ownership. In ctxswin, + * full state will be reloaded. + * + * Note: we overwrite task. + */ + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + if (task == NULL) + return 0; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr) + pfm_save_pmds_release(ctx); +#endif + return 0; +} + +/* + * This function is called for every task that exits with the + * IA64_THREAD_DBG_VALID set. 
This indicates a task which was + * able to use the debug registers for debugging purposes via + * ptrace(). Therefore we know it was not using them for + * perfmormance monitoring, so we only decrement the number + * of "ptraced" debug register users to keep the count up to date + */ +int __pfm_release_dbregs(struct task_struct *task) +{ + int ret; + + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr == 0) { + PFM_ERR("invalid release for [%d] ptrace_use_dbr=0", task->pid); + ret = -1; + } else { + pfm_arch_sessions.pfs_ptrace_use_dbr--; + ret = 0; + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct task_struct *task; + struct thread_struct *thread; + int ret = 0, state; + int i, can_access_pmu = 0; + int is_loaded, is_system; + + ctx_arch = pfm_ctx_arch(ctx); + state = ctx->state; + task = ctx->task; + is_loaded = state == PFM_CTX_LOADED || state == PFM_CTX_MASKED; + is_system = ctx->flags.system; + can_access_pmu = __get_cpu_var(pmu_owner) == task || is_system; + + if (is_loaded == 0) + goto done; + + if (is_system == 0) { + thread = &(task->thread); + + /* + * cannot use debug registers for montioring if they are + * already used for debugging + */ + if (thread->flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("debug registers already in use for [%d]", + task->pid); + return -EBUSY; + } + } + + /* + * check for debug registers in system wide mode + */ + spin_lock(&pfm_arch_sessions_lock); + + if (is_system) { + if (pfm_arch_sessions.pfs_ptrace_use_dbr) + ret = -EBUSY; + else + pfm_arch_sessions.pfs_sys_use_dbr++; + } + + spin_unlock(&pfm_arch_sessions_lock); + + if (ret != 0) + return ret; + + /* + * clear hardware registers to make sure we don't + * pick up stale state. + */ + if (can_access_pmu) { + PFM_DBG("clearing ibrs, dbrs"); + for (i = 0; i < 8; i++) { + ia64_set_ibr(i, 0); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); + for (i = 0; i < 8; i++) { + ia64_set_dbr(i, 0); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); + } +done: + /* + * debug registers are now in use + */ + ctx_arch->flags.use_dbr = 1; + set->priv_flags |= PFM_ITA_SETFL_USE_DBR; + PFM_DBG("set%u use_dbr=1", set->id); + return 0; +} +EXPORT_SYMBOL(pfm_ia64_mark_dbregs_used); + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(local_cpu_data->family) { + case 0x07: + return "perfmon_itanium"; + case 0x1f: + return "perfmon_mckinley"; + case 0x20: + return "perfmon_montecito"; + default: + return "perfmon_generic"; + } + return NULL; +} + +/* + * global arch-specific intialization, called only once + */ +int __init pfm_arch_init(void) +{ + int ret; + + spin_lock_init(&pfm_arch_sessions_lock); + +#ifdef CONFIG_IA64_PERFMON_COMPAT + ret = pfm_ia64_compat_init(); + if (ret) + return ret; +#endif + register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + + + return 0; +} Index: linux-2.6/arch/ia64/perfmon/perfmon_compat.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_compat.c @@ -0,0 +1,1166 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *st); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ld); + +/* + * function providing some help for backward compatiblity with old IA-64 + * applications. In the old model, certain attributes of a counter were + * passed via the PMC, now they are passed via the PMD. + */ +static int pfm_compat_update_pmd(struct pfm_context *ctx, u16 set_id, u16 cnum, + u32 rflags, + unsigned long *smpl_pmds, + unsigned long *reset_pmds, + u64 eventid) +{ + struct pfm_event_set *set; + int is_counting; + unsigned long *impl_pmds; + u32 flags = 0; + u16 max_pmd; + + impl_pmds = pfm_pmu_conf->regs.pmds; + max_pmd = pfm_pmu_conf->regs.max_pmd; + + /* + * given that we do not maintain PMC ->PMD dependencies + * we cannot figure out what to do in case PMCxx != PMDxx + */ + if (cnum > max_pmd) + return 0; + + /* + * assumes PMCxx controls PMDxx which is always true for counters + * on Itanium PMUs. 
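+ *
+ * [Editor's illustration -- not part of this patch; field and command
+ * names follow the legacy IA-64 perfmon interface.] With the old model
+ * the sampling attributes rode on the PMC write, e.g.:
+ *
+ *	struct pfarg_reg pc = {
+ *		.reg_num           = 4,		// PMC4 controls counter PMD4
+ *		.reg_value         = ...,	// event encoding
+ *		.reg_flags         = PFM_REGFL_OVFL_NOTIFY,
+ *		.reg_reset_pmds[0] = 1UL << 4,
+ *	};
+ *	perfmonctl(fd, PFM_WRITE_PMCS, &pc, 1);
+ *
+ * perfmon2 attaches those attributes to the PMD instead, so this helper
+ * copies them over to set->pmds[4] (flags, reset_pmds, smpl_pmds, eventid).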
+ */ + is_counting = pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64; + set = pfm_find_set(ctx, set_id, 0); + + if (is_counting) { + if (rflags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + if (rflags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + /* + * verify validity of smpl_pmds + */ + if (unlikely(bitmap_subset(smpl_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", + (unsigned long long)smpl_pmds[0], cnum); + return -EINVAL; + } + /* + * verify validity of reset_pmds + */ + if (unlikely(bitmap_subset(reset_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid reset_pmds=0x%lx for pmd%u", + reset_pmds[0], cnum); + return -EINVAL; + } + /* + * ensures that a PFM_READ_PMDS succeeds with a + * corresponding PFM_WRITE_PMDS + */ + __set_bit(cnum, set->used_pmds); + + } else if (rflags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { + PFM_DBG("cannot set ovfl_notify or random on pmd%u", cnum); + return -EINVAL; + } + + set->pmds[cnum].flags = flags; + + if (is_counting) { + bitmap_copy(set->pmds[cnum].reset_pmds, + reset_pmds, + max_pmd); + + bitmap_copy(set->pmds[cnum].smpl_pmds, + smpl_pmds, + max_pmd); + + set->pmds[cnum].eventid = eventid; + + /* + * update ovfl_notify + */ + if (rflags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, set->ovfl_notify); + else + __clear_bit(cnum, set->ovfl_notify); + + } + PFM_DBG("pmd%u flags=0x%x eventid=0x%lx r_pmds=0x%lx s_pmds=0x%lx", + cnum, flags, + eventid, + reset_pmds[0], + smpl_pmds[0]); + + return 0; +} + + +int __pfm_write_ibrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 256+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= ~PFM_REG_RETFL_MASK; + req->dbreg_flags |= pmc.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_ibrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_ibrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_dbrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 264+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= ~PFM_REG_RETFL_MASK; + 
req->dbreg_flags |= pmc.reg_flags; + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_dbrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_dbrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmcs_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmc req; + unsigned int i; + int ret, error_code; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_flags = 0; + req.reg_value = req_old->reg_value; + + ret = __pfm_write_pmcs(ctx, (void *)&req, 1); + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + + ret = pfm_compat_update_pmd(ctx, req_old->reg_set, + req_old->reg_num, + (u32)req_old->reg_flags, + req_old->reg_smpl_pmds, + req_old->reg_reset_pmds, + req_old->reg_smpl_eventid); + + error_code = ret ? 
PFM_REG_RETFL_EINVAL : 0; + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= error_code; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_pmcs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_value = req_old->reg_value; + req.reg_flags = req_old->reg_flags; + + req.reg_long_reset = req_old->reg_long_reset; + req.reg_short_reset = req_old->reg_short_reset; + req.reg_random_mask = req_old->reg_random_mask; + /* + * reg_random_seed is ignored since v2.3 + */ + + /* + * skip last_reset_val not used for writing + * skip smpl_pmds, reset_pmds, eventid, ovfl_swtch_cnt + * as set in pfm_write_pmcs_old. 
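+ *
+ * [Editor's summary:] one legacy pfarg_reg carries both PMC and PMD
+ * state. pfm_write_pmcs_old() forwards reg_num/reg_set/reg_value to
+ * __pfm_write_pmcs() and pushes the per-counter sampling attributes
+ * (ovfl_notify, random, smpl_pmds, reset_pmds, eventid) onto the PMD via
+ * pfm_compat_update_pmd(), so this function only forwards the remaining
+ * per-PMD values (reset values, random mask, overflow switch count)
+ * through the new pfarg_pmd.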
+ */ + req.reg_ovfl_switch_cnt = req_old->reg_ovfl_switch_cnt; + + ret = __pfm_write_pmds(ctx, (void *)&req, 1, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_read_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + + /* skip value not used for reading */ + req.reg_flags = req_old->reg_flags; + + /* skip short/long_reset not used for reading */ + /* skip last_reset_val not used for reading */ + /* skip ovfl_switch_cnt not used for reading */ + + ret = __pfm_read_pmds(ctx, (void *)&req, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + if (ret) + return ret; + + /* update fields */ + req_old->reg_value = req.reg_value; + + req_old->reg_last_reset_val = req.reg_last_reset_val; + req_old->reg_ovfl_switch_cnt = req.reg_ovfl_switch_cnt; + } + return 0; +} + +static long pfm_read_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_read_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +/* + * OBSOLETE: use /proc/perfmon_map instead + */ +static long pfm_get_default_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfarg_reg *req = NULL; + void *fptr; + size_t sz; + int ret, i; + unsigned int cnum; + + if (count < 1) + return -EINVAL; + + /* + * ensure the pfm_pmu_conf does not disappear while 
+ * we use it + */ + ret = pfm_pmu_conf_get(1); + if (ret) + return ret; + + sz = count*sizeof(*ureq); + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + + for (i = 0; i < count; i++, req++) { + cnum = req->reg_num; + + if (i >= PFM_MAX_PMCS || + (pfm_pmu_conf->pmc_desc[cnum].type & PFM_REG_I) == 0) { + req->reg_flags = PFM_REG_RETFL_EINVAL; + break; + } + req->reg_value = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + req->reg_flags = 0; + + PFM_DBG("pmc[%u]=0x%lx", cnum, req->reg_value); + } + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + pfm_pmu_conf_put(); + + return ret; +} + +/* + * allocate a sampling buffer and remaps it into the user address space of + * the task. This is only in compatibility mode + * + * function called ONLY on current task + */ +int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, size_t rsize, + struct file *filp) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct pfm_arch_context *ctx_arch; + size_t size; + int ret; + extern struct vm_operations_struct pfm_buf_map_vm_ops; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * allocate buffer + map desc + */ + ret = pfm_smpl_buffer_alloc(ctx, rsize); + if (ret) + return ret; + + size = ctx->smpl_size; + + + /* allocate vma */ + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + PFM_DBG("Cannot allocate vma"); + goto error_kmem; + } + memset(vma, 0, sizeof(*vma)); + + /* + * partially initialize the vma for the sampling buffer + */ + vma->vm_mm = mm; + vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_page_prot = PAGE_READONLY; + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_file = filp; + vma->vm_private_data = ctx; + vma->vm_pgoff = 0; + + /* + * simulate effect of mmap() + */ + get_file(filp); + + /* + * Let's do the difficult operations next. + * + * now we atomically find some area in the address space and + * remap the buffer into it. + */ + down_write(¤t->mm->mmap_sem); + + /* find some free area in address space, must have mmap sem held */ + vma->vm_start = get_unmapped_area(NULL, 0, size, 0, + MAP_PRIVATE|MAP_ANONYMOUS); + if (vma->vm_start == 0) { + PFM_DBG("cannot find unmapped area of size %zu", size); + up_write(¤t->mm->mmap_sem); + goto error; + } + vma->vm_end = vma->vm_start + size; + + PFM_DBG("aligned_size=%zu mapped @0x%lx", size, vma->vm_start); + /* + * now insert the vma in the vm list for the process, must be + * done with mmap lock held + */ + insert_vm_struct(mm, vma); + + mm->total_vm += size >> PAGE_SHIFT; + + up_write(¤t->mm->mmap_sem); + + /* + * IMPORTANT: we do not issue the fput() + * because we want to increase the ref count + * on the descriptor to simulate what mmap() + * would do + */ + + /* + * used to propagate vaddr to syscall stub + */ + ctx_arch->ctx_smpl_vaddr = (void *)vma->vm_start; + + return 0; +error: + kmem_cache_free(vm_area_cachep, vma); +error_kmem: + pfm_release_buf_space(ctx, ctx->smpl_size); + vfree(ctx->smpl_addr); + return -ENOMEM; +} + +#define PFM_DEFAULT_SMPL_UUID { \ + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + +static pfm_uuid_t old_default_uuid = PFM_DEFAULT_SMPL_UUID; +static pfm_uuid_t null_uuid; + +/* + * function invoked in case, pfm_context_create fails + * at the last operation, copy_to_user. 
It needs to + * undo memory allocations and free the file descriptor + */ +static void pfm_undo_create_context_fd(int fd, struct pfm_context *ctx) +{ + struct files_struct *files = current->files; + struct file *file; + int fput_needed; + + file = fget_light(fd, &fput_needed); + /* + * there is no fd_uninstall(), so we do it + * here. put_unused_fd() does not remove the + * effect of fd_install(). + */ + + spin_lock(&files->file_lock); + files->fd_array[fd] = NULL; + spin_unlock(&files->file_lock); + + fput_light(file, fput_needed); + + /* + * decrement ref count and kill file + */ + put_filp(file); + + put_unused_fd(fd); + + pfm_context_free(ctx); +} + +static int pfm_get_smpl_arg_old(pfm_uuid_t uuid, void __user *fmt_uarg, + size_t usize, void **arg, + struct pfm_smpl_fmt **fmt) +{ + struct pfm_smpl_fmt *f; + void *addr = NULL; + size_t sz; + int ret; + + if (!memcmp(uuid, null_uuid, sizeof(pfm_uuid_t))) + return 0; + + if (memcmp(uuid, old_default_uuid, sizeof(pfm_uuid_t))) { + PFM_DBG("compatibility mode supports only default sampling format"); + return -EINVAL; + } + /* + * find fmt and increase refcount + */ + f = pfm_smpl_fmt_get("default-old"); + if (f == NULL) { + PFM_DBG("default-old buffer format not found"); + return -EINVAL; + } + + /* + * expected format argument size + */ + sz = f->fmt_arg_size; + + /* + * check user size matches expected size + * usize = -1 is for IA-64 backward compatibility + */ + ret = -EINVAL; + if (sz != usize && usize != -1) { + PFM_DBG("invalid arg size %zu, format expects %zu", + usize, sz); + goto error; + } + + ret = -ENOMEM; + addr = kmalloc(sz, GFP_KERNEL); + if (addr == NULL) + goto error; + + ret = -EFAULT; + if (copy_from_user(addr, fmt_uarg, sz)) + goto error; + + *arg = addr; + *fmt = f; + return 0; + +error: + kfree(addr); + pfm_smpl_fmt_put(f); + return ret; +} + +static long pfm_create_context_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *new_ctx; + struct pfm_arch_context *ctx_arch; + struct pfm_smpl_fmt *fmt = NULL; + struct pfarg_context req_old; + void __user *usmpl_arg; + void *smpl_arg = NULL; + struct pfarg_ctx req; + int ret; + + if (count != 1) + return -EINVAL; + + if (copy_from_user(&req_old, ureq, sizeof(req_old))) + return -EFAULT; + + memset(&req, 0, sizeof(req)); + + /* + * sampling format args are following pfarg_context + */ + usmpl_arg = ureq+sizeof(req_old); + + ret = pfm_get_smpl_arg_old(req_old.ctx_smpl_buf_id, usmpl_arg, -1, + &smpl_arg, &fmt); + if (ret) + return ret; + + req.ctx_flags = req_old.ctx_flags; + + /* + * returns file descriptor if >=0, or error code */ + ret = __pfm_create_context(&req, fmt, smpl_arg, PFM_COMPAT, &new_ctx); + if (ret >= 0) { + ctx_arch = pfm_ctx_arch(new_ctx); + req_old.ctx_fd = ret; + req_old.ctx_smpl_vaddr = ctx_arch->ctx_smpl_vaddr; + } + + if (copy_to_user(ureq, &req_old, sizeof(req_old))) { + pfm_undo_create_context_fd(req_old.ctx_fd, new_ctx); + ret = -EFAULT; + } + + kfree(smpl_arg); + + return ret; +} + +/* + * obsolete call: use /proc/perfmon + */ +static long pfm_get_features_old(int fd, void __user *arg, int count) +{ + struct pfarg_features req; + int ret = 0; + + if (count != 1) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + + req.ft_version = PFM_VERSION; + + if (copy_to_user(arg, &req, sizeof(req))) + ret = -EFAULT; + + return ret; +} + +static long pfm_debug_old(int fd, void __user *arg, int count) +{ + int m; + + if (count != 1) + return -EINVAL; + + if (get_user(m, (int __user*)arg)) + return -EFAULT; + + + pfm_controls.debug = m == 0 ? 
0 : 1; + + PFM_INFO("debugging %s (timing reset)", + pfm_controls.debug ? "on" : "off"); + + if (m == 0) + for_each_online_cpu(m) { + memset(&per_cpu(pfm_stats,m), 0, + sizeof(struct pfm_stats)); + } + return 0; +} + +static long pfm_unload_context_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_unload_context(fd); +} + +static long pfm_restart_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_restart(fd); +} + +static long pfm_stop_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_stop(fd); +} + +static long pfm_start_old(int fd, void __user *arg, int count) +{ + if (count > 1) + return -EINVAL; + + return sys_pfm_start(fd, arg); +} + +static long pfm_load_context_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + unsigned long flags; + struct pfarg_load req; + int ret, fput_needed; + + if (count != 1) + return -EINVAL; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + task = NULL; + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * in per-thread mode (not self-monitoring), get a reference + * on task to monitor. This must be done with interrupts enabled + * Upon succesful return, refcount on task is increased. + * + * fget_light() is protecting the context. + */ + if (!ctx->flags.system) { + if (req.load_pid != current->pid) { + ret = pfm_get_task(ctx, req.load_pid, &task); + if (ret) + goto error; + } else + task = current; + } + /* + * irqsave is required to avoid race in case context is already + * loaded or with switch timeout in the case of self-monitoring + */ + spin_lock_irqsave(&ctx->lock, flags); + + /* + * the new interface requires the desired CPU to be explicitely set + * in this field. 
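+ *
+ * [Editor's illustration -- not part of this patch; command names as in
+ * the legacy perfmon header.] A self-monitoring thread using the old
+ * interface attaches and starts with something like:
+ *
+ *	struct pfarg_load ld = { .load_pid = getpid() };
+ *	perfmonctl(fd, PFM_LOAD_CONTEXT, &ld, 1);
+ *	perfmonctl(fd, PFM_START, NULL, 0);
+ *
+ * For a system-wide session, load_pid is simply overwritten below with
+ * the CPU the caller happens to run on, which is what the new interface
+ * expects.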
the kernel then checks you are on the right CPU + */ + if (ctx->flags.system) + req.load_pid = smp_processor_id(); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_load_context(ctx, &req, task); + + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * in per-thread mode (not self-monitoring), we need + * to decrease refcount on task to monitor: + * - load successful: we have a reference to the task in ctx->task + * - load failed : undo the effect of pfm_get_task() + */ + if (task && task != current) + put_task_struct(task); + +error: + fput_light(filp, fput_needed); + return ret; +} + +/* + * perfmon command descriptions + */ +struct pfm_cmd_desc { + long (*cmd_func)(int fd, void __user *arg, int count); +}; + +/* + * functions MUST be listed in the increasing order of + * their index (see permfon.h) + */ +#define PFM_CMD(name) \ + { .cmd_func = name, \ + } +#define PFM_CMD_NONE \ + { .cmd_func = NULL \ + } + +static struct pfm_cmd_desc pfm_cmd_tab[]={ +/* 0 */PFM_CMD_NONE, +/* 1 */PFM_CMD(pfm_write_pmcs_old), +/* 2 */PFM_CMD(pfm_write_pmds_old), +/* 3 */PFM_CMD(pfm_read_pmds_old), +/* 4 */PFM_CMD(pfm_stop_old), +/* 5 */PFM_CMD(pfm_start_old), +/* 6 */PFM_CMD_NONE, +/* 7 */PFM_CMD_NONE, +/* 8 */PFM_CMD(pfm_create_context_old), +/* 9 */PFM_CMD_NONE, +/* 10 */PFM_CMD(pfm_restart_old), +/* 11 */PFM_CMD_NONE, +/* 12 */PFM_CMD(pfm_get_features_old), +/* 13 */PFM_CMD(pfm_debug_old), +/* 14 */PFM_CMD_NONE, +/* 15 */PFM_CMD(pfm_get_default_pmcs_old), +/* 16 */PFM_CMD(pfm_load_context_old), +/* 17 */PFM_CMD(pfm_unload_context_old), +/* 18 */PFM_CMD_NONE, +/* 19 */PFM_CMD_NONE, +/* 20 */PFM_CMD_NONE, +/* 21 */PFM_CMD_NONE, +/* 22 */PFM_CMD_NONE, +/* 23 */PFM_CMD_NONE, +/* 24 */PFM_CMD_NONE, +/* 25 */PFM_CMD_NONE, +/* 26 */PFM_CMD_NONE, +/* 27 */PFM_CMD_NONE, +/* 28 */PFM_CMD_NONE, +/* 29 */PFM_CMD_NONE, +/* 30 */PFM_CMD_NONE, +/* 31 */PFM_CMD_NONE, +/* 32 */PFM_CMD(pfm_write_ibrs_old), +/* 33 */PFM_CMD(pfm_write_dbrs_old), +}; +#define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab) + +/* + * system-call entry point (must return long) + */ +asmlinkage long sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + if (perfmon_disabled) + return -ENOSYS; + + if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT + || pfm_cmd_tab[cmd].cmd_func == NULL)) { + PFM_DBG("invalid cmd=%d", cmd); + return -EINVAL; + } + return (long)pfm_cmd_tab[cmd].cmd_func(fd, arg, count); +} + +/* + * legacy /proc/perfmon simplified interface (we only maintain the + * global information (no more per-cpu stats, use + * /sys/devices/system/cpu/cpuXX/perfmon + */ +static struct proc_dir_entry *perfmon_proc; + +static void *pfm_proc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return (void *)1; + + return NULL; +} + +static void *pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return pfm_proc_start(m, pos); +} + +static void pfm_proc_stop(struct seq_file *m, void *v) +{ +} + +/* + * this is a simplified version of the legacy /proc/perfmon. 
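+ *
+ * [Editor's note:] with this simplified version, reading /proc/perfmon
+ * produces just two lines of the form
+ *
+ *	perfmon version : <major>.<minor>
+ *	model : <session summary from pfm_sysfs_session_show()>
+ *
+ * (see pfm_proc_show_header() below); the per-CPU statistics moved to
+ * /sys/devices/system/cpu/cpuXX/perfmon.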
+ * We have retained ONLY the key information that tools are actually + * using + */ +static void pfm_proc_show_header(struct seq_file *m) +{ + char buf[128]; + + pfm_sysfs_session_show(buf, sizeof(buf), 3); + + seq_printf(m, "perfmon version : %u.%u\n", + PFM_VERSION_MAJ, PFM_VERSION_MIN); + + seq_printf(m, "model : %s", buf); +} + +static int pfm_proc_show(struct seq_file *m, void *v) +{ + pfm_proc_show_header(m); + return 0; +} + +struct seq_operations pfm_proc_seq_ops = { + .start = pfm_proc_start, + .next = pfm_proc_next, + .stop = pfm_proc_stop, + .show = pfm_proc_show +}; + +static int pfm_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pfm_proc_seq_ops); +} + + +static struct file_operations pfm_proc_fops = { + .open = pfm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * called from pfm_arch_init(), global initialization, called once + */ +int __init pfm_ia64_compat_init(void) +{ + /* + * create /proc/perfmon + */ + perfmon_proc = create_proc_entry("perfmon", S_IRUGO, NULL); + if (perfmon_proc == NULL) { + PFM_ERR("cannot create /proc entry, perfmon disabled"); + return -1; + } + perfmon_proc->proc_fops = &pfm_proc_fops; + return 0; +} Index: linux-2.6/arch/ia64/perfmon/perfmon_default_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_default_smpl.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the old default sampling buffer format + * for the Linux/ia64 perfmon-2 subsystem. This is for backward + * compatibility only. use the new default format in perfmon/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef MODULE +#define FMT_FLAGS 0 +#else +#define FMT_FLAGS PFM_FMTFL_IS_BUILTIN +#endif + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("perfmon old default sampling format"); +MODULE_LICENSE("GPL"); + +static int pfm_default_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_arg *arg = data; + size_t min_buf_size; + + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. 
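+ *
+ * [Editor's worked example:] with npmds == 4 extra PMDs recorded per
+ * sample, the caller must supply at least
+ *
+ *	sizeof(struct pfm_default_smpl_hdr)
+ *	  + sizeof(struct pfm_default_smpl_entry) + 4 * sizeof(u64)
+ *
+ * bytes, i.e. room for the buffer header plus one minimally sized entry.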
All PMD are manipulated as 64bit entities + */ + min_buf_size = sizeof(struct pfm_default_smpl_hdr) + + (sizeof(struct pfm_default_smpl_entry) + + (npmds*sizeof(u64))); + + PFM_DBG("validate flags=0x%x npmds=%u min_buf_size=%lu " + "buf_size=%lu CPU%d", flags, npmds, min_buf_size, + arg->buf_size, smp_processor_id()); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) return -EINVAL; + + return 0; +} + +static int pfm_default_fmt_get_size(unsigned int flags, void *data, + size_t *size) +{ + struct pfm_default_smpl_arg *arg = data; + + /* + * size has been validated in default_validate + */ + *size = arg->buf_size; + + return 0; +} + +static int pfm_default_fmt_init(struct pfm_context *ctx, void *buf, + u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_arg *arg = data; + + hdr = buf; + + hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0; + hdr->hdr_count = 0; + + PFM_DBG("buffer=%p buf_size=%lu hdr_size=%lu " + "hdr_version=%u cur_offs=%lu", + buf, + hdr->hdr_buf_size, + sizeof(*hdr), + hdr->hdr_version, + hdr->hdr_cur_offs); + + return 0; +} + +static int pfm_default_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_entry *ent; + void *cur, *last; + u64 *e; + size_t entry_size; + u16 npmds, i, ovfl_pmd; + + hdr = buf; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + + /* + * precheck for sanity + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + npmds = arg->num_smpl_pmds; + + ent = cur; + + prefetch(arg->smpl_pmds_values); + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (unsigned long *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%lu cur=%p last=%p free_bytes=%lu " + "ovfl_pmd=%d npmds=%u", + hdr->hdr_count, + cur, last, + last-cur, + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is ususally the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for(i=0; i < npmds; i++) { + *e++ = *val++; + } + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + /* + * reset before returning from interrupt handler + */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + return 0; +full: + PFM_DBG_ovfl("smpl buffer full free=%lu, count=%lu", + last-cur, hdr->hdr_count); + + /* + * increment number of buffer overflow. + * important to detect duplicate set of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. 
+ * Notification is still subject to the overflowed
+ * PMD having its ovfl_notify flag set.
+ */
+	arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
+
+	return -ENOBUFS; /* we are full, sorry */
+}
+
+static int pfm_default_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf)
+{
+	struct pfm_default_smpl_hdr *hdr;
+
+	hdr = buf;
+
+	hdr->hdr_count = 0;
+	hdr->hdr_cur_offs = sizeof(*hdr);
+
+	*ovfl_ctrl = PFM_OVFL_CTRL_RESET;
+
+	return 0;
+}
+
+static int pfm_default_fmt_exit(void *buf)
+{
+	return 0;
+}
+
+static struct pfm_smpl_fmt default_fmt = {
+	.fmt_name = "default-old",
+	.fmt_version = 0x10000,
+	.fmt_arg_size = sizeof(struct pfm_default_smpl_arg),
+	.fmt_validate = pfm_default_fmt_validate,
+	.fmt_getsize = pfm_default_fmt_get_size,
+	.fmt_init = pfm_default_fmt_init,
+	.fmt_handler = pfm_default_fmt_handler,
+	.fmt_restart = pfm_default_fmt_restart,
+	.fmt_exit = pfm_default_fmt_exit,
+	.fmt_flags = FMT_FLAGS,
+	.owner = THIS_MODULE
+};
+
+static int pfm_default_fmt_init_module(void)
+{
+	return pfm_fmt_register(&default_fmt);
+}
+
+static void pfm_default_fmt_cleanup_module(void)
+{
+	pfm_fmt_unregister(&default_fmt);
+}
+
+module_init(pfm_default_fmt_init_module);
+module_exit(pfm_default_fmt_cleanup_module);
Index: linux-2.6/arch/ia64/perfmon/perfmon_generic.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/ia64/perfmon/perfmon_generic.c
@@ -0,0 +1,148 @@
+/*
+ * This file contains the generic PMU register description tables
+ * and pmc checker used by perfmon.c.
+ *
+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Generic IA-64 PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_IA64GEN_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)) +#define PFM_IA64GEN_RSVD (0xffffffffffff0080UL) +#define PFM_IA64GEN_NO64 (1UL<<5) + +/* forward declaration */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf; + +static struct pfm_arch_pmu_info pfm_ia64gen_pmu_info={ + .mask_pmcs = {PFM_IA64GEN_MASK_PMCS,}, +}; + +static struct pfm_regmap_desc pfm_ia64gen_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 7) +}; +#define PFM_IA64GEN_NUM_PMCS ARRAY_SIZE(pfm_ia64gen_pmc_desc) + +static struct pfm_regmap_desc pfm_ia64gen_pmd_desc[]={ +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7) +}; +#define PFM_IA64GEN_NUM_PMDS ARRAY_SIZE(pfm_ia64gen_pmd_desc) + +static int pfm_ia64gen_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_IA64GEN_PMC_PM_POS6 (1UL<< 6) + u64 tmpval; + int is_system; + + is_system = ctx->flags.system; + tmpval = req->reg_value; + + switch(req->reg_num) { + case 4: + case 5: + case 6: + case 7: + /* set pmc.oi for 64-bit emulation */ + tmpval |= 1UL << 5; + + if (is_system) + tmpval |= PFM_IA64GEN_PMC_PM_POS6; + else + tmpval &= ~PFM_IA64GEN_PMC_PM_POS6; + break; + + } + req->reg_value = tmpval; + + return 0; +} + +/* + * matches anything + */ +static int pfm_ia64gen_probe_pmu(void) +{ + u64 pm_buffer[16]; + pal_perf_mon_info_u_t pm_info; + + /* + * call PAL_PERFMON_INFO to retrieve counter width which + * is implementation specific + */ + if (ia64_pal_perf_mon_info(pm_buffer, &pm_info)) + return -1; + + pfm_ia64gen_pmu_conf.counter_width = pm_info.pal_perf_mon_info_s.width; + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
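+ *
+ * [Editor's note:] "computed at runtime" means the perfmon core derives
+ * the implemented-register bitmasks when this description is registered
+ * (pfm_pmu_register()), by walking pmc_desc[]/pmd_desc[] and skipping
+ * the PMX_NA entries; for the tables above that yields PMC4-PMC7 and
+ * PMD4-PMD7, i.e. a mask of 0xf0 for both.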
+ */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf={ + .pmu_name = "Generic IA-64", + .counter_width = 0, /* computed from PAL_PERFMON_INFO */ + .pmd_desc = pfm_ia64gen_pmd_desc, + .pmc_desc = pfm_ia64gen_pmc_desc, + .probe_pmu = pfm_ia64gen_probe_pmu, + .num_pmc_entries = PFM_IA64GEN_NUM_PMCS, + .num_pmd_entries = PFM_IA64GEN_NUM_PMDS, + .pmc_write_check = pfm_ia64gen_pmc_check, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = & pfm_ia64gen_pmu_info + /* no read/write checkers */ +}; + +static int __init pfm_gen_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ia64gen_pmu_conf); +} + +static void __exit pfm_gen_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ia64gen_pmu_conf); +} + +module_init(pfm_gen_pmu_init_module); +module_exit(pfm_gen_pmu_cleanup_module); Index: linux-2.6/arch/ia64/perfmon/perfmon_itanium.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_itanium.c @@ -0,0 +1,229 @@ +/* + * This file contains the Itanium PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium (Merced) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1ULL << (x)) + +#define PFM_ITA_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_ITA_NO64 (1ULL<<5) + +static struct pfm_arch_pmu_info pfm_ita_pmu_info={ + .mask_pmcs = {PFM_ITA_MASK_PMCS,}, +}; +/* reserved bits are 1 in the mask */ +#define PFM_ITA_RSVD 0xfffffffffc8000a0UL +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
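+ *
+ * [Editor's illustration -- not part of this patch:] because the debug
+ * registers are folded into the PMC namespace, a legacy request to set,
+ * say, IBR2 is rewritten by the compat layer as a plain PMC write:
+ *
+ *	struct pfarg_pmc pc = {
+ *		.reg_num   = 256 + 2,		// IBR2
+ *		.reg_value = req->dbreg_value,
+ *		.reg_set   = req->dbreg_set,
+ *	};
+ *	__pfm_write_pmcs(ctx, &pc, 1);
+ *
+ * (see __pfm_write_ibrs_old(); data breakpoints use 264 + dbreg_num).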
+ */ +static struct pfm_regmap_desc pfm_ita_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xfffffffff3f0ff30UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x10000000UL, 0xffffffffecf0ff30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0030UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x3ffff00000001UL, 0xfffffffffffffffeUL, 0, 13), +/* pmc14 */ PMX_NA, +/* pmc15 */ PMX_NA, +/* pmc16 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ 
PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_ITA_NUM_PMCS ARRAY_SIZE(pfm_ita_pmc_desc) + +static struct pfm_regmap_desc pfm_ita_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_I , "PMD0", 0), +/* pmd1 */ PMD_D(PFM_REG_I , "PMD1", 1), +/* pmd2 */ PMD_D(PFM_REG_I , "PMD2", 2), +/* pmd3 */ PMD_D(PFM_REG_I , "PMD3", 3), +/* pmd4 */ PMD_D(PFM_REG_C , "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C , "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C , "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C , "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_I , "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_I , "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_I , "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_I , "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_I , "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_I , "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_I , "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_I , "PMD15", 15), +/* pmd16 */ PMD_D(PFM_REG_I , "PMD16", 16), +/* pmd17 */ PMD_D(PFM_REG_I , "PMD17", 17) +}; +#define PFM_ITA_NUM_PMDS ARRAY_SIZE(pfm_ita_pmd_desc) + +static int pfm_ita_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_ITA_PMC_PM_POS6 (1UL<< 6) + struct pfm_arch_context *ctx_arch; + u64 tmpval; + u16 cnum; + int ret = 0, is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 10: + case 11: + case 12: if (is_system) + tmpval |= PFM_ITA_PMC_PM_POS6; + else + tmpval &= ~PFM_ITA_PMC_PM_POS6; + break; + } + + /* + * we must clear the (instruction) debug registers if pmc13.ta bit is + * cleared before they are written (fl_using_dbreg==0) to avoid + * picking up stale information. + */ + if (cnum == 13 && ((tmpval & 0x1) == 0) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13 has pmc13.ta cleared, clearing ibr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + /* + * we must clear the (data) debug registers if pmc11.pt bit is cleared + * before they are written (fl_using_dbreg==0) to avoid picking up + * stale information. + */ + if (cnum == 11 && ((tmpval >> 28)& 0x1) == 0 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc11 has pmc11.pt cleared, clearing dbr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_ita_probe_pmu(void) +{ + return local_cpu_data->family == 0x7 && !ia64_platform_is("hpsim") + ? 0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
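+ *
+ * [Editor's note:] this description only registers when
+ * pfm_ita_probe_pmu() above reports an Itanium (cpu family 0x07) that is
+ * not the hpsim simulator; pfm_arch_get_pmu_module_name() in perfmon.c
+ * maps that same family value to "perfmon_itanium", which is how the
+ * core finds these tables when they are built as a module.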
+ */ +static struct pfm_pmu_config pfm_ita_pmu_conf={ + .pmu_name = "Itanium", + .counter_width = 32, + .pmd_desc = pfm_ita_pmd_desc, + .pmc_desc = pfm_ita_pmc_desc, + .pmc_write_check = pfm_ita_pmc_check, + .num_pmc_entries = PFM_ITA_NUM_PMCS, + .num_pmd_entries = PFM_ITA_NUM_PMDS, + .probe_pmu = pfm_ita_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_ita_pmu_info +}; + +static int __init pfm_ita_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ita_pmu_conf); +} + +static void __exit pfm_ita_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ita_pmu_conf); +} + +module_init(pfm_ita_pmu_init_module); +module_exit(pfm_ita_pmu_cleanup_module); + Index: linux-2.6/arch/ia64/perfmon/perfmon_mckinley.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_mckinley.c @@ -0,0 +1,285 @@ +/* + * This file contains the McKinley PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium 2 (McKinley) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MCK_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_MCK_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mck_pmu_info={ + .mask_pmcs = {PFM_MCK_MASK_PMCS,}, +}; + +/* reserved bits are 1 in the mask */ +#define PFM_ITA2_RSVD 0xfffffffffc8000a0UL + +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
+ */ +static struct pfm_regmap_desc pfm_mck_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x800020UL, 0xfffffffffc8000a0, PFM_MCK_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xffffffff3fffffffUL, 0xc0000004UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xffffffff3ffffffcUL, 0xc0000004UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xffffffffffff0000UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x0, 0xfffffffffcf0fe30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0000UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x2078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 13), +/* pmc14 */ PMC_D(PFM_REG_W , "PMC14", 0x0db60db60db60db6UL, 0xffffffffffffdb6dUL, 0, 14), +/* pmc15 */ PMC_D(PFM_REG_W , "PMC15", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 15), +/* pmc16 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ 
PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MCK_NUM_PMCS ARRAY_SIZE(pfm_mck_pmc_desc) + +static struct pfm_regmap_desc pfm_mck_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_I, "PMD0", 0), +/* pmd1 */ PMD_D(PFM_REG_I, "PMD1", 1), +/* pmd2 */ PMD_D(PFM_REG_I, "PMD2", 2), +/* pmd3 */ PMD_D(PFM_REG_I, "PMD3", 3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_I, "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_I, "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_I, "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_I, "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_I, "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_I, "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_I, "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_I, "PMD15", 15), +/* pmd16 */ PMD_D(PFM_REG_I, "PMD16", 16), +/* pmd17 */ PMD_D(PFM_REG_I, "PMD17", 17) +}; +#define PFM_MCK_NUM_PMDS ARRAY_SIZE(pfm_mck_pmd_desc) + +static int pfm_mck_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val8 = 0, val14 = 0, val13 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MCK_PMC_PM_POS6 (1UL<< 6) +#define PFM_MCK_PMC_PM_POS4 (1UL<< 4) + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 11: + case 12: if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS6; + else + tmpval &= ~PFM_MCK_PMC_PM_POS6; + break; + + case 8: val8 = tmpval; + val13 = set->pmcs[13]; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 10: if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS4; + else + tmpval &= ~PFM_MCK_PMC_PM_POS4; + break; + + case 13: + val8 = set->pmcs[8]; + val13 = tmpval; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 14: + val8 = set->pmcs[8]; + val13 = set->pmcs[13]; + val14 = tmpval; + check_case1 = 1; + break; + } + + /* + * check illegal configuration which can produce inconsistencies + * in tagging i-side events in L1D and L2 caches + */ + if (check_case1) { + ret = (((val13 >> 45) & 0xf) == 0 && ((val8 & 0x1) == 0)) + && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) + ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); + + if (ret) { + PFM_DBG("perfmon: invalid config pmc8=0x%lx " + "pmc13=0x%lx pmc14=0x%lx", + val8, val13, val14); + return -EINVAL; + } + } + + /* + * check if configuration implicitely activates the use of + * the debug registers. If true, then we ensure that this is + * possible and that we do not pick up stale value in the HW + * registers. 
+ * + * We postpone the checks of pmc13 and pmc14 to avoid side effects + * in case of errors + */ + + /* + * pmc13 is "active" if: + * one of the pmc13.cfg_dbrpXX fields is different from 0x3 + * AND + * the corresponding pmc13.ena_dbrpXX is set. + */ + if (cnum == 13 && (tmpval & 0x1e00000000000UL) + && (tmpval & 0x18181818UL) != 0x18181818UL + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + /* + * if any pmc14.ibrpX bit is enabled we must clear the ibrs + */ + if (cnum == 14 && ((tmpval & 0x2222UL) != 0x2222UL) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc14=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_mck_probe_pmu(void) +{ + return local_cpu_data->family == 0x1f ? 0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_mck_pmu_conf={ + .pmu_name = "Itanium 2", + .counter_width = 47, + .pmd_desc = pfm_mck_pmd_desc, + .pmc_desc = pfm_mck_pmc_desc, + .pmc_write_check = pfm_mck_pmc_check, + .num_pmc_entries = PFM_MCK_NUM_PMCS, + .num_pmd_entries = PFM_MCK_NUM_PMDS, + .probe_pmu = pfm_mck_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_mck_pmu_info, +}; + +static int __init pfm_mck_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mck_pmu_conf); +} + +static void __exit pfm_mck_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mck_pmu_conf); +} + +module_init(pfm_mck_pmu_init_module); +module_exit(pfm_mck_pmu_cleanup_module); Index: linux-2.6/arch/ia64/perfmon/perfmon_montecito.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_montecito.c @@ -0,0 +1,404 @@ +/* + * This file contains the Montecito PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Dual-Core Itanium 2 (Montecito) PMU description table"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MONT_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|\ + RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|\ + RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|\ + RDEP(37)|RDEP(39)|RDEP(40)|RDEP(42)) + +#define PFM_MONT_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mont_pmu_info={ + .mask_pmcs = {PFM_MONT_MASK_PMCS,}, +}; + +#define PFM_MONT_RSVD 0xffffffff838000a0UL +/* + * + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1.
Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. + * + * For PMC4-PMC15, PMC40: we force pmc.ism=2 (IA-64 mode only) + */ +static struct pfm_regmap_desc pfm_mont_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W64, "PMC8" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 8), +/* pmc9 */ PMC_D(PFM_REG_W64, "PMC9" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 9), +/* pmc10 */ PMC_D(PFM_REG_W64, "PMC10", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 10), +/* pmc11 */ PMC_D(PFM_REG_W64, "PMC11", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 11), +/* pmc12 */ PMC_D(PFM_REG_W64, "PMC12", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 12), +/* pmc13 */ PMC_D(PFM_REG_W64, "PMC13", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 13), +/* pmc14 */ PMC_D(PFM_REG_W64, "PMC14", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 14), +/* pmc15 */ PMC_D(PFM_REG_W64, "PMC15", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 15), +/* pmc16 */ PMX_NA, +/* pmc17 */ PMX_NA, +/* pmc18 */ PMX_NA, +/* pmc19 */ PMX_NA, +/* pmc20 */ PMX_NA, +/* pmc21 */ PMX_NA, +/* pmc22 */ PMX_NA, +/* pmc23 */ PMX_NA, +/* pmc24 */ PMX_NA, +/* pmc25 */ PMX_NA, +/* pmc26 */ PMX_NA, +/* pmc27 */ PMX_NA, +/* pmc28 */ PMX_NA, +/* pmc29 */ PMX_NA, +/* pmc30 */ PMX_NA, +/* pmc31 */ PMX_NA, +/* pmc32 */ PMC_D(PFM_REG_W , "PMC32", 0x30f01ffffffffffUL, 0xfcf0fe0000000000UL, 0, 32), +/* pmc33 */ PMC_D(PFM_REG_W , "PMC33", 0x0, 0xfffffe0000000000UL, 0, 33), +/* pmc34 */ PMC_D(PFM_REG_W , "PMC34", 0xf01ffffffffffUL, 0xfff0fe0000000000UL, 0, 34), +/* pmc35 */ PMC_D(PFM_REG_W , "PMC35", 0x0, 0x1ffffffffffUL, 0, 35), +/* pmc36 */ PMC_D(PFM_REG_W , "PMC36", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 36), +/* pmc37 */ PMC_D(PFM_REG_W , "PMC37", 0x0, 0xffffffffffffc000UL, 0, 37), +/* pmc38 */ PMC_D(PFM_REG_W , "PMC38", 0xdb6UL, 0xffffffffffffdb6dUL, 0, 38), +/* pmc39 */ PMC_D(PFM_REG_W , "PMC39", 0x0, 0xffffffffffff0030UL, 0, 39), +/* pmc40 */ PMC_D(PFM_REG_W , "PMC40", 0x2000000UL, 0xfffffffffff0fe30UL, 0, 40), +/* pmc41 */ PMC_D(PFM_REG_W , "PMC41", 0x00002078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 41), +/* pmc42 */ PMC_D(PFM_REG_W , "PMC42", 0x0, 0xfff800b0UL, 0, 42), +/* pmc43 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, 
PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W, "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W, "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W, "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W, "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W, "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W, "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W, "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W, "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W, "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W, "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W, "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W, "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W, "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W, "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W, "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W, "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MONT_NUM_PMCS ARRAY_SIZE(pfm_mont_pmc_desc) + +static struct pfm_regmap_desc pfm_mont_pmd_desc[]={ +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_C, "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_C, "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_C, "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_C, "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_C, "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_C, "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_C, "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_C, "PMD15", 15), +/* pmd16 */ PMX_NA, +/* pmd17 */ PMX_NA, +/* pmd18 */ PMX_NA, +/* pmd19 */ PMX_NA, +/* pmd20 */ PMX_NA, +/* pmd21 */ PMX_NA, +/* pmd22 */ PMX_NA, +/* pmd23 */ PMX_NA, +/* pmd24 */ PMX_NA, +/* pmd25 */ PMX_NA, +/* pmd26 */ PMX_NA, +/* pmd27 */ PMX_NA, +/* pmd28 */ PMX_NA, +/* pmd29 */ PMX_NA, +/* pmd30 */ PMX_NA, +/* pmd31 */ PMX_NA, +/* pmd32 */ PMD_D(PFM_REG_I, "PMD32", 32), +/* pmd33 */ PMD_D(PFM_REG_I, "PMD33", 33), +/* pmd34 */ PMD_D(PFM_REG_I, "PMD34", 34), +/* pmd35 */ PMD_D(PFM_REG_I, "PMD35", 35), +/* pmd36 */ PMD_D(PFM_REG_I, "PMD36", 36), +/* pmd37 */ PMX_NA, +/* pmd38 */ PMD_D(PFM_REG_I, "PMD38", 38), +/* pmd39 */ PMD_D(PFM_REG_I, "PMD39", 39), +/* pmd40 */ PMX_NA, +/* pmd41 */ PMX_NA, +/* pmd42 */ PMX_NA, +/* pmd43 */ PMX_NA, +/* pmd44 */ PMX_NA, +/* pmd45 */ PMX_NA, +/* pmd46 */ PMX_NA, +/* pmd47 */ PMX_NA, +/* pmd48 */ 
PMD_D(PFM_REG_I, "PMD48", 48), +/* pmd49 */ PMD_D(PFM_REG_I, "PMD49", 49), +/* pmd50 */ PMD_D(PFM_REG_I, "PMD50", 50), +/* pmd51 */ PMD_D(PFM_REG_I, "PMD51", 51), +/* pmd52 */ PMD_D(PFM_REG_I, "PMD52", 52), +/* pmd53 */ PMD_D(PFM_REG_I, "PMD53", 53), +/* pmd54 */ PMD_D(PFM_REG_I, "PMD54", 54), +/* pmd55 */ PMD_D(PFM_REG_I, "PMD55", 55), +/* pmd56 */ PMD_D(PFM_REG_I, "PMD56", 56), +/* pmd57 */ PMD_D(PFM_REG_I, "PMD57", 57), +/* pmd58 */ PMD_D(PFM_REG_I, "PMD58", 58), +/* pmd59 */ PMD_D(PFM_REG_I, "PMD59", 59), +/* pmd60 */ PMD_D(PFM_REG_I, "PMD60", 60), +/* pmd61 */ PMD_D(PFM_REG_I, "PMD61", 61), +/* pmd62 */ PMD_D(PFM_REG_I, "PMD62", 62), +/* pmd63 */ PMD_D(PFM_REG_I, "PMD63", 63) +}; +#define PFM_MONT_NUM_PMDS ARRAY_SIZE(pfm_mont_pmd_desc) + +static int pfm_mont_has_ht; + +static int pfm_mont_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val32 = 0, val38 = 0, val41 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MONT_PMC_PM_POS6 (1UL<<6) +#define PFM_MONT_PMC_PM_POS4 (1UL<<4) + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: if ((req->reg_flags & PFM_REGFL_NO_EMUL64) == 0) { + if (pfm_mont_has_ht) { + PFM_INFO("perfmon: Errata 121 PMD10/PMD15 cannot be used to overflow" + "when threads on on"); + return -EINVAL; + } + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 39: + case 40: + case 42: if (pfm_mont_has_ht && ((req->reg_value >> 8) & 0x7) == 4) { + PFM_INFO("perfmon: Errata 120: IP-EAR not available when threads are on"); + return -EINVAL; + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + + case 32: val32 = tmpval; + val38 = set->pmcs[38]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + + case 37: + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS4; + else + tmpval &= ~PFM_MONT_PMC_PM_POS4; + break; + + case 38: val38 = tmpval; + val32 = set->pmcs[32]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + case 41: val41 = tmpval; + val32 = set->pmcs[32]; + val38 = set->pmcs[38]; + check_case1 = 1; + break; + } + + if (check_case1) { + ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) + && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) + || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); + if (ret) { + PFM_DBG("perfmon: invalid config pmc38=0x%lx " + "pmc41=0x%lx pmc32=0x%lx", + val38, val41, val32); + return -EINVAL; + } + } + + /* + * check if configuration implicitely activates the use of the + * debug registers. If true, then we ensure that this is possible + * and that we do not pick up stale value in the HW registers. + */ + + /* + * + * pmc41 is "active" if: + * one of the pmc41.cfgdtagXX field is different from 0x3 + * AND + * the corsesponding pmc41.en_dbrpXX is set. 
+ * AND + * ctx_fl_use_dbr (dbr not yet used) + */ + if (cnum == 41 + && (tmpval & 0x1e00000000000) + && (tmpval & 0x18181818) != 0x18181818 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc41=0x%lx active, clearing dbr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + /* + * we must clear the (instruction) debug registers if: + * pmc38.ig_ibrpX is 0 (enabled) + * and + * fl_use_dbr == 0 (dbr not yet used) + */ + if (cnum == 38 && ((tmpval & 0x492) != 0x492) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc38=0x%lx active, clearing ibr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + + } + req->reg_value = tmpval; + return 0; +} + +static void pfm_handle_errata(void) +{ + pfm_mont_has_ht = 1; + + PFM_INFO("activating workaround for errata 120 " + "(Disable IP-EAR when threads are on)"); + + PFM_INFO("activating workaround for errata 121 " + "(PMD10-PMD15 cannot be used to overflow" + " when threads are on)"); +} +static int pfm_mont_probe_pmu(void) +{ + if (local_cpu_data->family != 0x20) + return -1; + + /* + * the two errata must be activated when + * threads are/can be enabled + */ + if (is_multithreading_enabled()) + pfm_handle_errata(); + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_mont_pmu_conf={ + .pmu_name = "Montecito", + .counter_width = 47, + .pmd_desc = pfm_mont_pmd_desc, + .pmc_desc = pfm_mont_pmc_desc, + .num_pmc_entries = PFM_MONT_NUM_PMCS, + .num_pmd_entries = PFM_MONT_NUM_PMDS, + .pmc_write_check = pfm_mont_pmc_check, + .probe_pmu = pfm_mont_probe_pmu, + .version = "1.0", + .arch_info = &pfm_mont_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_mont_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mont_pmu_conf); +} + +static void __exit pfm_mont_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mont_pmu_conf); +} + +module_init(pfm_mont_pmu_init_module); +module_exit(pfm_mont_pmu_cleanup_module); Index: linux-2.6/arch/mips/Kconfig =================================================================== --- linux-2.6.orig/arch/mips/Kconfig +++ linux-2.6/arch/mips/Kconfig @@ -1775,6 +1775,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here.
+source "arch/mips/perfmon/Kconfig" + endmenu config RWSEM_GENERIC_SPINLOCK Index: linux-2.6/arch/mips/Makefile =================================================================== --- linux-2.6.orig/arch/mips/Makefile +++ linux-2.6/arch/mips/Makefile @@ -148,6 +148,12 @@ endif endif # +# Perfmon support +# + +core-$(CONFIG_PERFMON) += arch/mips/perfmon/ + +# # Firmware support # libs-$(CONFIG_ARC) += arch/mips/arc/ Index: linux-2.6/arch/mips/kernel/process.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/process.c +++ linux-2.6/arch/mips/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ void start_thread(struct pt_regs * regs, void exit_thread(void) { + pfm_exit_thread(current); } void flush_thread(void) @@ -164,6 +166,8 @@ int copy_thread(int nr, unsigned long cl if (clone_flags & CLONE_SETTLS) ti->tp_value = regs->regs[7]; + pfm_copy_thread(p); + return 0; } Index: linux-2.6/arch/mips/kernel/scall32-o32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall32-o32.S +++ linux-2.6/arch/mips/kernel/scall32-o32.S @@ -662,6 +662,18 @@ einval: li v0, -EINVAL sys sys_signalfd 3 sys sys_timerfd 4 sys sys_eventfd 1 + sys sys_pfm_create_context 4 /* 4320 */ + sys sys_pfm_write_pmcs 3 + sys sys_pfm_write_pmds 4 + sys sys_pfm_read_pmds 3 + sys sys_pfm_load_context 2 + sys sys_pfm_start 2 /* 4325 */ + sys sys_pfm_stop 1 + sys sys_pfm_restart 1 + sys sys_pfm_create_evtsets 3 + sys sys_pfm_getinfo_evtsets 3 + sys sys_pfm_delete_evtsets 3 /* 4326 */ + sys sys_pfm_unload_context 1 .endm /* We pre-compute the number of _instruction_ bytes needed to Index: linux-2.6/arch/mips/kernel/scall64-64.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-64.S +++ linux-2.6/arch/mips/kernel/scall64-64.S @@ -477,4 +477,16 @@ sys_call_table: PTR sys_signalfd PTR sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs /* 5280 */ + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop /* 5285 */ + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context /* 5290 */ .size sys_call_table,.-sys_call_table Index: linux-2.6/arch/mips/kernel/scall64-n32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-n32.S +++ linux-2.6/arch/mips/kernel/scall64-n32.S @@ -400,7 +400,19 @@ EXPORT(sysn32_call_table) PTR sys_ioprio_set PTR sys_ioprio_get PTR compat_sys_utimensat - PTR compat_sys_signalfd /* 5280 */ + PTR compat_sys_signalfd /* 6280 */ PTR compat_sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds /* 6285 */ + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop + PTR sys_pfm_restart /* 6290 */ + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context .size sysn32_call_table,.-sysn32_call_table Index: linux-2.6/arch/mips/kernel/scall64-o32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-o32.S +++ linux-2.6/arch/mips/kernel/scall64-o32.S @@ -525,4 +525,16 @@ sys_call_table: PTR compat_sys_signalfd PTR compat_sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context /* 
4320 */ + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start /* 4325 */ + PTR sys_pfm_stop + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets /* 4330 */ + PTR sys_pfm_unload_context .size sys_call_table,.-sys_call_table Index: linux-2.6/arch/mips/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/signal.c +++ linux-2.6/arch/mips/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -696,6 +697,9 @@ static void do_signal(struct pt_regs *re asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/mips/kernel/smp.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/smp.c +++ linux-2.6/arch/mips/kernel/smp.c @@ -203,6 +203,52 @@ void smp_call_function_interrupt(void) } } +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, int retry, + int wait) +{ + struct call_data_struct data; + int me = smp_processor_id(); + + /* + * Can die spectacularly if this CPU isn't yet marked online + */ + BUG_ON(!cpu_online(me)); + if (cpu == me) { + WARN_ON(1); + return -EBUSY; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock(&smp_call_lock); + call_data = &data; + mb(); + + /* Send a message to the other CPU */ + core_send_ipi(cpu, SMP_CALL_FUNCTION); + + /* Wait for response */ + /* FIXME: lock-up detection, backtrace on lock-up */ + while (atomic_read(&data.started) != 1) + barrier(); + + if (wait) + while (atomic_read(&data.finished) != 1) + barrier(); + call_data = NULL; + spin_unlock(&smp_call_lock); + + return 0; +} + static void stop_this_cpu(void *dummy) { /* Index: linux-2.6/arch/mips/kernel/time.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/time.c +++ linux-2.6/arch/mips/kernel/time.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -140,6 +141,7 @@ static long last_rtc_update; void local_timer_interrupt(int irq, void *dev_id) { profile_tick(CPU_PROFILING); + pfm_handle_switch_timeout(); update_process_times(user_mode(get_irq_regs())); } Index: linux-2.6/arch/mips/mips-boards/generic/time.c =================================================================== --- linux-2.6.orig/arch/mips/mips-boards/generic/time.c +++ linux-2.6/arch/mips/mips-boards/generic/time.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include Index: linux-2.6/arch/mips/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/Kconfig @@ -0,0 +1,23 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. 
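As a quick orientation for the interface this option exposes, the sketch below shows roughly how user space is expected to drive the new system calls wired into the MIPS tables above. It is illustrative only: the syscall numbers follow the o32 table (4320 onwards, with 4326 inferred by position) and the argument layout is an assumption based on the per-call argument counts; real applications would normally go through libpfm rather than raw syscall(2).

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

/* assumed o32 numbering, see the scall32-o32.S hunk above */
#define __NR_pfm_create_context 4320
#define __NR_pfm_start          4325
#define __NR_pfm_stop           4326

static int pfm_self_monitor(void *ctx_arg)
{
	long fd;

	/* create a monitoring context; the return value is a file descriptor */
	fd = syscall(__NR_pfm_create_context, ctx_arg, NULL, NULL, 0UL);
	if (fd < 0)
		return -1;

	/* pfm_write_pmcs()/pfm_write_pmds()/pfm_load_context() would go here */

	syscall(__NR_pfm_start, fd, NULL);	/* start counting */
	/* ... workload under measurement ... */
	syscall(__NR_pfm_stop, fd);		/* stop counting */
	return (int)fd;
}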
+ +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_MIPS64 + tristate "Support for MIPS64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for the MIPS64 hardware performance counters" +endmenu Index: linux-2.6/arch/mips/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_MIPS64) += perfmon_mips64.o Index: linux-2.6/arch/mips/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/perfmon.c @@ -0,0 +1,299 @@ +/* + * This file implements the MIPS64 specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 Philip J. Mucci + * + * based on versions for other architectures: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * collect pending overflowed PMDs. Called from pfm_ctxsw() + * and from PMU interrupt handler. Must fill in set->povfl_pmds[] + * and set->npend_ovfls. Interrupts are masked + */ +static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 new_val, wmask; + u64 *used_mask, *cnt_pmds; + u64 mask[PFM_PMD_BV]; + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_cnt_pmd; + cnt_pmds = pfm_pmu_conf->regs.cnt_pmds; + used_mask = set->used_pmds; + wmask = 1ULL << pfm_pmu_conf->counter_width; + bitmap_and(cast_ulp(mask), + cast_ulp(cnt_pmds), + cast_ulp(used_mask),max); + + for (i = 0; i < max; i++) { + /* assume all PMD are counters */ + if (test_bit(i, mask)) { + new_val = pfm_arch_read_pmd(ctx, i); + + PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", + i, (unsigned long long)new_val, + (new_val&wmask) ? 1 : 0); + + if (new_val & wmask) { + __set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_pmc; + + /* + * clear enable bits + */ + for (i = 0; i < max; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i,0); + } + + if (set->npend_ovfls) + return; + + __pfm_get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
+ * + * for per-thread: + * must stop monitoring for the task + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * disable lazy restore of PMC registers. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + pfm_stop_active(task, ctx, set); + + return 1; +} + +/* + * Called from pfm_stop() and pfm_ctxsw() when idle + * task and EXCL_IDLE is on. + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, set); +} + +/* + * called from pfm_start() or pfm_ctxsw() when idle task and + * EXCL_IDLE is on. + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-trhead: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + if (task != current) + return; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 ovfl_mask, val; + u64 *impl_pmds; + unsigned int i; + unsigned int max_pmd; + + max_pmd = pfm_pmu_conf->regs.max_pmd; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = pfm_pmu_conf->regs.pmds; + + /* + * must restore all pmds to avoid leaking + * information to user. + */ + for (i = 0; i < max_pmd; i++) { + + if (test_bit(i, impl_pmds) == 0) + continue; + + val = set->pmds[i].value; + + /* + * set upper bits for counter to ensure + * overflow will trigger + */ + val &= ovfl_mask; + + pfm_arch_write_pmd(ctx, i, val); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. 
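+ * ("if needed" because a masked or never-started context keeps its counters disabled, so the PMC values can be written later, when monitoring is actually (re)enabled; see the early return below.)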
+ */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *impl_pmcs; + unsigned int i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * - by default no PMCS measures anything + * - on ctxswout, all used PMCs are disabled (cccr enable bit cleared) + * hence when masked we do not need to restore anything + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(cpu_data->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return "perfmon_mips64"; + default: + return NULL; + } + return NULL; +} + +int perfmon_perf_irq(void) +{ + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + struct pt_regs *regs; + unsigned int counters = pfm_pmu_conf->regs.max_pmc; + unsigned int control; + unsigned int counter; + + regs = get_irq_regs(); + switch (counters) { +#define HANDLE_COUNTER(n) \ + case n + 1: \ + control = read_c0_perfctrl ## n(); \ + counter = read_c0_perfcntr ## n(); \ + if ((control & MIPS64_PMC_INT_ENABLE_MASK) && \ + (counter & MIPS64_PMD_INTERRUPT)) { \ + pfm_interrupt_handler(instruction_pointer(regs),\ + regs); \ + return(1); \ + } + HANDLE_COUNTER(3) + HANDLE_COUNTER(2) + HANDLE_COUNTER(1) + HANDLE_COUNTER(0) + } + + return 0; +} +EXPORT_SYMBOL(perfmon_perf_irq); Index: linux-2.6/arch/mips/perfmon/perfmon_mips64.c =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/perfmon_mips64.c @@ -0,0 +1,187 @@ +/* + * This file contains the MIPS64 and decendent PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005 Philip Mucci + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("MIPS64 PMU description tables"); +MODULE_LICENSE("GPL"); + +/* + * reserved: + * - bit 63-9 + * RSVD: reserved bits must be 1 + */ +#define PFM_MIPS64_PMC_RSVD 0xfffffffffffffe10ULL +#define PFM_MIPS64_PMC_VAL (1ULL<<4) + +extern int null_perf_irq(struct pt_regs *regs); +extern int (*perf_irq)(struct pt_regs *regs); +extern int perfmon_perf_irq(struct pt_regs *regs); + +static struct pfm_arch_pmu_info pfm_mips64_pmu_info; + +static struct pfm_regmap_desc pfm_mips64_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "CP0_25_0", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 0), +/* pmc1 */ PMC_D(PFM_REG_I64, "CP0_25_1", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 1), +/* pmc2 */ PMC_D(PFM_REG_I64, "CP0_25_2", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 2), +/* pmc3 */ PMC_D(PFM_REG_I64, "CP0_25_3", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 3) +}; +#define PFM_MIPS64_NUM_PMCS ARRAY_SIZE(pfm_mips64_pmc_desc) + +static struct pfm_regmap_desc pfm_mips64_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C, "CP0_25_0", 0), +/* pmd1 */ PMD_D(PFM_REG_C, "CP0_25_1", 1), +/* pmd2 */ PMD_D(PFM_REG_C, "CP0_25_2", 2), +/* pmd3 */ PMD_D(PFM_REG_C, "CP0_25_3", 3) +}; +#define PFM_MIPS64_NUM_PMDS ARRAY_SIZE(pfm_mips64_pmd_desc) + +static int pfm_mips64_probe_pmu(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + + switch (c->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return 0; + break; + default: + PFM_INFO("Unknown cputype 0x%x",c->cputype); + } + return -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
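+ * In addition, pfm_mips64_pmu_init_module() below marks every counter beyond n_counters() as PFM_REG_NA, so the effective register set can be smaller than the four entries described above.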
+ */ +static struct pfm_pmu_config pfm_mips64_pmu_conf = { + .pmu_name = "MIPS", /* placeholder */ + .counter_width = 31, + .pmd_desc = pfm_mips64_pmd_desc, + .pmc_desc = pfm_mips64_pmc_desc, + .num_pmc_entries = PFM_MIPS64_NUM_PMCS, + .num_pmd_entries = PFM_MIPS64_NUM_PMDS, + .probe_pmu = pfm_mips64_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_mips64_pmu_info +}; + +static inline int n_counters(void) +{ + if (!(read_c0_config1() & MIPS64_CONFIG_PMC_MASK)) + return 0; + if (!(read_c0_perfctrl0() & MIPS64_PMC_CTR_MASK)) + return 1; + if (!(read_c0_perfctrl1() & MIPS64_PMC_CTR_MASK)) + return 2; + if (!(read_c0_perfctrl2() & MIPS64_PMC_CTR_MASK)) + return 3; + return 4; +} + +static int __init pfm_mips64_pmu_init_module(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + int i, ret, num; + + switch (c->cputype) { + case CPU_5KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS5KC"; + break; + case CPU_20KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS20KC"; + break; + case CPU_24K: + pfm_mips64_pmu_conf.pmu_name = "MIPS24K"; + break; + case CPU_25KF: + pfm_mips64_pmu_conf.pmu_name = "MIPS25KF"; + break; + case CPU_SB1: + pfm_mips64_pmu_conf.pmu_name = "SB1"; + break; + case CPU_SB1A: + pfm_mips64_pmu_conf.pmu_name = "SB1A"; + break; +#ifndef CONFIG_SMP + case CPU_34K: + pfm_mips64_pmu_conf.pmu_name = "MIPS34K"; + break; +#if defined(CPU_74K) + case CPU_74K: + pfm_mips64_pmu_conf.pmu_name = "MIPS74K"; + break; +#endif +#endif + default: + PFM_INFO("Unknown cputype 0x%x",c->cputype); + return -1; + } + + num = n_counters(); + if (num == 0) { + PFM_INFO("cputype 0x%x has no counters",c->cputype); + return -1; + } + /* mark remaining counters unavailable */ + for(i=num; i < PFM_MIPS64_NUM_PMCS; i++) { + pfm_mips64_pmc_desc[i].type = PFM_REG_NA; + } + + for(i=num; i < PFM_MIPS64_NUM_PMDS; i++) { + pfm_mips64_pmd_desc[i].type = PFM_REG_NA; + } + + pfm_mips64_pmu_conf.num_pmc_entries = num; + pfm_mips64_pmu_conf.num_pmd_entries = num; + + pfm_mips64_pmu_info.pmu_style = c->cputype; + + ret = pfm_pmu_register(&pfm_mips64_pmu_conf); + if (ret == 0) + perf_irq = perfmon_perf_irq; + return ret; +} + +static void __exit pfm_mips64_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mips64_pmu_conf); + perf_irq = null_perf_irq; +} + +module_init(pfm_mips64_pmu_init_module); +module_exit(pfm_mips64_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/Kconfig =================================================================== --- linux-2.6.orig/arch/powerpc/Kconfig +++ linux-2.6/arch/powerpc/Kconfig @@ -140,6 +140,8 @@ config PPC_OF_PLATFORM_PCI depends on PPC64 # not supported on 32 bits yet default n +source "arch/powerpc/perfmon/Kconfig" + source "init/Kconfig" source "arch/powerpc/platforms/Kconfig" Index: linux-2.6/arch/powerpc/Makefile =================================================================== --- linux-2.6.orig/arch/powerpc/Makefile +++ linux-2.6/arch/powerpc/Makefile @@ -137,6 +137,7 @@ core-y += arch/powerpc/kernel/ \ arch/powerpc/platforms/ core-$(CONFIG_MATH_EMULATION) += arch/powerpc/math-emu/ core-$(CONFIG_XMON) += arch/powerpc/xmon/ +core-$(CONFIG_PERFMON) += arch/powerpc/perfmon/ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ Index: linux-2.6/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/entry_32.S +++ linux-2.6/arch/powerpc/kernel/entry_32.S @@ -38,7 +38,7 @@ * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. 
*/ #if MSR_KERNEL >= 0x10000 -#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l +#define LOAD_MSR_KERNEL(r, x) lis r,(x)@ha; ori r,r,(x)@l #else #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif Index: linux-2.6/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/entry_64.S +++ linux-2.6/arch/powerpc/kernel/entry_64.S @@ -588,6 +588,10 @@ user_work: b .ret_from_except_lite 1: bl .save_nvgprs +#ifdef CONFIG_PERFMON + addi r3,r1,STACK_FRAME_OVERHEAD + bl .pfm_handle_work +#endif /* CONFIG_PERFMON */ li r3,0 addi r4,r1,STACK_FRAME_OVERHEAD bl .do_signal Index: linux-2.6/arch/powerpc/kernel/process.c =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/process.c +++ linux-2.6/arch/powerpc/kernel/process.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -324,6 +325,9 @@ struct task_struct *__switch_to(struct t new_thread->start_tb = current_tb; } #endif + if (test_tsk_thread_flag(new, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev, new); local_irq_save(flags); @@ -457,6 +461,7 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { discard_lazy_cpu_state(); + pfm_exit_thread(current); } void flush_thread(void) @@ -570,6 +575,7 @@ int copy_thread(int nr, unsigned long cl #else kregs->nip = (unsigned long)ret_from_fork; #endif + pfm_copy_thread(p); return 0; } Index: linux-2.6/arch/powerpc/kernel/time.c =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/time.c +++ linux-2.6/arch/powerpc/kernel/time.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -651,6 +652,8 @@ void timer_interrupt(struct pt_regs * re profile_tick(CPU_PROFILING); calculate_steal_time(); + pfm_handle_switch_timeout(); + #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES)) get_lppaca()->int_dword.fields.decr_int = 0; Index: linux-2.6/arch/powerpc/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/Kconfig @@ -0,0 +1,50 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_POWER5 + tristate "Support for Power5 hardware performance counters" + depends on PERFMON && PPC64 + default n + help + Enables support for the Power 5 hardware performance counters + If unsure, say M. + +config PERFMON_PPC32 + tristate "Support for PPC32 hardware performance counters" + depends on PERFMON && PPC32 + default n + help + Enables support for the PPC32 hardware performance counters + If unsure, say M. + +config PERFMON_CELL + tristate "Support for Cell hardware performance counters" + depends on PERFMON && PPC_CELL + default n + help + Enables support for the Cell hardware performance counters. + If unsure, say M. + +config PERFMON_CELL_HW_SMPL + tristate "Support for Cell hardware counter sampling" + depends on PERFMON_CELL + default n + help + Enables support for the Cell hardware counter sampling modes using + the PMU trace-buffer. + If unsure, say M. 
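+# Note: the Cell modules program the debug bus through the ibm,cbe-perftools RTAS call; see arch/powerpc/perfmon/perfmon_cell.c further down in this patch.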
+ +endmenu Index: linux-2.6/arch/powerpc/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_POWER5) += perfmon_power5.o +obj-$(CONFIG_PERFMON_PPC32) += perfmon_ppc32.o +obj-$(CONFIG_PERFMON_CELL) += perfmon_cell.o +obj-$(CONFIG_PERFMON_CELL_HW_SMPL) += perfmon_cell_hw_smpl.o Index: linux-2.6/arch/powerpc/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon.c @@ -0,0 +1,282 @@ +/* + * This file implements the powerpc specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * based on versions for other architectures: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +static void pfm_stop_active(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + BUG_ON(!arch_info->disable_counters || !arch_info->get_ovfl_pmds); + + arch_info->disable_counters(ctx, set); + + if (set->npend_ovfls) + return; + + arch_info->get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * for per-thread: + * must stop monitoring for the task + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + /* + * disable lazy restore of PMC registers. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + pfm_stop_active(task, ctx, set); + + if (arch_info->ctxswout_thread) { + arch_info->ctxswout_thread(task, ctx, set); + } + + return 1; +} + +/* + * Called from pfm_ctxsw + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + if (ctx->state != PFM_CTX_MASKED && ctx->flags.started == 1) { + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, set); + } + + if (arch_info->ctxswin_thread) { + arch_info->ctxswin_thread(task, ctx, set); + } +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. 
Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, set); +} + +/* + * called from pfm_start() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU if task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. + */ +void pfm_arch_start(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + if (task != current) + return; + + BUG_ON(!arch_info->enable_counters); + + arch_info->enable_counters(ctx, set); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 *used_pmds; + unsigned int i, max_pmd; + + /* The model-specific module can override the default + * restore-PMD method. + */ + if (arch_info->restore_pmds) { + return arch_info->restore_pmds(set); + } + + max_pmd = pfm_pmu_conf->regs.max_pmd; + used_pmds = set->used_pmds; + + for (i = 0; i < max_pmd; i++) + if (test_bit(i, used_pmds) && + !(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) + pfm_arch_write_pmd(ctx, i, set->pmds[i].value); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + u64 *impl_pmcs; + unsigned int i, max_pmc; + + /* The model-specific module can override the default + * restore-PMC method. + */ + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->restore_pmcs) { + return arch_info->restore_pmcs(set); + } + + /* The "common" powerpc models enable the counters simply by writing + * all the control registers. Therefore, if we're masked or stopped we + * don't need to bother restoring the PMCs now.
+ */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + unsigned int pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + case 0x0008: /* 750/740 */ + case 0x7000: /* 750FX */ + case 0x7001: + case 0x7002: /* 750GX */ + case 0x000C: /* 7400 */ + case 0x800C: /* 7410 */ + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + return("perfmon_ppc32"); + case PV_POWER4: + case PV_POWER4p: + return "perfmon_power4"; + case PV_POWER5: + case PV_POWER5p: + return "perfmon_power5"; + case PV_970: + case PV_970FX: + case PV_970MP: + return "perfmon_ppc970"; + case PV_BE: + return "perfmon_cell"; + } + return NULL; +} + +void pfm_arch_init_percpu(void) +{ +#ifdef CONFIG_PPC64 + extern void ppc64_enable_pmcs(void); + ppc64_enable_pmcs(); +#endif +} + +/** + * powerpc_irq_handler + * + * Get the perfmon context that belongs to the current CPU, and call the + * model-specific interrupt handler. + **/ +void powerpc_irq_handler(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_context *ctx; + + if (arch_info->irq_handler) { + ctx = __get_cpu_var(pmu_ctx); + if (likely(ctx)) + arch_info->irq_handler(regs, ctx); + } +} Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_cell.c @@ -0,0 +1,610 @@ +/* + * This file contains the Cell PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright IBM Corporation 2007 + * + * Based on other Perfmon2 PMU modules. + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "../platforms/cell/cbe_regs.h" +#include + +MODULE_AUTHOR("Kevin Corry , " + "Carl Love "); +MODULE_DESCRIPTION("Cell PMU description table"); +MODULE_LICENSE("GPL"); + +/* + * Mapping from Perfmon logical control registers to Cell hardware registers. + */ +static struct pfm_regmap_desc pfm_cell_pmc_desc[] = { + /* Per-counter control registers. 
*/ + PMC_D(PFM_REG_I, "pm0_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_control", 0, 0, 0, 0), + + /* Per-counter RTAS arguments. Each of these registers has three fields. + * bits 63-48: debug-bus word + * bits 47-32: sub-unit + * bits 31-0 : full signal number + * (MSB = 63, LSB = 0) + */ + PMC_D(PFM_REG_I, "pm0_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_event", 0, 0, 0, 0), + + /* Global control registers. Same order as enum pm_reg_name. */ + PMC_D(PFM_REG_I, "group_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "debug_bus_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "trace_address", 0, 0, 0, 0), /* KMC: Not sure if user-space needs access to this one. */ + PMC_D(PFM_REG_I, "ext_trace_timer", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_status", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_interval", 0, 0, 0, 0), /* KMC: Does user-space also need read access to this one? */ + PMC_D(PFM_REG_I, "pm_start_stop", 0, 0, 0, 0), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_cell_pmc_desc) + +#define CELL_PMC_PM_STATUS 20 +/* + * Mapping from Perfmon logical data counters to Cell hardware counters. + */ +static struct pfm_regmap_desc pfm_cell_pmd_desc[] = { + PMD_D(PFM_REG_C, "pm0", 0), + PMD_D(PFM_REG_C, "pm1", 0), + PMD_D(PFM_REG_C, "pm2", 0), + PMD_D(PFM_REG_C, "pm3", 0), + PMD_D(PFM_REG_C, "pm4", 0), + PMD_D(PFM_REG_C, "pm5", 0), + PMD_D(PFM_REG_C, "pm6", 0), + PMD_D(PFM_REG_C, "pm7", 0), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_cell_pmd_desc) + +/* The firmware only sees physical CPUs, so divide by 2 if SMT is on. */ +#ifdef CONFIG_SCHED_SMT +#define RTAS_CPU(cpu) ((cpu) / 2) +#else +#define RTAS_CPU(cpu) (cpu) +#endif +#define RTAS_BUS_WORD(x) (u16)(((x) >> 48) & 0x0000ffff) +#define RTAS_SUB_UNIT(x) (u16)(((x) >> 32) & 0x0000ffff) +#define RTAS_SIGNAL_NUMBER(x) (s32)( (x) & 0xffffffff) + +#define subfunc_RESET 1 +#define subfunc_ACTIVATE 2 + +#define passthru_ENABLE 1 +#define passthru_DISABLE 2 + +/** + * struct cell_rtas_arg + * + * @cpu: Processor to modify. Linux numbers CPUs based on SMT IDs, but the + * firmware only sees the physical CPUs. So this value should be the + * SMT ID (from smp_processor_id() or get_cpu()) divided by 2. + * @sub_unit: Hardware subunit this applies to (if applicable). + * @signal_group: Signal group to enable/disable on the trace bus. + * @bus_word: For signal groups that propagate via the trace bus, this trace + * bus word will be used. This is a mask of (1 << TraceBusWord). + * For other signal groups, this specifies the trigger or event bus. + * @bit: Trigger/Event bit, if applicable for the signal group. + * + * An array of these structures are passed to rtas_call() to set up the + * signals on the debug bus. + **/ +struct cell_rtas_arg { + u16 cpu; + u16 sub_unit; + s16 signal_group; + u8 bus_word; + u8 bit; +}; + +/** + * rtas_reset_signals + * + * Set up the RTAS arguments for a RESET command. The buffer will be only + * the first entry in the rtas_args[cpu].signal[] array. 
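Each pm0-7_event control value packs the debug-bus word, sub-unit and full signal number into one 64-bit word, and the signal number itself encodes group*100 + bit, which is exactly what the RTAS_* macros and write_pm07_event() unpack. A small stand-alone sketch of that decoding (the field layout follows the comments above; the struct mirrors cell_rtas_arg only for illustration and is not the kernel type):

#include <stdint.h>
#include <stdio.h>

struct rtas_signal {                 /* illustrative mirror of cell_rtas_arg */
    uint16_t bus_word;               /* mask: 1 << trace-bus word */
    uint16_t sub_unit;
    int16_t  signal_group;
    uint8_t  bit;
};

static struct rtas_signal decode_pm_event(uint64_t v)
{
    struct rtas_signal s;
    uint16_t word   = (uint16_t)((v >> 48) & 0xffff); /* bits 63-48 */
    int32_t  signum = (int32_t)(v & 0xffffffff);      /* bits 31-0  */

    s.bus_word     = (uint16_t)(1u << word);
    s.sub_unit     = (uint16_t)((v >> 32) & 0xffff);  /* bits 47-32 */
    s.signal_group = (int16_t)(signum / 100);
    s.bit          = (uint8_t)(signum % 100);
    return s;
}

int main(void)
{
    /* bus word 1, sub-unit 0, signal number 2119 -> group 21, bit 19 */
    struct rtas_signal s = decode_pm_event((1ULL << 48) | 2119);

    printf("word-mask=%#x group=%d bit=%u\n", s.bus_word, s.signal_group, s.bit);
    return 0;
}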
+ **/ +static int rtas_reset_signals(u32 cpu) +{ + struct cell_rtas_arg signal; + u64 real_addr = virt_to_phys(&signal); + int rc; + + memset(&signal, 0, sizeof(signal)); + signal.cpu = RTAS_CPU(cpu); + rc = rtas_call(rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_RESET, + passthru_DISABLE, + real_addr >> 32, + real_addr & 0xffffffff, + sizeof(signal)); + + return rc; +} + +/** + * rtas_activate_signals + * + * Set up the RTAS arguments for an ACTIVATE command. The buffer will be the + * number of entries in the rtas_args[cpu].signal[] array that were filled + * in by attach_signal_to_counter(). + **/ +static int rtas_activate_signals(struct cell_rtas_arg *signals, + int num_signals) +{ + u64 real_addr = virt_to_phys(signals); + int rc; + + rc = rtas_call(rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_ACTIVATE, + passthru_ENABLE, + real_addr >> 32, + real_addr & 0xffffffff, + num_signals * sizeof(*signals)); + + return rc; +} + +/** + * write_pm07_event + * + * Pull out the RTAS arguments from the 64-bit register value and make the + * RTAS activate-signals call. + **/ +static void write_pm07_event(int cpu, unsigned int ctr, u64 value) +{ + struct cell_rtas_arg signal; + int rc; + + signal.cpu = RTAS_CPU(cpu); + signal.bus_word = 1 << RTAS_BUS_WORD(value); + signal.sub_unit = RTAS_SUB_UNIT(value); + signal.signal_group = RTAS_SIGNAL_NUMBER(value) / 100; + signal.bit = RTAS_SIGNAL_NUMBER(value) % 100; + + rc = rtas_activate_signals(&signal, 1); + if (rc) { + PFM_WARN("%s(%d, %u, %lu): Error calling " + "rtas_activate_signal(): %d\n", __FUNCTION__, + cpu, ctr, (unsigned long)value, rc); + /* FIX: Could we change this routine to return an error? */ + } +} + +/** + * pfm_cell_probe_pmu + * + * Simply check the processor version register to see if we're currently + * on a Cell system. + **/ +static int pfm_cell_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) != PV_BE) + return -1; + + return 0; +} + +/** + * pfm_cell_write_pmc + **/ +static void pfm_cell_write_pmc(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + cbe_write_pm07_control(cpu, cnum, value); + + } else if (cnum < NR_CTRS * 2) { + write_pm07_event(cpu, cnum - NR_CTRS, value); + + } else if (cnum == CELL_PMC_PM_STATUS) { + /* The pm_status register must be treated separately from + * the other "global" PMCs. This call will ensure that + * the interrupts are routed to the correct CPU, as well + * as writing the desired value to the pm_status register. + */ + cbe_enable_pm_interrupts(cpu, cbe_get_hw_thread_id(cpu), value); + + } else if (cnum < PFM_PM_NUM_PMCS) { + cbe_write_pm(cpu, cnum - (NR_CTRS * 2), value); + } +} + +/** + * pfm_cell_write_pmd + **/ +static void pfm_cell_write_pmd(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + cbe_write_ctr(cpu, cnum, value); + } +} + +/** + * pfm_cell_read_pmd + **/ +static u64 pfm_cell_read_pmd(unsigned int cnum) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + return cbe_read_ctr(cpu, cnum); + } + + return -EINVAL; +} + +/** + * pfm_cell_enable_counters + * + * Just need to turn on the global disable bit in pm_control. + **/ +static void pfm_cell_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + cbe_enable_pm(smp_processor_id()); +} + +/** + * pfm_cell_disable_counters + * + * Just need to turn off the global disable bit in pm_control. 
+ * + * Also, if we're using the hardware-sampling module, we need to empty the + * trace-buffer, since it cannot be restored to its current state when this + * event-set is enabled again. + **/ +static void pfm_cell_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_smpl_fmt *smpl_fmt = ctx->smpl_fmt; + struct pt_regs *regs; + + cbe_disable_pm(smp_processor_id()); + + if (smpl_fmt && !strcmp(smpl_fmt->fmt_name, PFM_CELL_HW_SMPL_NAME)) { + ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD; + ctx->ovfl_arg.active_set = ctx->active_set->id; + regs = current->thread.regs; + smpl_fmt->fmt_handler(ctx->smpl_addr, &ctx->ovfl_arg, + instruction_pointer(regs), 0, regs); + } +} + +/** + * pfm_cell_restore_pmcs + * + * Write all control register values that are saved in the specified event + * set. We could use the pfm_arch_write_pmc() function to restore each PMC + * individually (as is done in other architectures), but that results in + * multiple RTAS calls. As an optimization, we will setup the RTAS argument + * array so we can do all event-control registers in one RTAS call. + **/ +void pfm_cell_restore_pmcs(struct pfm_event_set *set) +{ + struct cell_rtas_arg signals[NR_CTRS]; + u64 value, *used_pmcs = set->used_pmcs; + int i, rc, num_used = 0, cpu = smp_processor_id(); + + memset(signals, 0, sizeof(signals)); + + for (i = 0; i < NR_CTRS; i++) { + /* Write the per-counter control register. If the PMC is not + * in use, then it will simply clear the register, which will + * disable the associated counter. + */ + cbe_write_pm07_control(cpu, i, set->pmcs[i]); + + if (test_bit(i + NR_CTRS, used_pmcs)) { + /* Set up the next RTAS array entry for this counter. + * Only include pm07_event registers that are in use + * by this set so the RTAS call doesn't have to + * process blank array entries. + */ + value = set->pmcs[i + NR_CTRS]; + signals[num_used].cpu = RTAS_CPU(cpu); + signals[num_used].sub_unit = RTAS_SUB_UNIT(value); + signals[num_used].bus_word = 1 << RTAS_BUS_WORD(value); + signals[num_used].bit = RTAS_SIGNAL_NUMBER(value) % 100; + signals[num_used].signal_group = + RTAS_SIGNAL_NUMBER(value) / 100; + num_used++; + } + } + + rc = rtas_activate_signals(signals, num_used); + if (rc) { + PFM_WARN("Error calling rtas_activate_signal(): %d\n", rc); + /* FIX: We will also need this routine to be able to return + * an error if Stephane agrees to change pfm_arch_write_pmc + * to return an error. + */ + } + + /* Write all the global PMCs. Need to call pfm_cell_write_pmc() + * instead of cbe_write_pm() due to special handling for the + * pm_status register. + */ + for (i *= 2; i < PFM_PM_NUM_PMCS; i++) + pfm_cell_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_cell_unload_context + * + * For system-wide contexts and self-monitored contexts, make the RTAS call + * to reset the debug-bus signals. + * + * For non-self-monitored contexts, the monitored thread will already have + * been taken off the CPU and we don't need to do anything additional. + **/ +static int pfm_cell_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + if (task == current || ctx->flags.system) { + rtas_reset_signals(smp_processor_id()); + } + return 0; +} + +/** + * pfm_cell_ctxswout_thread + * + * When a monitored thread is switched out (self-monitored or externally + * monitored) we need to reset the debug-bus signals so the next context that + * gets switched in can start from a clean set of signals. 
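The write and restore paths above rely on a fixed PMC numbering: the eight per-counter control registers come first, then the eight pm0-7_event selectors, then the global registers, which is also why pfm_cell_restore_pmcs() can jump to the globals by doubling i once the first loop has left it equal to NR_CTRS. A small sketch of that classification, assuming NR_CTRS is 8 as in the descriptor tables:

#include <stdio.h>

#define NR_CTRS 8   /* assumption: eight counters, matching the tables above */

static const char *pmc_kind(int cnum)
{
    if (cnum < NR_CTRS)
        return "pm0-7_control (written directly)";
    if (cnum < 2 * NR_CTRS)
        return "pm0-7_event (batched into a single RTAS call)";
    return "global register (pm_status at index 20 needs special routing)";
}

int main(void)
{
    int idx[] = { 0, 7, 8, 15, 16, 20 };

    for (unsigned int i = 0; i < sizeof(idx) / sizeof(idx[0]); i++)
        printf("PMC%-2d -> %s\n", idx[i], pmc_kind(idx[i]));
    return 0;
}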
+ **/ +int pfm_cell_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + rtas_reset_signals(smp_processor_id()); + return 0; +} + +/** + * pfm_cell_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. On Cell, the pm_status + * register contains a bit for each counter to indicate overflow. However, + * those 8 bits are in the reverse order than what Perfmon2 is expecting, + * so we need to reverse the order of the overflow bits. + **/ +static void pfm_cell_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + u32 pm_status, ovfl_ctrs; + u64 povfl_pmds = 0; + int i; + + if (!ctx_arch->last_read_updated) + /* This routine was not called via the interrupt handler. + * Need to start by getting interrupts and updating + * last_read_pm_status. + */ + ctx_arch->last_read_pm_status = + cbe_get_and_clear_pm_interrupts(smp_processor_id()); + + /* Reset the flag that the interrupt handler last read pm_status. */ + ctx_arch->last_read_updated = 0; + + pm_status = ctx_arch->last_read_pm_status & + set->pmcs[CELL_PMC_PM_STATUS]; + ovfl_ctrs = CBE_PM_OVERFLOW_CTRS(pm_status); + + /* Reverse the order of the bits in ovfl_ctrs + * and store the result in povfl_pmds. + */ + for (i = 0; i < PFM_PM_NUM_PMDS; i++) { + povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1); + ovfl_ctrs >>= 1; + } + + /* Mask povfl_pmds with set->used_pmds to get set->povfl_pmds. + * Count the bits set in set->povfl_pmds to get set->npend_ovfls. + */ + bitmap_and(set->povfl_pmds, &povfl_pmds, + set->used_pmds, PFM_PM_NUM_PMDS); + set->npend_ovfls = bitmap_weight(set->povfl_pmds, PFM_PM_NUM_PMDS); +} + +/** + * handle_trace_buffer_interrupts + * + * This routine is for processing just the interval timer and trace buffer + * overflow interrupts. Performance counter interrupts are handled by the + * perf_irq_handler() routine, which reads and saves the pm_status register. + * This routine should not read the actual pm_status register, but rather + * the value passed in. + **/ +static void handle_trace_buffer_interrupts(unsigned long iip, + struct pt_regs *regs, + struct pfm_context *ctx, + u32 pm_status) +{ + struct pfm_smpl_fmt *smpl_fmt; + + if (pm_status & CBE_PM_TRACE_BUFFER_FULL_INTR) { + /* The trace-buffer is full. Get the + * sampling-buffer address and call the handler. + */ + smpl_fmt = ctx->smpl_fmt; + + if (smpl_fmt && + !strcmp(smpl_fmt->fmt_name, PFM_CELL_HW_SMPL_NAME)) { + ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD; + ctx->ovfl_arg.active_set = ctx->active_set->id; + smpl_fmt->fmt_handler(ctx->smpl_addr, &ctx->ovfl_arg, + iip, 0, regs); + } + } + + /* Currently the trace buffer underflow and interval timer + * interrupts are ignored. + */ + + return; +} + +/** + * pfm_cell_irq_handler + * + * Handler for all Cell performance-monitor interrupts. + **/ +static void pfm_cell_irq_handler(struct pt_regs *regs, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + u32 last_read_pm_status; + int cpu = smp_processor_id(); + + /* Need to disable and reenable the performance counters to get the + * desired behavior from the hardware. This is specific to the Cell + * PMU hardware. + */ + cbe_disable_pm(cpu); + + /* Read the pm_status register to get the interrupt bits. 
If a
+ * performance counter overflow interrupt occurred, call the core
+ * perfmon interrupt handler to service the counter overflow. If the
+ * interrupt was for the interval timer or the trace_buffer,
+ * call the interval timer and trace buffer interrupt handler.
+ *
+ * The value read from the pm_status register is stored in the
+ * pfm_arch_context structure for use by other routines. Note that
+ * reading the pm_status register resets the interrupt flags to zero.
+ * Hence, it is important that the register is only read in one place.
+ *
+ * The pm_status interrupt register format is:
+ * [pmd0:pmd1:pmd2:pmd3:pmd4:pmd5:pmd6:pmd7:intt:tbf:tbu:]
+ * - pmd0 to pmd7 are the performance counter overflow interrupts.
+ * - intt is the interval timer overflow interrupt.
+ * - tbf is the trace buffer full interrupt.
+ * - tbu is the trace buffer underflow interrupt.
+ * - The pmd0 bit is the MSB of the 32 bit register.
+ */
+ ctx_arch->last_read_pm_status = last_read_pm_status =
+ cbe_get_and_clear_pm_interrupts(cpu);
+
+ /* Set flag for pfm_cell_get_ovfl_pmds() routine so it knows
+ * last_read_pm_status was updated by the interrupt handler.
+ */
+ ctx_arch->last_read_updated = 1;
+
+ if (last_read_pm_status & CBE_PM_ALL_OVERFLOW_INTR)
+ /* At least one counter overflowed. */
+ pfm_interrupt_handler(instruction_pointer(regs), regs);
+
+ if (last_read_pm_status & (CBE_PM_INTERVAL_INTR |
+ CBE_PM_TRACE_BUFFER_FULL_INTR |
+ CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR))
+ /* Trace buffer or interval timer overflow. */
+ handle_trace_buffer_interrupts(instruction_pointer(regs),
+ regs, ctx, last_read_pm_status);
+
+ /* If the hardware-sampling module masked monitoring for this context,
+ * don't re-enable the PMU.
+ */
+ if (ctx->state & PFM_CTX_MASKED) {
+ return;
+ }
+
+ /* The interrupt setting is the value written to the pm_status
+ * register. It is saved in the context when the register is
+ * written.
+ */
+ cbe_enable_pm_interrupts(cpu, cbe_get_hw_thread_id(cpu),
+ ctx->active_set->pmcs[CELL_PMC_PM_STATUS]);
+
+ /* Writes to the various performance counter registers only go to a
+ * latch. The new values (interrupt setting bits, reset counter value,
+ * etc.) are not copied to the actual registers until the performance
+ * monitor is enabled. In order to get this to work as desired, the
+ * performance monitor needs to be disabled while writing to the
+ * latches. This is a HW design issue.
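As the layout above describes, the per-counter overflow flags occupy the top eight bits of pm_status with pmd0 at the MSB, which is the reverse of the bit order Perfmon2 expects in set->povfl_pmds; pfm_cell_get_ovfl_pmds() therefore reverses them. A user-space sketch of that extraction and reversal (the shift amount is derived from the comment above, not from cbe_regs.h):

#include <stdint.h>
#include <stdio.h>

#define NUM_CTRS 8

/* Illustrative: overflow flags are the top 8 bits, pmd0 at bit 31. */
static uint32_t overflow_ctrs(uint32_t pm_status)
{
    return pm_status >> (32 - NUM_CTRS);
}

/* Reverse the bit order so bit 0 of the result corresponds to pmd0. */
static uint64_t reverse_ovfl_bits(uint32_t ovfl_ctrs)
{
    uint64_t povfl = 0;

    for (int i = 0; i < NUM_CTRS; i++) {
        povfl = (povfl << 1) | (ovfl_ctrs & 1);
        ovfl_ctrs >>= 1;
    }
    return povfl;
}

int main(void)
{
    uint32_t pm_status = 0x80000000;            /* only pmd0 overflowed */
    uint64_t povfl = reverse_ovfl_bits(overflow_ctrs(pm_status));

    printf("povfl_pmds = %#llx\n", (unsigned long long)povfl);  /* 0x1 */
    return 0;
}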
+ */ + cbe_enable_pm(cpu); +} + +static struct pfm_arch_pmu_info pfm_cell_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_CELL, + .write_pmc = pfm_cell_write_pmc, + .write_pmd = pfm_cell_write_pmd, + .read_pmd = pfm_cell_read_pmd, + .enable_counters = pfm_cell_enable_counters, + .disable_counters = pfm_cell_disable_counters, + .irq_handler = pfm_cell_irq_handler, + .get_ovfl_pmds = pfm_cell_get_ovfl_pmds, + .restore_pmcs = pfm_cell_restore_pmcs, + .ctxswout_thread = pfm_cell_ctxswout_thread, + .unload_context = pfm_cell_unload_context, +}; + +static struct pfm_pmu_config pfm_cell_pmu_conf = { + .pmu_name = "Cell", + .version = "0.1", + .counter_width = 32, + .pmd_desc = pfm_cell_pmd_desc, + .pmc_desc = pfm_cell_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_cell_probe_pmu, + .arch_info = &pfm_cell_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_cell_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_cell_pmu_conf); +} + +static void __exit pfm_cell_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_cell_pmu_conf); +} + +module_init(pfm_cell_pmu_init_module); +module_exit(pfm_cell_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c @@ -0,0 +1,298 @@ +/* + * Copyright IBM Corp 2007 + * + * Contributed by Carl Love + * and Kevin Corry + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * + * This file implements the IBM Cell PMU hardware-sampling module. + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Carl Love , " + "Kevin Corry "); +MODULE_DESCRIPTION("Perfmon2 CELL hardware sampling format"); +MODULE_LICENSE("GPL"); + +/** + * pfm_cell_hw_smpl_validate + * + * Validate the arguments passed from user-space for creating the + * sampling-buffer. The buffer must be large enough to hold the + * sampling-buffer header and at least one copy of the trace-buffer. + **/ +static int pfm_cell_hw_smpl_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_cell_hw_smpl_arg *arg = data; + + if (!arg) { + PFM_ERR("No argument passed."); + return -EINVAL; + } + + if (arg->buf_size < PFM_CELL_HW_SMPL_MIN_BUF_SIZE) { + PFM_ERR("Specified buffer size (%lu) too small. " + "Min size is %lu bytes.", + arg->buf_size, PFM_CELL_HW_SMPL_MIN_BUF_SIZE); + return -EINVAL; + } + + return 0; +} + +/** + * pfm_cell_hw_smpl_get_size + * + * Tell the Perfmon2 core how large a buffer we need to have allocated, and + * it will do the allocation for us. The size of the buffer has already been + * validated. 
+ **/ +static int pfm_cell_hw_smpl_get_size(unsigned int flags, + void *data, size_t *size) +{ + struct pfm_cell_hw_smpl_arg *arg = data; + *size = arg->buf_size; + return 0; +} + +/** + * pfm_cell_hw_smpl_init + * + * Initialize the start of the sampling-buffer with a header structure. + * The buffer has already been allocated by the Perfmon2 core. + **/ +static int pfm_cell_hw_smpl_init(struct pfm_context *ctx, void *buf, + u32 flags, u16 npmds, void *data) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + struct pfm_cell_hw_smpl_arg *arg = data; + + hdr->count = 0; + hdr->cur_offset = sizeof(*hdr); + hdr->overflows = 0; + hdr->buf_size = arg->buf_size; + hdr->version = PFM_CELL_HW_SMPL_VERSION; + hdr->buf_flags = arg->buf_flags; + + return 0; +} + +/** + * pfm_cell_hw_smpl_notify_user + * + * Add a "buffer full" message to the context and wake up any user-space + * process that is polling on the context's file descriptor. That process + * can then read() from the file-descriptor to get a copy of the message. + **/ +static int pfm_cell_hw_smpl_notify_user(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + if (ctx->flags.no_msg) { + return 0; + } + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + /* The message queue is full. The user must have called + * pfm_restart(), but didn't extract any messages. + */ + PFM_ERR("No notification messages available."); + return -EBUSY; + } + + msg->type = PFM_MSG_CELL_HW_SMPL_BUF_FULL; + + return pfm_notify_user(ctx); +} + +/** + * pfm_cell_hw_smpl_handler + * + * Create a new entry-header in the sampling-buffer and copy the current + * contents of the trace-buffer into the sampling-buffer. + **/ +static int pfm_cell_hw_smpl_handler(void *buf, + struct pfm_ovfl_arg *arg, + unsigned long ip, + u64 tstamp, + void *data) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + struct pfm_cell_hw_smpl_entry_hdr *ent; + struct pfm_context *ctx; + void *cur, *end; + u64 *trace_buffer_lines; + u32 trace_addr; + + /* If this handler was called due to an actual PMD overflowing, do + * nothing. Only store the contents of the trace-buffer if the trace- + * buffer overflowed. + */ + if (arg->ovfl_pmd != PFM_CELL_HW_SMPL_OVFL_PMD) + return 0; + + cur = buf + hdr->cur_offset; + end = buf + hdr->buf_size; + ctx = __get_cpu_var(pmu_ctx); + + /* Check if the sampling-buffer is full. This should never happen, + * since we check if the buffer is full after adding the new entry. + */ + if ((end - cur) < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) { + PFM_ERR("Cell HW Sampling: Buffer is full " + "before adding new entry."); + goto full; + } + + ent = cur; + + /* current = task running at the time of the overflow. + * + * per-task mode: + * - This is ususally the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - This is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->tgid = current->tgid; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->num_samples = 0; + ent->entry_num = hdr->count; + + /* Read at most 1024 lines from the trace-buffer. Note, lines could be + * added to the trace-buffer while it is being read. However, we only + * made sure we had space for up to 1024 lines. 
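Each sampling-buffer entry written by the handler above is an entry header followed by up to 1024 trace-buffer lines of 128 bits each, and the free-space checks against PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE guard against starting an entry that could not hold that worst case. A rough sketch of the size arithmetic under those assumptions (the header layout and the 1024-line limit are taken from the surrounding comments, not from the real headers):

#include <stddef.h>
#include <stdio.h>

#define TRACE_LINE_BYTES     16    /* one 128-bit trace-buffer line       */
#define TRACE_BUF_MAX_COUNT  1024  /* assumed CBE_PM_TRACE_BUF_MAX_COUNT  */

/* Illustrative stand-in for the 128-bit-padded entry header. */
struct smpl_entry_hdr {
    unsigned int pid, tgid, cpu, set;
    unsigned long long entry_num;
    unsigned int num_samples;
} __attribute__((aligned(16)));

int main(void)
{
    size_t max_entry = sizeof(struct smpl_entry_hdr) +
                       TRACE_BUF_MAX_COUNT * TRACE_LINE_BYTES;

    /* The handler refuses to start an entry unless this much space is free,
     * so a partially written entry can never run past the buffer end. */
    printf("worst-case entry size: %zu bytes\n", max_entry);
    return 0;
}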
+ */ + + trace_buffer_lines = (u64 *)(ent + 1); + trace_addr = cbe_read_pm(ent->cpu, trace_address); + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY) && + ent->num_samples < CBE_PM_TRACE_BUF_MAX_COUNT) { + cbe_read_trace_buffer(ent->cpu, trace_buffer_lines); + trace_buffer_lines += 2; + ent->num_samples++; + trace_addr = cbe_read_pm(ent->cpu, trace_address); + } + + /* Update the sampling-buffer header for the next entry. Since the + * hw_smpl_hdr and hw_smpl_entry_hdr structures are both padded to + * 128-bits, and each trace-buffer line is 128-bits, we know that + * every buffer entry will start on a 128-bit boundary. + */ + if (ent->num_samples) { + cur = trace_buffer_lines; + hdr->cur_offset = cur - (void *)hdr; + hdr->count++; + } + + /* Check the available size in the buffer again so we won't lose the + * next sample entry. + */ + if ((end - cur) < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) + goto full; + + return 0; + +full: + PFM_DBG_ovfl("Sampling-buffer full. free bytes=%lu, count=%lu", + end-cur, hdr->count); + + /* Increment the number of sampling-buffer overflows. This + * is important for detecting duplicate sets of samples. + */ + hdr->overflows++; + + /* Add a message to the context's message queue and wake up any + * user-space program's that are polling on the context's file + * descriptor. + */ + pfm_cell_hw_smpl_notify_user(ctx); + + /* Mask monitoring until a pfm_restart() occurs. */ + pfm_mask_monitoring(ctx, ctx->active_set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + + return -ENOBUFS; +} + +/** + * pfm_cell_hw_smpl_restart + * + * Reinitialize the sampling-buffer header, effectively deleting all entries + * previously stored in the sampling-buffer. + * + * FIX: What is the "is_active" argument for? It's not used by any of the + * other sampling modules. + **/ +static int pfm_cell_hw_smpl_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + + hdr->count = 0; + hdr->cur_offset = sizeof(*hdr); + hdr->overflows = 0; + + return 0; +} + +/** + * pfm_cell_hw_smpl_exit + **/ +static int pfm_cell_hw_smpl_exit(void *buf) +{ + return 0; +} + +/** + * cell_hw_smpl_fmt + * + * Structure to describe the Cell hardware-sampling module to the Perfmon2 core. + **/ +static struct pfm_smpl_fmt cell_hw_smpl_fmt = { + .fmt_name = PFM_CELL_HW_SMPL_NAME, + .fmt_arg_size = sizeof(struct pfm_cell_hw_smpl_arg), + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .fmt_version = PFM_CELL_HW_SMPL_VERSION, + .fmt_validate = pfm_cell_hw_smpl_validate, + .fmt_getsize = pfm_cell_hw_smpl_get_size, + .fmt_init = pfm_cell_hw_smpl_init, + .fmt_handler = pfm_cell_hw_smpl_handler, + .fmt_restart = pfm_cell_hw_smpl_restart, + .fmt_exit = pfm_cell_hw_smpl_exit, + .owner = THIS_MODULE, +}; + +static int __init pfm_cell_hw_smpl_init_module(void) +{ + return pfm_fmt_register(&cell_hw_smpl_fmt); +} + +static void __exit pfm_cell_hw_smpl_exit_module(void) +{ + pfm_fmt_unregister(&cell_hw_smpl_fmt); +} + +module_init(pfm_cell_hw_smpl_init_module); +module_exit(pfm_cell_hw_smpl_exit_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_power5.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_power5.c @@ -0,0 +1,292 @@ +/* + * This file contains the POWER5 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("David Gibson "); +MODULE_DESCRIPTION("POWER5 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_regmap_desc pfm_power5_pmc_desc[]={ +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), +/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power5_pmc_desc) + +/* The TB and PURR registers are read-only. Also, note that the TB register + * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. + * For Perfmon2's purposes, we'll treat it as a single 64-bit register. + */ +static struct pfm_regmap_desc pfm_power5_pmd_desc[]={ +/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power5_pmd_desc) + +static int pfm_power5_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) != PV_POWER5) + return -1; + + return 0; +} + +static void pfm_power5_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCRA: + mtspr(SPRN_MMCRA, value); + break; + default: + BUG(); + } +} + +static void pfm_power5_write_pmd(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value); + break; + case SPRN_PMC5: + mtspr(SPRN_PMC5, value); + break; + case SPRN_PMC6: + mtspr(SPRN_PMC6, value); + break; + case SPRN_PMC7: + mtspr(SPRN_PMC7, value); + break; + case SPRN_PMC8: + mtspr(SPRN_PMC8, value); + break; + case SPRN_TBRL: + case SPRN_PURR: + /* Ignore writes to read-only registers. 
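The descriptor comment above notes that the timebase is presented as a single 64-bit PMD even though it lives in the two 32-bit TBU/TBL SPRs. When the two halves are read back to back, a wrap of the low half in between can produce a torn value; the conventional guard is to re-read the upper half until it is stable. A user-space sketch of that idiom with the SPR reads stubbed out (the stub values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Stubs standing in for mfspr(SPRN_TBRU) / mfspr(SPRN_TBRL). */
static uint32_t read_tbu(void) { return 0x12345678u; }
static uint32_t read_tbl(void) { return 0x9abcdef0u; }

/* Re-read TBU until it is stable so the 64-bit value is not torn by a
 * low-half wrap between the two reads. */
static uint64_t read_timebase64(void)
{
    uint32_t hi, lo;

    do {
        hi = read_tbu();
        lo = read_tbl();
    } while (read_tbu() != hi);

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    printf("TB = 0x%016llx\n", (unsigned long long)read_timebase64());
    return 0;
}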
*/ + break; + default: + BUG(); + } +} + +static u64 pfm_power5_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_PMC5: + return mfspr(SPRN_PMC5); + case SPRN_PMC6: + return mfspr(SPRN_PMC6); + case SPRN_PMC7: + return mfspr(SPRN_PMC7); + case SPRN_PMC8: + return mfspr(SPRN_PMC8); + case SPRN_TBRL: + return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + case SPRN_PURR: + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + else + return 0; + default: + BUG(); + } +} + +/** + * pfm_power5_enable_counters + * + * Just need to load the current values into the control registers. + **/ +static void pfm_power5_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) + if (test_bit(i, set->used_pmcs)) + pfm_power5_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_power5_disable_counters + * + * Just need to zero all the control registers. + **/ +static void pfm_power5_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max; i++) + if (test_bit(i, set->used_pmcs)) + pfm_power5_write_pmc(i, 0); +} + +/** + * pfm_power5_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. + **/ +static void pfm_power5_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_power5_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_power5_irq_handler(struct pt_regs *regs, + struct pfm_context *ctx) +{ + u32 mmcr0; + u64 mmcra; + + /* Disable the counters (set the freeze bit) to not polute + * the counts. + */ + mmcr0 = mfspr(SPRN_MMCR0); + mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); + mmcra = mfspr(SPRN_MMCRA); + + /* Set the PMM bit (see comment below). */ + mtmsrd(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + mmcr0 = mfspr(SPRN_MMCR0); + /* Reset the perfmon trigger. */ + mmcr0 |= MMCR0_PMXE; + + /* + * We must clear the PMAO bit on some (GQ) chips. Just do it + * all the time. + */ + mmcr0 &= ~MMCR0_PMAO; + + /* Clear the appropriate bits in the MMCRA. */ + mmcra &= POWER6_MMCRA_THRM | POWER6_MMCRA_OTHER; + mtspr(SPRN_MMCRA, mmcra); + + /* + * Now clear the freeze bit, counting will not start until we + * rfid from this exception, because only at that point will + * the PMM bit be cleared. 
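pfm_power5_get_ovfl_pmds() above detects an overflow by testing the bit just past the counter width in the freshly read PMD value. Below is a stand-alone version of that test; the 1ULL literal keeps the shift in 64-bit arithmetic, which matters once the width reaches 31 bits:

#include <stdint.h>
#include <stdio.h>

#define COUNTER_WIDTH 31   /* as in the POWER5/PPC32 configurations */

/* A counter has overflowed when the bit above its width becomes set. */
static int pmd_overflowed(uint64_t pmd_value)
{
    uint64_t width_mask = 1ULL << COUNTER_WIDTH;

    return (pmd_value & width_mask) != 0;
}

int main(void)
{
    printf("%d %d\n", pmd_overflowed(0x7fffffffULL),    /* 0: below 2^31 */
                      pmd_overflowed(0x80000000ULL));   /* 1: bit 31 set */
    return 0;
}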
+ */ + mmcr0 &= ~MMCR0_FC; + mtspr(SPRN_MMCR0, mmcr0); +} + +struct pfm_arch_pmu_info pfm_power5_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_POWER5, + .write_pmc = pfm_power5_write_pmc, + .write_pmd = pfm_power5_write_pmd, + .read_pmd = pfm_power5_read_pmd, + .irq_handler = pfm_power5_irq_handler, + .get_ovfl_pmds = pfm_power5_get_ovfl_pmds, + .enable_counters = pfm_power5_enable_counters, + .disable_counters = pfm_power5_disable_counters, +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_power5_pmu_conf = { + .pmu_name = "POWER5", + .counter_width = 31, + .pmd_desc = pfm_power5_pmd_desc, + .pmc_desc = pfm_power5_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_power5_probe_pmu, + .arch_info = &pfm_power5_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_power5_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_power5_pmu_conf); +} + +static void __exit pfm_power5_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_power5_pmu_conf); +} + +module_init(pfm_power5_pmu_init_module); +module_exit(pfm_power5_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_ppc32.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_ppc32.c @@ -0,0 +1,340 @@ +/* + * This file contains the PPC32 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Philip Mucci, mucci@cs.utk.edu + * + * Based on code from: + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("PPC32 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_pmu_config pfm_ppc32_pmu_conf; + +static struct pfm_regmap_desc pfm_ppc32_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", 0x0, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0x0, 0, 0, SPRN_MMCR1), +/* mmcr2 */ PMC_D(PFM_REG_I, "MMCR2", 0x0, 0, 0, SPRN_MMCR2), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_ppc32_pmc_desc) + +static struct pfm_regmap_desc pfm_ppc32_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_ppc32_pmd_desc) + +static void perfmon_perf_irq(struct pt_regs *regs) +{ + u32 mmcr0; + + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + + /* set the PMM bit (see comment below) */ + mtmsr(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + /* The freeze bit was set by the interrupt. + * Clear the freeze bit, and reenable the interrupt. + * The counters won't actually start until the rfi clears + * the PMM bit. + */ + + /* Unfreezes the counters on this CPU, enables the interrupt, + * enables the counters to trigger the interrupt, and sets the + * counters to only count when the mark bit is not set. + */ + mmcr0 = mfspr(SPRN_MMCR0); + + mmcr0 &= ~(MMCR0_FC | MMCR0_FCM0); + mmcr0 |= (MMCR0_FCECE | MMCR0_PMC1CE | MMCR0_PMCnCE | MMCR0_PMXE); + + mtspr(SPRN_MMCR0, mmcr0); +} + +static int pfm_ppc32_probe_pmu(void) +{ + enum ppc32_pmu_type pm_type; + int nmmcr = 0, npmds = 0, intsok = 0, i; + unsigned int pvr; + char *str; + + pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + str = "PPC604"; + pm_type = PFM_POWERPC_PMU_604; + nmmcr = 1; + npmds = 2; + break; + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + str = "PPC604e"; + pm_type = PFM_POWERPC_PMU_604e; + nmmcr = 2; + npmds = 4; + break; + case 0x0008: /* 750/740 */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + break; + case 0x7000: /* 750FX */ + case 0x7001: + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + if ((pvr & 0xFF0F) >= 0x0203) + intsok = 1; + break; + case 0x7002: /* 750GX */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + intsok = 1; + case 0x000C: /* 7400 */ + str = "PPC7400"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + break; + case 0x800C: /* 7410 */ + str = "PPC7410"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + if ((pvr & 0xFFFF) >= 0x01103) + intsok = 1; + break; + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + str = "PPC7450"; + pm_type = PFM_POWERPC_PMU_7450; + nmmcr = 3; npmds = 6; + intsok = 1; + break; + default: + PFM_INFO("Unknown PVR_VER(0x%x)\n", PVR_VER(pvr)); + return -1; + } + + /* + * deconfigure unimplemented registers + */ + for (i = npmds; i < PFM_PM_NUM_PMDS; i++) + pfm_ppc32_pmd_desc[i].type = PFM_REG_NA; + + for 
(i = nmmcr; i < PFM_PM_NUM_PMCS; i++)
+ pfm_ppc32_pmc_desc[i].type = PFM_REG_NA;
+
+ /*
+ * update PMU description structure
+ */
+ pfm_ppc32_pmu_conf.pmu_name = str;
+ pfm_ppc32_pmu_info.pmu_style = pm_type;
+ pfm_ppc32_pmu_conf.num_pmc_entries = nmmcr;
+ pfm_ppc32_pmu_conf.num_pmd_entries = npmds;
+
+ if (intsok == 0)
+ PFM_INFO("Interrupts unlikely to work\n");
+
+ return reserve_pmc_hardware(perfmon_perf_irq);
+}
+
+static void pfm_ppc32_write_pmc(unsigned int cnum, u64 value)
+{
+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
+ case SPRN_MMCR0:
+ mtspr(SPRN_MMCR0, value);
+ break;
+ case SPRN_MMCR1:
+ mtspr(SPRN_MMCR1, value);
+ break;
+ case SPRN_MMCR2:
+ mtspr(SPRN_MMCR2, value);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void pfm_ppc32_write_pmd(unsigned int cnum, u64 value)
+{
+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
+ case SPRN_PMC1:
+ mtspr(SPRN_PMC1, value);
+ break;
+ case SPRN_PMC2:
+ mtspr(SPRN_PMC2, value);
+ break;
+ case SPRN_PMC3:
+ mtspr(SPRN_PMC3, value);
+ break;
+ case SPRN_PMC4:
+ mtspr(SPRN_PMC4, value);
+ break;
+ case SPRN_PMC5:
+ mtspr(SPRN_PMC5, value);
+ break;
+ case SPRN_PMC6:
+ mtspr(SPRN_PMC6, value);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static u64 pfm_ppc32_read_pmd(unsigned int cnum)
+{
+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
+ case SPRN_PMC1:
+ return mfspr(SPRN_PMC1);
+ case SPRN_PMC2:
+ return mfspr(SPRN_PMC2);
+ case SPRN_PMC3:
+ return mfspr(SPRN_PMC3);
+ case SPRN_PMC4:
+ return mfspr(SPRN_PMC4);
+ case SPRN_PMC5:
+ return mfspr(SPRN_PMC5);
+ case SPRN_PMC6:
+ return mfspr(SPRN_PMC6);
+ default:
+ BUG();
+ }
+}
+
+/**
+ * pfm_ppc32_enable_counters
+ *
+ * Just need to load the current values into the control registers.
+ **/
+static void pfm_ppc32_enable_counters(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ unsigned int i, max_pmc;
+
+ max_pmc = pfm_pmu_conf->regs.max_pmc;
+
+ for (i = 0; i < max_pmc; i++)
+ if (test_bit(i, set->used_pmcs))
+ pfm_ppc32_write_pmc(i, set->pmcs[i]);
+}
+
+/**
+ * pfm_ppc32_disable_counters
+ *
+ * Just need to zero all the control registers.
+ **/
+static void pfm_ppc32_disable_counters(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ unsigned int i, max;
+
+ max = pfm_pmu_conf->regs.max_pmc;
+
+ for (i = 0; i < max; i++)
+ if (test_bit(i, set->used_pmcs))
+ pfm_ppc32_write_pmc(i, 0);
+}
+
+/**
+ * pfm_ppc32_get_ovfl_pmds
+ *
+ * Determine which counters in this set have overflowed and fill in the
+ * set->povfl_pmds mask and set->npend_ovfls count.
+ **/ +static void pfm_ppc32_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_ppc32_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +struct pfm_arch_pmu_info pfm_ppc32_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_NONE, + .write_pmc = pfm_ppc32_write_pmc, + .write_pmd = pfm_ppc32_write_pmd, + .read_pmd = pfm_ppc32_read_pmd, + .get_ovfl_pmds = pfm_ppc32_get_ovfl_pmds, + .enable_counters = pfm_ppc32_enable_counters, + .disable_counters = pfm_ppc32_disable_counters, +}; + +static struct pfm_pmu_config pfm_ppc32_pmu_conf = { + .counter_width = 31, + .pmd_desc = pfm_ppc32_pmd_desc, + .pmc_desc = pfm_ppc32_pmc_desc, + .probe_pmu = pfm_ppc32_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .version = "0.1", + .arch_info = &pfm_ppc32_pmu_info, +}; + +static int __init pfm_ppc32_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ppc32_pmu_conf); +} + +static void __exit pfm_ppc32_pmu_cleanup_module(void) +{ + release_pmc_hardware(); + pfm_pmu_unregister(&pfm_ppc32_pmu_conf); +} + +module_init(pfm_ppc32_pmu_init_module); +module_exit(pfm_ppc32_pmu_cleanup_module); Index: linux-2.6/arch/x86_64/Kconfig =================================================================== --- linux-2.6.orig/arch/x86_64/Kconfig +++ linux-2.6/arch/x86_64/Kconfig @@ -667,6 +667,8 @@ config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) +source "arch/x86_64/perfmon/Kconfig" + endmenu # Index: linux-2.6/arch/x86_64/Makefile =================================================================== --- linux-2.6.orig/arch/x86_64/Makefile +++ linux-2.6/arch/x86_64/Makefile @@ -79,6 +79,7 @@ core-y += arch/x86_64/kernel/ \ arch/x86_64/crypto/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ +drivers-$(CONFIG_PERFMON) += arch/x86_64/perfmon/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ boot := arch/x86_64/boot Index: linux-2.6/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux-2.6.orig/arch/x86_64/ia32/ia32entry.S +++ linux-2.6/arch/x86_64/ia32/ia32entry.S @@ -719,4 +719,16 @@ ia32_sys_call_table: .quad compat_sys_signalfd .quad compat_sys_timerfd .quad sys_eventfd + .quad sys_pfm_create_context + .quad sys_pfm_write_pmcs /* 325 */ + .quad sys_pfm_write_pmds + .quad sys_pfm_read_pmds + .quad sys_pfm_load_context + .quad sys_pfm_start + .quad sys_pfm_stop /* 330 */ + .quad sys_pfm_restart + .quad sys_pfm_create_evtsets + .quad sys_pfm_getinfo_evtsets + .quad sys_pfm_delete_evtsets + .quad sys_pfm_unload_context /* 335 */ ia32_syscall_end: Index: linux-2.6/arch/x86_64/kernel/entry.S =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/entry.S +++ linux-2.6/arch/x86_64/kernel/entry.S @@ -282,7 +282,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz 1f /* Really a signal */ @@ -375,7 +375,7 @@ 
int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -599,7 +599,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz retint_swapgs TRACE_IRQS_ON sti @@ -691,7 +691,12 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + +#ifdef CONFIG_PERFMON +ENTRY(pmu_interrupt) + apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt +#endif + /* * Exception entry points. */ Index: linux-2.6/arch/x86_64/kernel/i8259.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/i8259.c +++ linux-2.6/arch/x86_64/kernel/i8259.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include Index: linux-2.6/arch/x86_64/kernel/process.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/process.c +++ linux-2.6/arch/x86_64/kernel/process.c @@ -379,6 +379,7 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } + pfm_exit_thread(me); } void flush_thread(void) @@ -487,6 +488,8 @@ int copy_thread(int nr, unsigned long cl asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); + pfm_copy_thread(p); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -557,6 +560,10 @@ static inline void __switch_to_xtra(stru */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev_p, next_p); } /* @@ -661,7 +668,7 @@ __switch_to(struct task_struct *prev_p, * Now maybe reload the debug registers and handle I/O bitmaps */ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + || (task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW)) __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full Index: linux-2.6/arch/x86_64/kernel/setup64.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/setup64.c +++ linux-2.6/arch/x86_64/kernel/setup64.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -286,4 +287,6 @@ void __cpuinit cpu_init (void) fpu_init(); raw_local_save_flags(kernel_eflags); + + pfm_init_percpu(); } Index: linux-2.6/arch/x86_64/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/signal.c +++ linux-2.6/arch/x86_64/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -472,6 +473,9 @@ do_notify_resume(struct pt_regs *regs, v clear_thread_flag(TIF_SINGLESTEP); } + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/x86_64/kernel/smpboot.c =================================================================== --- 
linux-2.6.orig/arch/x86_64/kernel/smpboot.c +++ linux-2.6/arch/x86_64/kernel/smpboot.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1043,6 +1044,7 @@ int __cpu_disable(void) spin_unlock(&vector_lock); remove_cpu_from_maps(); fixup_irqs(cpu_online_map); + pfm_cpu_disable(); return 0; } Index: linux-2.6/arch/x86_64/oprofile/Makefile =================================================================== --- linux-2.6.orig/arch/x86_64/oprofile/Makefile +++ linux-2.6/arch/x86_64/oprofile/Makefile @@ -15,5 +15,6 @@ OPROFILE-y := init.o backtrace.o OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ op_model_ppro.o OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o +OPROFILE-$(CONFIG_PERFMON) += perfmon.o oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) Index: linux-2.6/arch/x86_64/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/Kconfig @@ -0,0 +1,55 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config X86_64_PERFMON_K8 + tristate "Support 64-bit mode AMD Athlon64 and Opteron64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for 64-bit mode AMD Athlon64 and Opteron64 processors + hardware performance counters. + +config X86_64_PERFMON_P4 + tristate "Support for Intel 64-bit Pentium 4/Xeon hardware performance counters" + depends on PERFMON + default n + help + Enables support for Intel 64-bit mode Pentium 4/Xeon hardware performance + counters. + +config X86_64_PERFMON_CORE + tristate "Support for Intel Core-based performance counters" + depends on PERFMON + default n + help + Enables 64-bit support for Intel Core-based performance counters. Enable + this option to support Intel Core 2 Duo processors. + +config X86_64_PERFMON_INTEL_ARCH + tristate "Support for Intel architectural performance counters" + depends on PERFMON + default n + help + Enables 64-bit support for Intel architectural performance counters. + +config X86_64_PERFMON_PEBS + tristate "Support for Intel Precise Event-Based Sampling (PEBS)" + depends on PERFMON + default n + help + Enables support for 64-bit Precise Event-Based Sampling (PEBS) on the Intel + Pentium 4, Xeon, and Core-based processors which support it. +endmenu Index: linux-2.6/arch/x86_64/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/Makefile @@ -0,0 +1,16 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+# Contributed by Stephane Eranian +# + +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_X86_64_PERFMON_K8) += perfmon_k8.o +obj-$(CONFIG_X86_64_PERFMON_P4) += perfmon_p4.o +obj-$(CONFIG_X86_64_PERFMON_CORE) += perfmon_core.o +obj-$(CONFIG_X86_64_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o +obj-$(CONFIG_X86_64_PERFMON_PEBS) += perfmon_pebs_smpl.o + +perfmon-$(subst m,y,$(CONFIG_PERFMON)) += ../../i386/perfmon/perfmon.o +perfmon_p4-$(subst m,y,$(CONFIG_X86_64_PERFMON_P4)) += ../../i386/perfmon/perfmon_p4.o +perfmon_intel_arch-$(subst m,y,$(CONFIG_X86_64_PERFMON_INTEL_ARCH)) += ../../i386/perfmon/perfmon_intel_arch.o +perfmon_pebs_smpl-$(subst m,y,$(CONFIG_X86_64_PERFMON_PEBS)) += ../../i386/perfmon/perfmon_pebs_smpl.o Index: linux-2.6/arch/x86_64/perfmon/perfmon_core.c =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/perfmon_core.c @@ -0,0 +1,215 @@ +/* + * This file contains the Intel Core PMU registers description tables. + * Intel Core-based processors support architectural perfmon v1 + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Core"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CORE_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_CORE_PMC_VAL (1ULL<<20) +#define PFM_CORE_NO64 (1ULL<<20) + +#define PFM_CORE_NA { .reg_type = PFM_REGT_NA} + +#define PFM_CORE_CA(m, c, t) \ + { \ + .addrs[0] = m, \ + .ctr = c, \ + .reg_type = t \ + } +/* + * physical addresses of MSR for evntsel and perfctr registers + * + * IMPORTANT: + * The mapping was chosen to be compatible with the Intel + * architectural perfmon, so that applications which only + * know about the architectural perfmon can work on Core + * without any changes. After all, this is the goal of + * having an architected PMU. + * + * Because of this compatibility constraint, it is not + * possibleto use PERF_GLOBAL_CTRL, unless we force it to + * 'enable all' which is the default value. Applications + * written for architectural perfmon do not know about + * PERF_GLOBAL_CTRL which is an Intel Core specific + * extension. + * + * Fixed counters are placed after the generic counters + * which are compatible with architectural perfmon. 
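PFM_CORE_PMC_RSVD further down marks the upper 32 bits plus bits 20 and 21 of a PERFEVTSEL value as reserved. A quick sketch of building that mask and checking which reserved bits a candidate value touches (what the perfmon core then does with a non-zero result is outside this sketch):

#include <stdint.h>
#include <stdio.h>

/* Upper 32 bits plus bits 20 and 21, as in PFM_CORE_PMC_RSVD. */
#define PMC_RSVD  ((~((1ULL << 32) - 1)) | (1ULL << 20) | (1ULL << 21))

/* Return the reserved bits that a candidate PERFEVTSEL value touches. */
static uint64_t reserved_bits_set(uint64_t val)
{
    return val & PMC_RSVD;
}

int main(void)
{
    printf("%#llx\n", (unsigned long long)reserved_bits_set(0x004100c0ULL));
    printf("%#llx\n", (unsigned long long)reserved_bits_set(1ULL << 33));
    return 0;
}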
+ */ +struct pfm_arch_pmu_info pfm_core_pmu_info={ + .pmc_addrs = { + PFM_CORE_CA(MSR_P6_EVNTSEL0, 0, PFM_REGT_EN), + PFM_CORE_CA(MSR_P6_EVNTSEL1, 1, PFM_REGT_EN), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR_CTRL, 0, PFM_REGT_EN), + PFM_CORE_CA(MSR_IA32_PEBS_ENABLE, 0, PFM_REGT_EN) + }, + .pmd_addrs = { + PFM_CORE_CA(MSR_P6_PERFCTR0, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_P6_PERFCTR1, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR0, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR1, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR2, 0, PFM_REGT_CTR) + }, + .pebs_ctr_idx = 0, /* IA32_PMC0 */ + .pmu_style = PFM_X86_PMU_CORE +}; + +static struct pfm_regmap_desc pfm_core_pmc_desc[]={ +/* pmc0 */ { + .type = PFM_REG_I64, + .desc = "PERFEVTSEL0", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL0 + }, +/* pmc1 */ { + .type = PFM_REG_W64, + .desc = "PERFEVTSEL1", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL1 + }, +/* pmc2 */ { .type = PFM_REG_W, + .desc = "FIXED_CTRL", + .dfl_val = 0x888ULL, + .rsvd_msk = 0xfffffffffffff444ULL, + .no_emul64_msk = 0, + .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +/* pmc3 */ { .type = PFM_REG_I, + .desc = "PEBS_ENABLE", + .dfl_val = 0, + .rsvd_msk = 0xfffffffffffffffeULL, + .no_emul64_msk = 0, + .hw_addr = MSR_IA32_PEBS_ENABLE + } +}; + +#define PFM_CORE_D(n) PMD_D(PFM_REG_C, "PMC"#n, MSR_P6_PERFCTR0+n) +#define PFM_CORE_FD(n) PMD_D(PFM_REG_C, "FIXED_CTR"#n, MSR_CORE_PERF_FIXED_CTR0+n) + +static struct pfm_regmap_desc pfm_core_pmd_desc[]={ +/* pmd0 */ PFM_CORE_D(0), +/* pmd1 */ PFM_CORE_D(1), +/* pmd2 */ PFM_CORE_FD(0), +/* pmd3 */ PFM_CORE_FD(1), +/* pmd4 */ PFM_CORE_FD(2) +}; +#define PFM_CORE_NUM_PMCS ARRAY_SIZE(pfm_core_pmc_desc) +#define PFM_CORE_NUM_PMDS ARRAY_SIZE(pfm_core_pmd_desc) + +static struct pfm_pmu_config pfm_core_pmu_conf; + +static int pfm_core_probe_pmu(void) +{ + /* + * Check for Intel Core processors. + * Checking for cpu_has_perfmon is not enough as this + * matches intel Core Duo/Core Solo which do not have + * PEBS nor fixed counters. + */ + if (cpu_data->x86 != 6 || cpu_data->x86_model != 15) + return -1; + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, unsupported"); + return -1; + } + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * NMI using PMU? + * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_core_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + /* + * Intel Core processors implement DS and PEBS, no need to check + */ + pfm_core_pmu_info.flags |= PFM_X86_FL_PMU_DS|PFM_X86_FL_PMU_PEBS; + PFM_INFO("PEBS supported, enabled"); + + return 0; +} + +/* + * called on PMC1, FIXED_CTR. Reject access when PEBS is used + */ +static int pfm_core_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + return ctx_arch->flags & PFM_X86_USE_PEBS ? -EINVAL : 0; +} + +/* + * Counters may have model-specific width which can be probed using + * the CPUID.0xa leaf. Yet, the documentation says: " + * In the initial implementation, only the read bit width is reported + * by CPUID, write operations are limited to the low 32 bits. + * Bits [w-32] are sign extensions of bit 31. As such the effective width + * of a counter is 31 bits only. 
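Given the 31-bit effective width described in the comment just below, a full 64-bit software counter has to be split: the hardware register carries only the low bits and software accumulates the rest, folding the live hardware value back in on read. A minimal sketch of that common virtualization scheme, assuming the 31-bit width; it is not lifted from the perfmon core:

#include <stdint.h>
#include <stdio.h>

#define HW_WIDTH 31
#define HW_MASK  ((1ULL << HW_WIDTH) - 1)

/* Usual virtualization scheme: the hardware register holds the low 31 bits,
 * software accumulates the rest and folds the live value back in on read. */
struct soft_counter {
    uint64_t sw_bits;   /* bits above the hardware width */
};

static uint64_t read_virtual(const struct soft_counter *c, uint64_t hw_value)
{
    return c->sw_bits + (hw_value & HW_MASK);
}

int main(void)
{
    struct soft_counter c = { .sw_bits = 5ULL << HW_WIDTH };

    printf("virtual count = %llu\n",
           (unsigned long long)read_virtual(&c, 123));
    return 0;
}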
+ */ +static struct pfm_pmu_config pfm_core_pmu_conf={ + .pmu_name = "Intel Core", + .pmd_desc = pfm_core_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_CORE_NUM_PMCS, + .num_pmd_entries = PFM_CORE_NUM_PMDS, + .pmc_write_check = pfm_core_pmc_check, + .pmc_desc = pfm_core_pmc_desc, + .probe_pmu = pfm_core_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_core_pmu_info +}; + +static int __init pfm_core_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_core_pmu_conf); +} + +static void __exit pfm_core_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_core_pmu_conf); +} + +module_init(pfm_core_pmu_init_module); +module_exit(pfm_core_pmu_cleanup_module); Index: linux-2.6/arch/x86_64/perfmon/perfmon_k8.c =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/perfmon_k8.c @@ -0,0 +1,347 @@ +/* + * This file contains the PMU description for the Athlon64 and Opteron64 + * processors. It supports 32 and 64-bit modes. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("AMD64 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +static struct pfm_arch_pmu_info pfm_k8_pmu_info = { + .pmc_addrs = { +/* pmc0 */ {{MSR_K7_EVNTSEL0, 0}, 0, PFM_REGT_EN}, +/* pmc1 */ {{MSR_K7_EVNTSEL1, 0}, 1, PFM_REGT_EN}, +/* pmc2 */ {{MSR_K7_EVNTSEL2, 0}, 2, PFM_REGT_EN}, +/* pmc3 */ {{MSR_K7_EVNTSEL3, 0}, 3, PFM_REGT_EN}, + }, + .pmd_addrs = { +/* pmd0 */ {{MSR_K7_PERFCTR0, 0}, 0, PFM_REGT_CTR}, +/* pmd1 */ {{MSR_K7_PERFCTR1, 0}, 0, PFM_REGT_CTR}, +/* pmd2 */ {{MSR_K7_PERFCTR2, 0}, 0, PFM_REGT_CTR}, +/* pmd3 */ {{MSR_K7_PERFCTR3, 0}, 0, PFM_REGT_CTR}, + }, + .pmu_style = PFM_X86_PMU_AMD64 +}; + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +/* + * reserved bits must be zero + * + * - upper 32 bits are reserved + * - APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + */ +#define PFM_K8_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +static struct pfm_regmap_desc pfm_k8_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), +/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), +/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", 
PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), +}; +#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_k8_pmc_desc) + +static struct pfm_regmap_desc pfm_k8_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "PERFCTR0", MSR_K7_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C, "PERFCTR1", MSR_K7_PERFCTR1), +/* pmd2 */ PMD_D(PFM_REG_C, "PERFCTR2", MSR_K7_PERFCTR2), +/* pmd3 */ PMD_D(PFM_REG_C, "PERFCTR3", MSR_K7_PERFCTR3) +}; +#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_k8_pmd_desc) + +static struct pfm_context **pfm_nb_sys_owners; +static struct pfm_context *pfm_nb_task_owner; + +static struct pfm_pmu_config pfm_k8_pmu_conf; + +/* + * There can only be one user per socket for the Northbridge (NB) events, + * so we enforce mutual exclusion as follows: + * - per-thread : only one context machine-wide can use NB events + * - system-wide: only one context per processor socket + * + * Exclusion is enforced at: + * - pfm_load_context() + * - pfm_write_pmcs() for attached contexts + * + * Exclusion is released at: + * - pfm_unload_context() or any calls that implicitely uses it + * + * return: + * 0 : successfully acquire NB access + * < 0: errno, failed to acquire NB access + */ +static int pfm_k8_acquire_nb(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = topology_physical_package_id(smp_processor_id()); +#else + proc_id = 0; +#endif + + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, NULL, ctx); + if (!old) { + if (ctx->flags.system) + PFM_DBG("acquired Northbridge event access on socket %u", proc_id); + else + PFM_DBG("acquired Northbridge event access globally"); + } else if (old != ctx) { + if (ctx->flags.system) + PFM_DBG("NorthBridge event conflict on socket %u", proc_id); + else + PFM_DBG("global NorthBridge event conflict"); + return -EBUSY; + } + return 0; +} + +/* + * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL,i.e., + * when we have detected a multi-core processor. + * + * context is locked, interrupts are masked + */ +static int pfm_k8_pmc_write_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + unsigned int event; + /* + * delay checking NB event until we load the context + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + /* + * check event is NB event + */ + event = (unsigned int)(req->reg_value & 0xff); + if (event < 0xee) + return 0; + + return pfm_k8_acquire_nb(ctx); +} + +/* + * invoked on pfm_load_context(). 
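A standalone sketch of the event-code test applied by the write-checker above: only the low eight bits of the event-select value are examined, and codes of 0xee and above are treated as NorthBridge events that require exclusive ownership. The event-select values used below are illustrative.

#include <stdio.h>
#include <stdint.h>

/* mirrors the check in pfm_k8_pmc_write_check() */
static int k8_is_nb_event(uint64_t evntsel)
{
	unsigned int event = (unsigned int)(evntsel & 0xff);

	return event >= 0xee;
}

int main(void)
{
	printf("event 0x76 -> NB? %d\n", k8_is_nb_event(0x430076));	/* below threshold */
	printf("event 0xee -> NB? %d\n", k8_is_nb_event(0x4300ee));	/* NB event */
	return 0;
}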
+ * context is locked, interrupts are masked + */ +static int pfm_k8_load_context(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, n; + + /* + * scan all sets for NB events + */ + list_for_each_entry(set, &ctx->list, list) { + n = set->nused_pmcs; + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(set->used_pmcs))) + continue; + if ((set->pmcs[i] & 0xff) >= 0xee) + goto found; + n--; + } + } + return 0; +found: + return pfm_k8_acquire_nb(ctx); +} + +/* + * invoked on pfm_unload_context() + */ +static int pfm_k8_unload_context(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = topology_physical_package_id(smp_processor_id()); +#else + proc_id = 0; +#endif + + /* + * unload always happens on the monitored CPU in system-wide + */ + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, ctx, NULL); + if (old == ctx) { + if (ctx->flags.system) + PFM_DBG("released NorthBridge on socket %u", proc_id); + else + PFM_DBG("released NorthBridge events globally"); + } + return 0; +} + +/* + * detect if we need to active NorthBridge event access control + */ +static int pfm_k8_setup_nb_event_control(void) +{ + unsigned int c, n = 0; + unsigned int max_phys = 0; + +#ifdef CONFIG_SMP + for_each_possible_cpu(c) { + if (cpu_data[c].phys_proc_id > max_phys) + max_phys = cpu_data[c].phys_proc_id; + } +#else + max_phys = 0; +#endif + if (max_phys > 255) { + PFM_INFO("socket id %d is too big to handle", max_phys); + return -ENOMEM; + } + + n = max_phys + 1; + if (n < 2) + return 0; + + pfm_nb_sys_owners = vmalloc(n * sizeof(*pfm_nb_sys_owners)); + if (!pfm_nb_sys_owners) + return -ENOMEM; + + memset(pfm_nb_sys_owners, 0, n * sizeof(*pfm_nb_sys_owners)); + pfm_nb_task_owner = NULL; + + /* + * activate write-checker for PMC registers + */ + for(c=0; c < PFM_AMD_NUM_PMCS; c++) { + pfm_k8_pmc_desc[c].type |= PFM_REG_WC; + } + + pfm_k8_pmu_info.load_context = pfm_k8_load_context; + pfm_k8_pmu_info.unload_context = pfm_k8_unload_context; + + pfm_k8_pmu_conf.pmc_write_check = pfm_k8_pmc_write_check; + + PFM_INFO("NorthBridge event access control enabled"); + + return 0; +} + +static int pfm_k8_probe_pmu(void) +{ + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { + PFM_INFO("not an AMD processor"); + return -1; + } + + switch (current_cpu_data.x86) { + case 15: + case 16: + PFM_INFO("found family=%d", current_cpu_data.x86); + break; + default: + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } + + if (current_cpu_data.x86_max_cores > 1) + return pfm_k8_setup_nb_event_control(); + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + /* + * NMI using PMU? 
+ * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_k8_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +static struct pfm_pmu_config pfm_k8_pmu_conf = { + .pmu_name = "AMD64", + .counter_width = 47, + .pmd_desc = pfm_k8_pmd_desc, + .pmc_desc = pfm_k8_pmc_desc, + .num_pmc_entries = PFM_AMD_NUM_PMCS, + .num_pmd_entries = PFM_AMD_NUM_PMDS, + .probe_pmu = pfm_k8_probe_pmu, + .version = "1.1", + .arch_info = &pfm_k8_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_k8_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_k8_pmu_conf); +} + +static void __exit pfm_k8_pmu_cleanup_module(void) +{ + if (pfm_nb_sys_owners) + vfree(pfm_nb_sys_owners); + + pfm_pmu_unregister(&pfm_k8_pmu_conf); +} + +module_init(pfm_k8_pmu_init_module); +module_exit(pfm_k8_pmu_cleanup_module); Index: linux-2.6/drivers/oprofile/oprofile_files.c =================================================================== --- linux-2.6.orig/drivers/oprofile/oprofile_files.c +++ linux-2.6/drivers/oprofile/oprofile_files.c @@ -117,7 +117,17 @@ static ssize_t dump_write(struct file * static const struct file_operations dump_fops = { .write = dump_write, }; - + +static ssize_t implementation(struct file * file, char __user * buf, size_t count, loff_t * offset) +{ + return oprofilefs_str_to_user(oprofile_ops.implementation, buf, count, offset); +} + + +static struct file_operations implementation_fops = { + .read = implementation, +}; + void oprofile_create_files(struct super_block * sb, struct dentry * root) { oprofilefs_create_file(sb, root, "enable", &enable_fops); @@ -127,6 +137,7 @@ void oprofile_create_files(struct super_ oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &fs_cpu_buffer_size); oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); + oprofilefs_create_file(sb, root, "implementation", &implementation_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); oprofile_create_stats_files(sb, root); Index: linux-2.6/drivers/oprofile/timer_int.c =================================================================== --- linux-2.6.orig/drivers/oprofile/timer_int.c +++ linux-2.6/drivers/oprofile/timer_int.c @@ -43,4 +43,5 @@ void __init oprofile_timer_init(struct o ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; + ops->implementation = "timer"; } Index: linux-2.6/include/asm-i386/mach-default/entry_arch.h =================================================================== --- linux-2.6.orig/include/asm-i386/mach-default/entry_arch.h +++ linux-2.6/include/asm-i386/mach-default/entry_arch.h @@ -31,4 +31,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURI BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_PERFMON +BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR) +#endif + #endif Index: linux-2.6/include/asm-i386/mach-default/irq_vectors.h =================================================================== --- linux-2.6.orig/include/asm-i386/mach-default/irq_vectors.h +++ linux-2.6/include/asm-i386/mach-default/irq_vectors.h @@ -56,6 +56,7 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef +#define LOCAL_PERFMON_VECTOR 0xee /* * First APIC vector available to drivers: (vectors 0x30-0xee) @@ -63,7 +64,7 @@ * levels. 
(0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef +#define FIRST_SYSTEM_VECTOR 0xee #define TIMER_IRQ 0 Index: linux-2.6/include/asm-i386/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon.h @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains X86 Processor Family specific definitions + * for the perfmon interface. This covers P6, Pentium M, P4/Xeon + * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_I386_PERFMON_H_ +#define _ASM_I386_PERFMON_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_4KSTACKS +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 +#else +#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */ +#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */ +#endif + +/* + * For P4: + * - bits 31 - 63 reserved + * - T1_OS and T1_USR bits are reserved - set depending on logical proc + * user mode application should use T0_OS and T0_USR to indicate + * RSVD: reserved bits must be 1 + */ +#define PFM_ESCR_RSVD ~0x000000007ffffffcULL + +/* + * bitmask for reg_type + */ +#define PFM_REGT_NA 0x0000 /* not available */ +#define PFM_REGT_EN 0x0001 /* has enable bit (cleared on ctxsw) */ +#define PFM_REGT_ESCR 0x0002 /* P4: ESCR */ +#define PFM_REGT_CCCR 0x0004 /* P4: CCCR */ +#define PFM_REGT_PEBS 0x0010 /* PEBS related */ +#define PFM_REGT_NOHT 0x0020 /* unavailable with HT */ +#define PFM_REGT_CTR 0x0040 /* counter */ +#define PFM_REGT_OTH 0x0080 /* other type of register */ + +/* + * This design and the partitioning of resources for SMT (hyper threads) + * is very static and limited due to limitations in the number of ESCRs + * and CCCRs per group. + */ +#define MAX_SMT_ID 1 + +/* + * For extended register information in addition to address that is used + * at runtime to figure out the mapping of reg addresses to logical procs + * and association of registers to hardware specific features + */ +struct pfm_arch_ext_reg { + /* + * one each for the logical CPUs. Index 0 corresponds to T0 and + * index 1 corresponds to T1. Index 1 can be zero if no T1 + * complement reg exists. 
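A minimal standalone illustration of how a per-thread address would be picked from such an entry. The structure is mirrored locally just to keep the example compilable; the real definition follows below, and the helper name is hypothetical.

#include <stdio.h>

#define MAX_SMT_ID 1

/* local mirror of struct pfm_arch_ext_reg, reduced to the address array */
struct ext_reg {
	unsigned long addrs[MAX_SMT_ID + 1];
};

/* hypothetical helper: use the T1 address when running on the second
 * logical processor, fall back to T0 when no complement register exists */
static unsigned long ext_reg_addr(const struct ext_reg *xreg, unsigned int smt_id)
{
	unsigned long addr = xreg->addrs[smt_id & MAX_SMT_ID];

	return addr ? addr : xreg->addrs[0];
}

int main(void)
{
	struct ext_reg escr = { { 0x3a0, 0 } };		/* example: no T1 complement */

	printf("T0 -> 0x%lx, T1 -> 0x%lx\n",
	       ext_reg_addr(&escr, 0), ext_reg_addr(&escr, 1));
	return 0;
}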
+ */ + unsigned long addrs[MAX_SMT_ID+1]; + unsigned int ctr; /* for CCCR/PERFEVTSEL, associated counter */ + unsigned int reg_type; +}; + +typedef int (*pfm_check_session_t)(struct pfm_context *ctx); + +struct pfm_arch_pmu_info { + struct pfm_arch_ext_reg pmc_addrs[PFM_MAX_PMCS]; + struct pfm_arch_ext_reg pmd_addrs[PFM_MAX_PMDS]; + u64 enable_mask[PFM_PMC_BV]; /* PMC registers with enable bit */ + u64 ovfl_reg_mask; /* relevant bits of PERF_OVFL_STATS (Core) */ + + u16 max_ena; /* highest enable bit + 1 */ + u16 flags; /* PMU feature flags */ + u16 pebs_ctr_idx; /* index of PEBS counter for overflow */ + u16 reserved; /* for future use */ + + /* + * optional callbacks invoked by pfm_arch_*load_context() + */ + int (*load_context)(struct pfm_context *ctx); + int (*unload_context)(struct pfm_context *ctx); + + u8 pmu_style; /* type of PMU interface (P4, P6, CORE) */ +}; + +/* + * X86 PMU style + */ +#define PFM_X86_PMU_P4 1 /* Intel P4/Xeon/EM64T processor PMU */ +#define PFM_X86_PMU_P6 2 /* Intel P6/Pentium M */ +#define PFM_X86_PMU_CORE 3 /* Intel Core PMU */ +#define PFM_X86_PMU_AMD64 4 /* AMD64 PMU (K8, family 10h) */ + +/* + * PMU feature flags + */ +#define PFM_X86_FL_PMU_DS 0x01 /* Intel: support for Data Save Area (DS) */ +#define PFM_X86_FL_PMU_PEBS 0x02 /* Intel: support PEBS (implies DS) */ +#define PFM_X86_FL_USE_NMI 0x04 /* must use NMI interrupt */ + +void __pfm_read_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 *val); +void __pfm_write_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 val); + + +extern void pfm_arch_resend_irq(void); + +static inline void pfm_arch_serialize(void) +{} + +/* + * on x86, the PMDs are already saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we do not need + * to save them again in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * in certain situations, ctx may be NULL + */ +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + PFM_DBG_ovfl("pfm_arch_write_pmc(0x%016Lx, 0x%016Lx)", + (unsigned long long) pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) value); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_write_reg_p4(&arch_info->pmc_addrs[cnum], value); + else + wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + /* + * force upper bit set for counter to ensure overflow + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value |= ~pfm_pmu_conf->ovfl_mask; + + PFM_DBG_ovfl("pfm_arch_write_pmd(0x%016Lx, 0x%016Lx)", + (unsigned long long) pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) value); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_write_reg_p4(&arch_info->pmd_addrs[cnum], value); + else + wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 tmp; + + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_read_reg_p4(&arch_info->pmd_addrs[cnum], &tmp); + else + 
rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); + PFM_DBG_ovfl("pfm_arch_read_pmd(0x%016Lx) = 0x%016Lx", + (unsigned long long) pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 tmp; + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_read_reg_p4(&arch_info->pmc_addrs[cnum], &tmp); + else + rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); + PFM_DBG_ovfl("pfm_arch_read_pmc(0x%016Lx) = 0x%016Lx", + (unsigned long long) pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function is used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{} + +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_stop(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx); +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); +void pfm_arch_pmu_config_remove(void); +char *pfm_arch_get_pmu_module_name(void); + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info; + int ret = 0; + + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->unload_context) { + ret = arch_info->unload_context(ctx); + } + return ret; +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info; + int ret = 0; + + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->load_context) { + ret = arch_info->load_context(ctx); + } + return ret; +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On x86, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. 
+ */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on x86 masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. interrupts are masked + * + * The following actions must take place: + * - stop all monitoring to ensure handler has consistent view. + * - collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * on X86, freezing is equivalent to stopping + */ + pfm_arch_stop(current, ctx, set); + + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() and + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +int pfm_arch_pmu_acquire(void); +void pfm_arch_pmu_release(void); + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + * + * x86: The current version loses whatever is remaining in the counter, + * which is usually has a small count. In order not to loose this count, + * we do a read-modify-write to set the upper bits while preserving the + * low-order bits. This is slow but works. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * not used for i386/x86_64 + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +/* + * architecture specific context extension. 
+ * located at: (struct pfm_arch_context *)(ctx+1) + */ + +struct pfm_arch_p4_context { + u32 npend_ovfls; /* P4 NMI #pending ovfls */ + u32 reserved; + u64 povfl_pmds[PFM_HW_PMD_BV]; /* P4 NMI overflowed counters */ + u64 saved_cccrs[PFM_MAX_PMCS]; +}; + +struct pfm_arch_context { + u64 saved_real_iip; /* instr pointer of last NMI intr (ctxsw) */ + u32 flags; /* arch-specific flags */ + unsigned long ds_area; /* address of DS management area */ + struct pfm_arch_p4_context *p4; /* P4 specific state */ +}; + +static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + + if (arch_info->pmu_style != PFM_X86_PMU_P4) + return 0; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->p4 = kzalloc(sizeof(*(ctx_arch->p4)), GFP_KERNEL); + if (!ctx_arch->p4) + return -ENOMEM; + + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * we do not check if P4, because it would be NULL and + * kfree can deal with NULL + */ + kfree(ctx_arch->p4); +} + + +/* + * pfm_arch_context flags + */ +#define PFM_X86_USE_PEBS 0x1 /* context is using PEBS */ +#define PFM_X86_USE_BTS 0x2 /* context is using BTS */ +#define PFM_X86_USE_DS (PFM_X86_USE_PEBS|PFM_X86_USE_BTS) + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) + +asmlinkage void pmu_interrupt(void); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_I386_PERFMON_H_ */ Index: linux-2.6/include/asm-i386/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon_api.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains x86-64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_I386_PERFMON_API_H_ +#define _ASM_I386_PERFMON_API_H_ +/* + * both i386 and x86-64 maximums MUST be identical to ensure ABI + * compatibility + */ +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ + +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +#endif /* _ASM_I386_PERFMON_API_H_ */ Index: linux-2.6/include/asm-i386/perfmon_pebs_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon_pebs_smpl.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * This file implements the sampling format to support Intel + * Precise Event Based Sampling (PEBS) feature of Pentium 4 and + * Intel Core-based processors. + * + * What is PEBS? + * ------------ + * This is a hardware feature to enhance sampling by providing + * better precision as to where a sample is taken. This avoids the + * typical skew in the instruction one can observe with any + * interrupt-based sampling technique. + * + * PEBS also lowers sampling overhead significantly by having the + * processor store samples instead of the OS. PMU interrupt are only + * generated after multiple samples are written. + * + * Another benefit of PEBS is that samples can be captured inside + * critical sections where interrupts are masked. + * + * How does it work? + * PEBS effectively implements a Hw buffer. The Os must pass a region + * of memory where samples are to be stored. The region can have any + * size. The OS must also specify the sampling period to reload. The PMU + * will interrupt when it reaches the end of the buffer or a specified + * threshold location inside the memory region. + * + * The description of the buffer is stored in the Data Save Area (DS). + * The samples are stored sequentially in the buffer. The format of the + * buffer is fixed and specified in the PEBS documentation. The sample + * format changes between 32-bit and 64-bit modes due to extended register + * file. + * + * PEBS does not work when HyperThreading is enabled due to certain MSR + * being shared being to two threads. + * + * What does the format do? 
+ * It provides access to the PEBS feature for both 32-bit and 64-bit + * processors that support it. + * + * The same code is used for both 32-bit and 64-bit mode, but different + * format names are used because the two modes are not compatible due to + * data model and register file differences. Similarly the public data + * structures describing the samples are different. + * + * It is important to realize that the format provide a zero-copy environment + * for the samples, i.e,, the OS never touches the samples. Whatever the + * processor write is directly accessible to the user. + * + * Parameters to the buffer can be passed via pfm_create_context() in + * the pfm_pebs_smpl_arg structure. + * + * It is not possible to mix a 32-bit PEBS application on top of a 64-bit + * host kernel. + */ +#ifndef __PERFMON_PEBS_SMPL_H__ +#define __PERFMON_PEBS_SMPL_H__ 1 + +#ifdef __i386__ +/* + * The 32-bit and 64-bit formats are not compatible, thus we have + * two different identifications so that 32-bit programs running on + * 64-bit OS will fail to use the 64-bit PEBS support. + */ +#define PFM_PEBS_SMPL_NAME "pebs32" +#else +#define PFM_PEBS_SMPL_NAME "pebs64" +#endif + +/* + * format specific parameters (passed at context creation) + * + * intr_thres: index from start of buffer of entry where the + * PMU interrupt must be triggered. It must be several samples + * short of the end of the buffer. + */ +struct pfm_pebs_smpl_arg { + u64 cnt_reset; /* counter reset value */ + size_t buf_size; /* size of the PEBS buffer in bytes */ + size_t intr_thres;/* index of PEBS interrupt threshold entry */ + u64 reserved[6]; /* for future use */ +}; + +/* + * Data Save Area (32 and 64-bit mode) + * + * The DS area must be exposed to the user because this is the only + * way to report on the number of valid entries recorded by the CPU. + * This is required when the buffer is not full, i..e, there was not + * PMU interrupt. + * + * Layout of the structure is mandated by hardware and specified in + * the Intel documentation. + */ +struct pfm_ds_area { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * + * Because of PEBS alignement constraints, the actual PEBS buffer area does + * not necessarily begin right after the header. The hdr_start_offs must be + * used to compute the first byte of the buffer. The offset is defined as + * the number of bytes between the end of the header and the beginning of + * the buffer. As such the formula is: + * actual_buffer = (unsigned long)(hdr+1)+hdr->hdr_start_offs + */ +struct pfm_pebs_smpl_hdr { + u64 overflows; /* #overflows for buffer */ + size_t buf_size; /* bytes in the buffer */ + size_t start_offs; /* actual buffer start offset */ + u32 version; /* smpl format version */ + u32 reserved1; /* for future use */ + u64 reserved2[5]; /* for future use */ + struct pfm_ds_area ds; /* data save area */ +}; + +/* + * 64-bit PEBS record format is described in + * http://www.intel.com/technology/64bitextensions/30083502.pdf + * + * The format does not peek at samples. The sample structure is only + * used to ensure that the buffer is large enough to accomodate one + * sample. 
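Assuming the header and DS-area definitions above are visible (for example by including this file), a hedged sketch of how a monitoring tool locates and counts the valid samples. Because the kernel never touches the samples, this is essentially all the consumer needs.

#include <stddef.h>

/* first sample in the user mapping: just past the header, plus the
 * alignment offset recorded by the kernel in start_offs */
static const struct pfm_pebs_smpl_entry *
pebs_first_entry(const struct pfm_pebs_smpl_hdr *hdr)
{
	return (const struct pfm_pebs_smpl_entry *)
		((const char *)(hdr + 1) + hdr->start_offs);
}

/* number of valid entries: pebs_index points at the next free slot, so its
 * distance from the buffer base (both taken from the DS area) gives the
 * number of bytes written by the processor */
static size_t pebs_valid_entries(const struct pfm_pebs_smpl_hdr *hdr)
{
	return (hdr->ds.pebs_index - hdr->ds.pebs_buf_base) /
		sizeof(struct pfm_pebs_smpl_entry);
}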
+ */ +#ifdef __i386__ +struct pfm_pebs_smpl_entry { + u32 eflags; + u32 ip; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 esp; +}; +#else +struct pfm_pebs_smpl_entry { + u64 eflags; + u64 ip; + u64 eax; + u64 ebx; + u64 ecx; + u64 edx; + u64 esi; + u64 edi; + u64 ebp; + u64 esp; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; +#endif + +#define PFM_PEBS_SMPL_VERSION_MAJ 1U +#define PFM_PEBS_SMPL_VERSION_MIN 0U +#define PFM_PEBS_SMPL_VERSION (((PFM_PEBS_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_PEBS_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_PEBS_SMPL_H__ */ Index: linux-2.6/include/asm-i386/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-i386/thread_info.h +++ linux-2.6/include/asm-i386/thread_info.h @@ -160,7 +160,7 @@ static inline struct thread_info *curren #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) /* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) +#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP|_TIF_PERFMON_CTXSW) /* * Thread-synchronous status. Index: linux-2.6/include/asm-i386/unistd.h =================================================================== --- linux-2.6.orig/include/asm-i386/unistd.h +++ linux-2.6/include/asm-i386/unistd.h @@ -329,10 +329,22 @@ #define __NR_signalfd 321 #define __NR_timerfd 322 #define __NR_eventfd 323 +#define __NR_pfm_create_context 324 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define NR_syscalls 324 +#define NR_syscalls 335 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR Index: linux-2.6/include/asm-ia64/hw_irq.h =================================================================== --- linux-2.6.orig/include/asm-ia64/hw_irq.h +++ linux-2.6/include/asm-ia64/hw_irq.h @@ -63,9 +63,9 @@ extern int ia64_last_device_vector; #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1) #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */ -#define IA64_PERFMON_VECTOR 0xee /* performanc monitor interrupt vector */ #define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */ #define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */ +#define IA64_PERFMON_VECTOR 0xf1 /* performance monitor interrupt vector */ #define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */ #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */ #define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */ Index: linux-2.6/include/asm-ia64/perfmon.h =================================================================== --- linux-2.6.orig/include/asm-ia64/perfmon.h +++ linux-2.6/include/asm-ia64/perfmon.h @@ -1,279 +1,302 @@ /* - * Copyright (C) 2001-2003 Hewlett-Packard Co - * Stephane Eranian - */ - -#ifndef _ASM_IA64_PERFMON_H -#define _ASM_IA64_PERFMON_H - -/* - * perfmon 
comamnds supported on all CPU models - */ -#define PFM_WRITE_PMCS 0x01 -#define PFM_WRITE_PMDS 0x02 -#define PFM_READ_PMDS 0x03 -#define PFM_STOP 0x04 -#define PFM_START 0x05 -#define PFM_ENABLE 0x06 /* obsolete */ -#define PFM_DISABLE 0x07 /* obsolete */ -#define PFM_CREATE_CONTEXT 0x08 -#define PFM_DESTROY_CONTEXT 0x09 /* obsolete use close() */ -#define PFM_RESTART 0x0a -#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ -#define PFM_GET_FEATURES 0x0c -#define PFM_DEBUG 0x0d -#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ -#define PFM_GET_PMC_RESET_VAL 0x0f -#define PFM_LOAD_CONTEXT 0x10 -#define PFM_UNLOAD_CONTEXT 0x11 + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_IA64_PERFMON_H_ +#define _ASM_IA64_PERFMON_H_ +#ifdef __KERNEL__ /* - * PMU model specific commands (may not be supported on all PMU models) + * compatibility for previous versions of the interface */ -#define PFM_WRITE_IBRS 0x20 -#define PFM_WRITE_DBRS 0x21 +#include -/* - * context flags - */ -#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ -#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ -#define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ +#define PFM_ARCH_PMD_STK_ARG 8 +#define PFM_ARCH_PMC_STK_ARG 8 -/* - * event set flags - */ -#define PFM_SETFL_EXCL_IDLE 0x01 /* exclude idle task (syswide only) XXX: DO NOT USE YET */ +#include /* - * PMC flags + * describe the content of the pfm_syst_info field + * layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags */ -#define PFM_REGFL_OVFL_NOTIFY 0x1 /* send notification on overflow */ -#define PFM_REGFL_RANDOM 0x2 /* randomize sampling interval */ +#define PFM_ITA_CPUINFO_IDLE_EXCL 0x10000 /* stop monitoring in idle loop */ /* - * PMD/PMC/IBR/DBR return flags (ignored on input) - * - * Those flags are used on output and must be checked in case EAGAIN is returned - * by any of the calls using a pfarg_reg_t or pfarg_dbreg_t structure. + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. 
*/ -#define PFM_REG_RETFL_NOTAVAIL (1UL<<31) /* set if register is implemented but not available */ -#define PFM_REG_RETFL_EINVAL (1UL<<30) /* set if register entry is invalid */ -#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|PFM_REG_RETFL_EINVAL) +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{} -#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} -typedef unsigned char pfm_uuid_t[16]; /* custom sampling buffer identifier type */ +static inline void pfm_arch_pmu_config_remove(void) +{} /* - * Request structure used to define a context - */ -typedef struct { - pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use (if needed) */ - unsigned long ctx_flags; /* noblock/block */ - unsigned short ctx_nextra_sets; /* number of extra event sets (you always get 1) */ - unsigned short ctx_reserved1; /* for future use */ - int ctx_fd; /* return arg: unique identification for context */ - void *ctx_smpl_vaddr; /* return arg: virtual address of sampling buffer, is used */ - unsigned long ctx_reserved2[11];/* for future use */ -} pfarg_context_t; + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 tmp; + + /* + * do not overwrite existing value, must + * process those first (coming from context switch replay) + */ + if (set->npend_ovfls) + return; + + ia64_srlz_d(); + + tmp = ia64_get_pmc(0) & ~0xf; + + set->povfl_pmds[0] = tmp; + + set->npend_ovfls = ia64_popcnt(tmp); +} + +static inline int pfm_arch_init_pmu_config(void) +{ + return 0; +} + +static inline void pfm_arch_resend_irq(void) +{ + ia64_resend_irq(IA64_PERFMON_VECTOR); +} + +static inline void pfm_arch_serialize(void) +{ + ia64_srlz_d(); +} + +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + PFM_DBG_ovfl("state=%d", ctx->state); + ia64_set_pmc(0, 0); + /* no serialization */ +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + if (cnum < 256) { + ia64_set_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); + } else if (cnum < 264) { + ia64_set_ibr(cnum-256, value); + ia64_dv_serialize_instruction(); + } else { + ia64_set_dbr(cnum-264, value); + ia64_dv_serialize_instruction(); + } +} + +/* + * On IA-64, for per-thread context which have the ITA_FL_INSECURE + * flag, it is possible to start/stop monitoring directly from user evel + * without calling pfm_start()/pfm_stop. This allows very lightweight + * control yet the kernel sometimes needs to know if monitoring is actually + * on or off. + * + * Tracking of this information is normally done by pfm_start/pfm_stop + * in flags.started. Here we need to compensate by checking actual + * psr bit. 
+ */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started || ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_UP|IA64_PSR_PP); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + /* + * for a counting PMD, overflow bit must be cleared + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value &= pfm_pmu_conf->ovfl_mask; + + /* + * for counters, write to upper bits are ignored, no need to mask + */ + ia64_set_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr); +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr); +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pt_regs *regs; + + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 0; +} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pt_regs *regs; + + if (!(set->flags & PFM_ITA_SETFL_INTR_ONLY)) { + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 1; + } +} + +/* + * On IA-64, the PMDs are NOT saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we need + * to save them in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags); + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_arch_unload_context(struct pfm_context *ctx, struct task_struct *task); +int pfm_arch_load_context(struct pfm_context *ctx, struct pfm_event_set *set, + struct task_struct *task); +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags); + +void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_arch_init(void); +void pfm_arch_init_percpu(void); +char *pfm_arch_get_pmu_module_name(void); + +int __pfm_use_dbregs(struct task_struct *task); +int __pfm_release_dbregs(struct task_struct *task); +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_show_session(struct seq_file *m); + +static inline int pfm_arch_pmu_acquire(void) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} /* - * Request structure used to write/read a PMC or PMD + * miscellaneous architected definitions */ -typedef struct { - unsigned int reg_num; /* which register */ - unsigned short reg_set; /* event set for this register */ - unsigned short reg_reserved1; /* for future use 
*/ - - unsigned long reg_value; /* initial pmc/pmd value */ - unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ - - unsigned long reg_long_reset; /* reset after buffer overflow notification */ - unsigned long reg_short_reset; /* reset after counter overflow */ - - unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ - unsigned long reg_random_seed; /* seed value when randomization is used */ - unsigned long reg_random_mask; /* bitmask used to limit random value */ - unsigned long reg_last_reset_val;/* return: PMD last reset value */ - - unsigned long reg_smpl_pmds[4]; /* which pmds are accessed when PMC overflows */ - unsigned long reg_smpl_eventid; /* opaque sampling event identifier */ - - unsigned long reg_reserved2[3]; /* for future use */ -} pfarg_reg_t; - -typedef struct { - unsigned int dbreg_num; /* which debug register */ - unsigned short dbreg_set; /* event set for this register */ - unsigned short dbreg_reserved1; /* for future use */ - unsigned long dbreg_value; /* value for debug register */ - unsigned long dbreg_flags; /* return: dbreg error */ - unsigned long dbreg_reserved2[1]; /* for future use */ -} pfarg_dbreg_t; - -typedef struct { - unsigned int ft_version; /* perfmon: major [16-31], minor [0-15] */ - unsigned int ft_reserved; /* reserved for future use */ - unsigned long reserved[4]; /* for future use */ -} pfarg_features_t; - -typedef struct { - pid_t load_pid; /* process to load the context into */ - unsigned short load_set; /* first event set to load */ - unsigned short load_reserved1; /* for future use */ - unsigned long load_reserved2[3]; /* for future use */ -} pfarg_load_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ - unsigned short msg_active_set; /* active set at the time of overflow */ - unsigned short msg_reserved1; /* for future use */ - unsigned int msg_reserved2; /* for future use */ - unsigned long msg_tstamp; /* for perf tuning/debug */ -} pfm_ovfl_msg_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_end_msg_t; - -typedef struct { - int msg_type; /* type of the message */ - int msg_ctx_fd; /* unique identifier for the context */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_gen_msg_t; - -#define PFM_MSG_OVFL 1 /* an overflow happened */ -#define PFM_MSG_END 2 /* task to which context was attached ended */ - -typedef union { - pfm_ovfl_msg_t pfm_ovfl_msg; - pfm_end_msg_t pfm_end_msg; - pfm_gen_msg_t pfm_gen_msg; -} pfm_msg_t; +#define PFM_ITA_FCNTR 4 /* first counting monitor (PMC/PMD) */ /* - * Define the version numbers for both perfmon as a whole and the sampling buffer format. 
+ * private event set flags (set_priv_flags) */ -#define PFM_VERSION_MAJ 2U -#define PFM_VERSION_MIN 0U -#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) -#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) -#define PFM_VERSION_MINOR(x) ((x) & 0xffff) +#define PFM_ITA_SETFL_USE_DBR 0x1000000 /* set uses debug registers */ /* - * miscellaneous architected definitions + * Itanium-specific data structures */ -#define PMU_FIRST_COUNTER 4 /* first counting monitor (PMC/PMD) */ -#define PMU_MAX_PMCS 256 /* maximum architected number of PMC registers */ -#define PMU_MAX_PMDS 256 /* maximum architected number of PMD registers */ +struct pfm_ia64_context_flags { + unsigned int use_dbr:1; /* use range restrictions (debug registers) */ + unsigned int insecure:1; /* insecure monitoring for non-self session */ + unsigned int reserved:30;/* for future use */ +}; -#ifdef __KERNEL__ +struct pfm_arch_context { + struct pfm_ia64_context_flags flags; /* arch specific ctx flags */ + u64 ctx_saved_psr_up;/* storage for ctxsw psr_up */ +#ifdef CONFIG_IA64_PERFMON_COMPAT + void *ctx_smpl_vaddr; /* vaddr of user mapping */ +#endif +}; -extern long perfmonctl(int fd, int cmd, void *arg, int narg); +#ifdef CONFIG_IA64_PERFMON_COMPAT +int pfm_ia64_compat_init(void); +int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp); +#else +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} +#endif -typedef struct { - void (*handler)(int irq, void *arg, struct pt_regs *regs); -} pfm_intr_handler_desc_t; - -extern void pfm_save_regs (struct task_struct *); -extern void pfm_load_regs (struct task_struct *); - -extern void pfm_exit_thread(struct task_struct *); -extern int pfm_use_debug_registers(struct task_struct *); -extern int pfm_release_debug_registers(struct task_struct *); -extern void pfm_syst_wide_update_task(struct task_struct *, unsigned long info, int is_ctxswin); -extern void pfm_inherit(struct task_struct *task, struct pt_regs *regs); -extern void pfm_init_percpu(void); -extern void pfm_handle_work(void); -extern int pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); -extern int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); +extern struct pfm_ia64_pmu_info *pfm_ia64_pmu_info; +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) -/* - * Reset PMD register flags - */ -#define PFM_PMD_SHORT_RESET 0 -#define PFM_PMD_LONG_RESET 1 - -typedef union { - unsigned int val; - struct { - unsigned int notify_user:1; /* notify user program of overflow */ - unsigned int reset_ovfl_pmds:1; /* reset overflowed PMDs */ - unsigned int block_task:1; /* block monitored task on kernel exit */ - unsigned int mask_monitoring:1; /* mask monitors via PMCx.plm */ - unsigned int reserved:28; /* for future use */ - } bits; -} pfm_ovfl_ctrl_t; - -typedef struct { - unsigned char ovfl_pmd; /* index of overflowed PMD */ - unsigned char ovfl_notify; /* =1 if monitor requested overflow notification */ - unsigned short active_set; /* event set active at the time of the overflow */ - pfm_ovfl_ctrl_t ovfl_ctrl; /* return: perfmon controls to set by handler */ - - unsigned long pmd_last_reset; /* last reset value of of the PMD */ - unsigned long smpl_pmds[4]; /* bitmask of other PMD of interest on overflow */ - unsigned long smpl_pmds_values[PMU_MAX_PMDS]; /* values for the other PMDs of interest */ - unsigned long pmd_value; /* current 64-bit value of the PMD */ - unsigned long 
pmd_eventid; /* eventid associated with PMD */ -} pfm_ovfl_arg_t; - - -typedef struct { - char *fmt_name; - pfm_uuid_t fmt_uuid; - size_t fmt_arg_size; - unsigned long fmt_flags; - - int (*fmt_validate)(struct task_struct *task, unsigned int flags, int cpu, void *arg); - int (*fmt_getsize)(struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size); - int (*fmt_init)(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *arg); - int (*fmt_handler)(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp); - int (*fmt_restart)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); +static inline void pfm_release_dbregs(struct task_struct *task) +{ + if (task->thread.flags & IA64_THREAD_DBG_VALID) + __pfm_release_dbregs(task); +} - struct list_head fmt_list; -} pfm_buffer_fmt_t; +#define pfm_use_dbregs(_t) __pfm_use_dbregs(_t) -extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); -extern int pfm_unregister_buffer_fmt(pfm_uuid_t uuid); - -/* - * perfmon interface exported to modules - */ -extern int pfm_mod_read_pmds(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_pmcs(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); - -/* - * describe the content of the local_cpu_date->pfm_syst_info field - */ -#define PFM_CPUINFO_SYST_WIDE 0x1 /* if set a system wide session exists */ -#define PFM_CPUINFO_DCR_PP 0x2 /* if set the system wide session has started */ -#define PFM_CPUINFO_EXCL_IDLE 0x4 /* the system wide session excludes the idle task */ - -/* - * sysctl control structure. visible to sampling formats - */ -typedef struct { - int debug; /* turn on/off debugging via syslog */ - int debug_ovfl; /* turn on/off debug printk in overflow handler */ - int fastctxsw; /* turn on/off fast (unsecure) ctxsw */ - int expert_mode; /* turn on/off value checking */ -} pfm_sysctl_t; -extern pfm_sysctl_t pfm_sysctl; +struct pfm_arch_pmu_info { + unsigned long mask_pmcs[PFM_PMC_BV]; /* PMC to modify when masking monitoring */ +}; #endif /* __KERNEL__ */ - -#endif /* _ASM_IA64_PERFMON_H */ +#endif /* _ASM_IA64_PERFMON_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-ia64/perfmon_api.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_IA64_PERFMON_API_H_ +#define _ASM_IA64_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* architecture-defined max PMCS */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* architecture-defined max PMDS */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +/* + * Itanium specific context flags + */ +#define PFM_ITA_FL_INSECURE 0x10000 /* force psr.sp=0 for non self-monitoring */ + +/* + * Itanium specific public event set flags (set_flags) + * + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags + */ +#define PFM_ITA_SETFL_EXCL_INTR 0x10000 /* exclude interrupt execution */ +#define PFM_ITA_SETFL_INTR_ONLY 0x20000 /* include only interrupt execution */ +#define PFM_ITA_SETFL_IDLE_EXCL 0x40000 /* stop monitoring in idle loop */ + +#endif /* _ASM_IA64_PERFMON_API_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_compat.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-ia64/perfmon_compat.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This header file contains perfmon interface definition + * that are now obsolete and should be dropped in favor + * of their equivalent functions as explained below. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef _ASM_IA64_PERFMON_COMPAT_H_ +#define _ASM_IA64_PERFMON_COMPAT_H_ + +/* + * custom sampling buffer identifier type + */ +typedef __u8 pfm_uuid_t[16]; + +/* + * obsolete perfmon commands. Supported only on IA-64 for + * backward compatiblity reasons with perfmon v2.0. 
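For illustration only (not part of the patch): the obsolete commands listed below map onto dedicated perfmon2 system calls. A minimal user-level sketch of programming one PMC the old way and the new way, assuming a libpfm-style header and wrapper prototypes for the new calls:

	/* illustration only -- wrapper names and header path are assumed */
	#include <perfmon/perfmon.h>

	static void program_pmc4(int ctx_fd)
	{
		struct pfarg_reg pc_old = { .reg_num = 4, .reg_value = 0 };
		struct pfarg_pmc pc_new = { .reg_num = 4, .reg_value = 0 };

		/* perfmon v2.0 (IA-64 only): one multiplexed entry point */
		perfmonctl(ctx_fd, PFM_WRITE_PMCS, &pc_old, 1);

		/* perfmon2: dedicated system call, new argument layout */
		pfm_write_pmcs(ctx_fd, &pc_new, 1);
	}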
+ */ +#define PFM_WRITE_PMCS 0x01 /* use pfm_write_pmcs */ +#define PFM_WRITE_PMDS 0x02 /* use pfm_write_pmds */ +#define PFM_READ_PMDS 0x03 /* use pfm_read_pmds */ +#define PFM_STOP 0x04 /* use pfm_stop */ +#define PFM_START 0x05 /* use pfm_start */ +#define PFM_ENABLE 0x06 /* obsolete */ +#define PFM_DISABLE 0x07 /* obsolete */ +#define PFM_CREATE_CONTEXT 0x08 /* use pfm_create_context */ +#define PFM_DESTROY_CONTEXT 0x09 /* use close() */ +#define PFM_RESTART 0x0a /* use pfm_restart */ +#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ +#define PFM_GET_FEATURES 0x0c /* use /proc/sys/perfmon */ +#define PFM_DEBUG 0x0d /* /proc/sys/kernel/perfmon/debug */ +#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ +#define PFM_GET_PMC_RESET_VAL 0x0f /* use /proc/perfmon_map */ +#define PFM_LOAD_CONTEXT 0x10 /* use pfm_load_context */ +#define PFM_UNLOAD_CONTEXT 0x11 /* use pfm_unload_context */ + +/* + * PMU model specific commands (may not be supported on all PMU models) + */ +#define PFM_WRITE_IBRS 0x20 /* obsolete: use PFM_WRITE_PMCS[256-263] */ +#define PFM_WRITE_DBRS 0x21 /* obsolete: use PFM_WRITE_PMCS[264-271] */ + +/* + * argument to PFM_CREATE_CONTEXT + */ +struct pfarg_context { + pfm_uuid_t ctx_smpl_buf_id; /* buffer format to use */ + unsigned long ctx_flags; /* noblock/block */ + unsigned int ctx_reserved1; /* for future use */ + int ctx_fd; /* return: fildesc */ + void *ctx_smpl_vaddr; /* return: vaddr of buffer */ + unsigned long ctx_reserved3[11];/* for future use */ +}; + +/* + * argument structure for PFM_WRITE_PMCS/PFM_WRITE_PMDS/PFM_WRITE_PMDS + */ +struct pfarg_reg { + unsigned int reg_num; /* which register */ + unsigned short reg_set; /* event set for this register */ + unsigned short reg_reserved1; /* for future use */ + + unsigned long reg_value; /* initial pmc/pmd value */ + unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ + + unsigned long reg_long_reset; /* reset after buffer overflow notification */ + unsigned long reg_short_reset; /* reset after counter overflow */ + + unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ + unsigned long reg_random_seed; /* seed for randomization */ + unsigned long reg_random_mask; /* random range limit */ + unsigned long reg_last_reset_val;/* return: PMD last reset value */ + + unsigned long reg_smpl_pmds[4]; /* pmds to be saved on overflow */ + unsigned long reg_smpl_eventid; /* opaque sampling event id */ + unsigned long reg_ovfl_switch_cnt;/* #overflows to switch */ + + unsigned long reg_reserved2[2]; /* for future use */ +}; + +/* + * argument to PFM_WRITE_IBRS/PFM_WRITE_DBRS + */ +struct pfarg_dbreg { + unsigned int dbreg_num; /* which debug register */ + unsigned short dbreg_set; /* event set */ + unsigned short dbreg_reserved1; /* for future use */ + unsigned long dbreg_value; /* value for debug register */ + unsigned long dbreg_flags; /* return: dbreg error */ + unsigned long dbreg_reserved2[1]; /* for future use */ +}; + +/* + * argument to PFM_GET_FEATURES + */ +struct pfarg_features { + unsigned int ft_version; /* major [16-31], minor [0-15] */ + unsigned int ft_reserved; /* reserved for future use */ + unsigned long reserved[4]; /* for future use */ +}; + +#endif /* _ASM_IA64_PERFMON_COMPAT_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_default_smpl.h =================================================================== --- linux-2.6.orig/include/asm-ia64/perfmon_default_smpl.h +++ linux-2.6/include/asm-ia64/perfmon_default_smpl.h @@ -1,83 +1,106 @@ /* - * Copyright (C) 
2002-2003 Hewlett-Packard Co - * Stephane Eranian + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian * - * This file implements the default sampling buffer format - * for Linux/ia64 perfmon subsystem. - */ -#ifndef __PERFMON_DEFAULT_SMPL_H__ -#define __PERFMON_DEFAULT_SMPL_H__ 1 + * This file implements the old default sampling buffer format + * for the perfmon2 subsystem. For IA-64 only. + * + * It requires the use of the perfmon_compat.h header. It is recommended + * that applications be ported to the new format instead. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ +#define __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ 1 + +#ifndef __ia64__ +#error "this file must be used for compatibility reasons only on IA-64" +#endif #define PFM_DEFAULT_SMPL_UUID { \ - 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82, 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} /* * format specific parameters (passed at context creation) */ -typedef struct { +struct pfm_default_smpl_arg { unsigned long buf_size; /* size of the buffer in bytes */ unsigned int flags; /* buffer specific flags */ unsigned int res1; /* for future use */ unsigned long reserved[2]; /* for future use */ -} pfm_default_smpl_arg_t; +}; /* * combined context+format specific structure. Can be passed - * to PFM_CONTEXT_CREATE + * to PFM_CONTEXT_CREATE (not PFM_CONTEXT_CREATE2) */ -typedef struct { - pfarg_context_t ctx_arg; - pfm_default_smpl_arg_t buf_arg; -} pfm_default_smpl_ctx_arg_t; +struct pfm_default_smpl_ctx_arg { + struct pfarg_context ctx_arg; + struct pfm_default_smpl_arg buf_arg; +}; /* * This header is at the beginning of the sampling buffer returned to the user. * It is directly followed by the first record. 
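As an aside (illustration only, not part of the patch): user-level code walks this buffer using the header and entry structures defined just below. A minimal sketch, assuming no extra smpl_pmds were requested so that every record has a fixed size:

	static void walk_buffer(void *smpl_vaddr)
	{
		struct pfm_default_smpl_hdr *hdr = smpl_vaddr;
		struct pfm_default_smpl_entry *ent = (void *)(hdr + 1);
		u64 i;

		for (i = 0; i < hdr->hdr_count; i++) {
			/* ent->ip: interrupted IP, ent->ovfl_pmd: overflowed PMD */
			ent++;	/* fixed-size records only (no extra smpl_pmds recorded) */
		}
	}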
*/ -typedef struct { - unsigned long hdr_count; /* how many valid entries */ - unsigned long hdr_cur_offs; /* current offset from top of buffer */ - unsigned long hdr_reserved2; /* reserved for future use */ - - unsigned long hdr_overflows; /* how many times the buffer overflowed */ - unsigned long hdr_buf_size; /* how many bytes in the buffer */ - - unsigned int hdr_version; /* contains perfmon version (smpl format diffs) */ - unsigned int hdr_reserved1; /* for future use */ - unsigned long hdr_reserved[10]; /* for future use */ -} pfm_default_smpl_hdr_t; +struct pfm_default_smpl_hdr { + u64 hdr_count; /* how many valid entries */ + u64 hdr_cur_offs; /* current offset from top of buffer */ + u64 dr_reserved2; /* reserved for future use */ + + u64 hdr_overflows; /* how many times the buffer overflowed */ + u64 hdr_buf_size; /* how many bytes in the buffer */ + + u32 hdr_version; /* smpl format version*/ + u32 hdr_reserved1; /* for future use */ + u64 hdr_reserved[10]; /* for future use */ +}; /* * Entry header in the sampling buffer. The header is directly followed - * with the values of the PMD registers of interest saved in increasing - * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends * on how the session was programmed. * * In the case where multiple counters overflow at the same time, multiple * entries are written consecutively. * - * last_reset_value member indicates the initial value of the overflowed PMD. + * last_reset_value member indicates the initial value of the overflowed PMD. */ -typedef struct { - int pid; /* thread id (for NPTL, this is gettid()) */ - unsigned char reserved1[3]; /* reserved for future use */ - unsigned char ovfl_pmd; /* index of overflowed PMD */ - - unsigned long last_reset_val; /* initial value of overflowed PMD */ - unsigned long ip; /* where did the overflow interrupt happened */ - unsigned long tstamp; /* ar.itc when entering perfmon intr. 
handler */
-
-	unsigned short cpu;		/* cpu on which the overfow occured */
-	unsigned short set;		/* event set active when overflow ocurred */
-	int tgid;			/* thread group id (for NPTL, this is getpid()) */
-} pfm_default_smpl_entry_t;
-
-#define PFM_DEFAULT_MAX_PMDS	64 /* how many pmds supported by data structures (sizeof(unsigned long) */
-#define PFM_DEFAULT_MAX_ENTRY_SIZE	(sizeof(pfm_default_smpl_entry_t)+(sizeof(unsigned long)*PFM_DEFAULT_MAX_PMDS))
-#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE	(sizeof(pfm_default_smpl_hdr_t)+PFM_DEFAULT_MAX_ENTRY_SIZE)
+struct pfm_default_smpl_entry {
+	pid_t		pid;		/* thread id (for NPTL, this is gettid()) */
+	uint8_t		reserved1[3];	/* for future use */
+	uint8_t		ovfl_pmd;	/* overflow pmd for this sample */
+	u64		last_reset_val;	/* initial value of overflowed PMD */
+	unsigned long	ip;		/* where the overflow interrupt happened */
+	u64		tstamp;		/* overflow timestamp */
+	u16		cpu;		/* cpu on which the overflow occurred */
+	u16		set;		/* event set active when overflow occurred */
+	pid_t		tgid;		/* thread group id (for NPTL, this is getpid()) */
+};
+
+#define PFM_DEFAULT_MAX_PMDS	64 /* #pmds supported */
+#define PFM_DEFAULT_MAX_ENTRY_SIZE	(sizeof(struct pfm_default_smpl_entry)+\
+					 (sizeof(u64)*PFM_DEFAULT_MAX_PMDS))
+#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE	(sizeof(struct pfm_default_smpl_hdr)+\
+					 PFM_DEFAULT_MAX_ENTRY_SIZE)
 #define PFM_DEFAULT_SMPL_VERSION_MAJ	2U
-#define PFM_DEFAULT_SMPL_VERSION_MIN	0U
-#define PFM_DEFAULT_SMPL_VERSION	(((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|(PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
+#define PFM_DEFAULT_SMPL_VERSION_MIN	1U
+#define PFM_DEFAULT_SMPL_VERSION	(((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|\
+					 (PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
-#endif /* __PERFMON_DEFAULT_SMPL_H__ */
+#endif /* __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ */
Index: linux-2.6/include/asm-ia64/processor.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/processor.h
+++ linux-2.6/include/asm-ia64/processor.h
@@ -41,7 +41,6 @@
 #define IA64_THREAD_FPH_VALID	(__IA64_UL(1) << 0)	/* floating-point high state valid? */
 #define IA64_THREAD_DBG_VALID	(__IA64_UL(1) << 1)	/* debug registers valid? */
-#define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
 #define IA64_THREAD_UAC_NOPRINT	(__IA64_UL(1) << 3)	/* don't log unaligned accesses */
 #define IA64_THREAD_UAC_SIGBUS	(__IA64_UL(1) << 4)	/* generate SIGBUS on unaligned acc.
*/ #define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration @@ -257,14 +256,6 @@ struct thread_struct { #else # define INIT_THREAD_IA32 #endif /* CONFIG_IA32_SUPPORT */ -#ifdef CONFIG_PERFMON - void *pfm_context; /* pointer to detailed PMU context */ - unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ -# define INIT_THREAD_PM .pfm_context = NULL, \ - .pfm_needs_checking = 0UL, -#else -# define INIT_THREAD_PM -#endif __u64 dbr[IA64_NUM_DBG_REGS]; __u64 ibr[IA64_NUM_DBG_REGS]; struct ia64_fpreg fph[96]; /* saved/loaded on demand */ @@ -279,7 +270,6 @@ struct thread_struct { .task_size = DEFAULT_TASK_SIZE, \ .last_fph_cpu = -1, \ INIT_THREAD_IA32 \ - INIT_THREAD_PM \ .dbr = {0, }, \ .ibr = {0, }, \ .fph = {{{{0}}}, } \ Index: linux-2.6/include/asm-ia64/system.h =================================================================== --- linux-2.6.orig/include/asm-ia64/system.h +++ linux-2.6/include/asm-ia64/system.h @@ -208,22 +208,18 @@ struct task_struct; extern void ia64_save_extra (struct task_struct *task); extern void ia64_load_extra (struct task_struct *task); -#ifdef CONFIG_PERFMON - DECLARE_PER_CPU(unsigned long, pfm_syst_info); -# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) -#else -# define PERFMON_IS_SYSWIDE() (0) -#endif - -#define IA64_HAS_EXTRA_STATE(t) \ - ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ - || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE()) +#define IA64_HAS_EXTRA_STATE(t) \ + (((t)->thread.flags & IA64_THREAD_DBG_VALID) \ + || IS_IA32_PROCESS(task_pt_regs(t))) #define __switch_to(prev,next,last) do { \ if (IA64_HAS_EXTRA_STATE(prev)) \ ia64_save_extra(prev); \ if (IA64_HAS_EXTRA_STATE(next)) \ ia64_load_extra(next); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW) \ + || test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw(prev, next); \ ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \ (last) = ia64_switch_to((next)); \ } while (0) Index: linux-2.6/include/asm-ia64/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-ia64/thread_info.h +++ linux-2.6/include/asm-ia64/thread_info.h @@ -86,11 +86,13 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_SINGLESTEP 5 /* restore singlestep on return to user mode */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ +#define TIF_PERFMON_WORK 7 /* work for pfm_handle_work() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_FREEZE 20 /* is freezing for suspend */ +#define TIF_PERFMON_CTXSW 21 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -104,9 +106,11 @@ struct thread_info { #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) #define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW) +#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK) /* "work to do on user-return" bits */ -#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_RESTORE_SIGMASK) +#define TIF_ALLWORK_MASK 
(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_RESTORE_SIGMASK|_TIF_PERFMON_WORK) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) Index: linux-2.6/include/asm-ia64/unistd.h =================================================================== --- linux-2.6.orig/include/asm-ia64/unistd.h +++ linux-2.6/include/asm-ia64/unistd.h @@ -299,11 +299,23 @@ #define __NR_signalfd 1307 #define __NR_timerfd 1308 #define __NR_eventfd 1309 +#define __NR_pfm_create_context 1310 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define NR_syscalls 286 /* length of syscall table */ +#define NR_syscalls 298 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about Index: linux-2.6/include/asm-mips/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-mips/perfmon.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2005 Philip Mucci. + * + * Based on other versions: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_H_ +#define _ASM_MIPS64_PERFMON_H_ + +#ifdef __KERNEL__ + +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 + +struct pfm_arch_pmu_info { + u32 pmu_style; +}; + +#define MIPS64_CONFIG_PMC_MASK (1 << 4) +#define MIPS64_PMC_INT_ENABLE_MASK (1 << 4) +#define MIPS64_PMC_CNT_ENABLE_MASK (0xf) +#define MIPS64_PMC_EVT_MASK (0x7 << 6) +#define MIPS64_PMC_CTR_MASK (1 << 31) +#define MIPS64_PMD_INTERRUPT (1 << 31) + +/* Coprocessor register 25 contains the PMU interface. */ +/* Sel 0 is control for counter 0 */ +/* Sel 1 is count for counter 0. */ +/* Sel 2 is control for counter 1. */ +/* Sel 3 is count for counter 1. 
*/ + +/* + +31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +M 0--------------------------------------------------------------0 Event-- IE U S K EXL + +M 31 If this bit is one, another pair of Performance Control +and Counter registers is implemented at a MTC0 + +Event 8:5 Counter event enabled for this counter. Possible events +are listed in Table 6-30. R/W Undefined + +IE 4 Counter Interrupt Enable. This bit masks bit 31 of the +associated count register from the interrupt exception +request output. R/W 0 + +U 3 Count in User Mode. When this bit is set, the specified +event is counted in User Mode. R/W Undefined + +S 2 Count in Supervisor Mode. When this bit is set, the +specified event is counted in Supervisor Mode. R/W Undefined + +K 1 Count in Kernel Mode. When this bit is set, count the +event in Kernel Mode when EXL and ERL both are 0. R/W Undefined + +EXL 0 Count when EXL. When this bit is set, count the event +when EXL = 1 and ERL = 0. R/W Undefined +*/ + +static inline void pfm_arch_resend_irq(void) +{} + +static inline void pfm_arch_serialize(void) +{} + + +static inline void pfm_arch_unfreeze_pmu(void) +{} + +/* + * MIPS does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && (ctx->flags.started == 0)) + return; + + switch(pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + write_c0_perfctrl0(value); + break; + case 1: + write_c0_perfctrl1(value); + break; + case 2: + write_c0_perfctrl2(value); + break; + case 3: + write_c0_perfctrl3(value); + break; + default: + BUG(); + } +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + value &= pfm_pmu_conf->ovfl_mask; + + switch(pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + write_c0_perfcntr0(value); + break; + case 1: + write_c0_perfcntr1(value); + break; + case 2: + write_c0_perfcntr2(value); + break; + case 3: + write_c0_perfcntr3(value); + break; + default: + BUG(); + } +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + switch(pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + return read_c0_perfcntr0(); + break; + case 1: + return read_c0_perfcntr1(); + break; + case 2: + return read_c0_perfcntr2(); + break; + case 3: + return read_c0_perfcntr3(); + break; + default: + BUG(); + return 0; + } +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + switch(pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + return read_c0_perfctrl0(); + break; + case 1: + return read_c0_perfctrl1(); + break; + case 2: + return read_c0_perfctrl2(); + break; + case 3: + return read_c0_perfctrl3(); + break; + default: + BUG(); + return 0; + } +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. 
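A generic illustration of why the upper bits matter (not part of the patch, and not tied to any particular PMU model): with a hardware counter only "width" bits wide, an interrupt after "period" events is obtained by priming the counter close to its overflow point, which is what leaves the upper bits set:

	/* illustration only: width < 64 and period <= 2^width assumed;
	 * e.g. width = 32, period = 4096 -> 0xfffff000 */
	static inline u64 prime_counter(u64 period, unsigned int width)
	{
		u64 mask = (1ULL << width) - 1;

		return (mask + 1 - period) & mask;
	}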
+ */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in ctx.flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); + +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx, set); + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() on + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On MIPS, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on MIPS masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + +static inline void pfm_arch_pmu_config_remove(void) +{} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + + + + + +/* + * function called from pfm_setfl_sane(). 
Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int +pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + return 0; +} + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + return 0; +} + +static inline int pfm_arch_pmu_acquire(void) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} + +/* + * not used for mips + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +struct pfm_arch_context { + /* empty */ +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) + +#endif /* __KERNEL__ */ +#endif /* _ASM_MIPS64_PERFMON_H_ */ Index: linux-2.6/include/asm-mips/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-mips/perfmon_api.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_API_H_ +#define _ASM_MIPS64_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +#endif /* _ASM_MIPS64_PERFMON_API_H_ */ Index: linux-2.6/include/asm-mips/smp.h =================================================================== --- linux-2.6.orig/include/asm-mips/smp.h +++ linux-2.6/include/asm-mips/smp.h @@ -109,6 +109,8 @@ static inline void smp_send_reschedule(i core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF); } +extern int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait); extern asmlinkage void smp_call_function_interrupt(void); #endif /* CONFIG_SMP */ Index: linux-2.6/include/asm-mips/system.h =================================================================== --- linux-2.6.orig/include/asm-mips/system.h +++ linux-2.6/include/asm-mips/system.h @@ -65,6 +65,9 @@ do { \ do { \ if (cpu_has_dsp) \ __save_dsp(prev); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW) \ + || test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw(prev, next); \ (last) = resume(prev, next, task_thread_info(next)); \ if (cpu_has_dsp) \ __restore_dsp(current); \ Index: linux-2.6/include/asm-mips/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-mips/thread_info.h +++ linux-2.6/include/asm-mips/thread_info.h @@ -115,10 +115,12 @@ register struct thread_info *__current_t #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_SECCOMP 5 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ +#define TIF_PERFMON_WORK 10 /* work for pfm_handle_work() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 #define TIF_FREEZE 19 +#define TIF_PERFMON_CTXSW 21 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_TRACE 31 /* syscall trace active */ #define _TIF_SYSCALL_TRACE (1<> 24) & 0xff) +#define CBE_PM_ALL_OVERFLOW_INTR 0xff000000 +#define CBE_PM_INTERVAL_INTR 0x00800000 +#define CBE_PM_TRACE_BUFFER_FULL_INTR 0x00400000 +#define CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR 0x00200000 enum pm_reg_name { group_control, Index: linux-2.6/include/asm-powerpc/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on other versions: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_H_ +#define _ASM_POWERPC_PERFMON_H_ + +#ifdef __KERNEL__ + +#include + +enum powerpc_pmu_type { + PFM_POWERPC_PMU_NONE, + PFM_POWERPC_PMU_604, + PFM_POWERPC_PMU_604e, + PFM_POWERPC_PMU_750, /* XXX: Minor event set diffs between IBM and Moto. */ + PFM_POWERPC_PMU_7400, + PFM_POWERPC_PMU_7450, + PFM_POWERPC_PMU_POWER5, + PFM_POWERPC_PMU_CELL, +}; + +struct pfm_arch_pmu_info { + enum powerpc_pmu_type pmu_style; + + void (*write_pmc)(unsigned int cnum, u64 value); + void (*write_pmd)(unsigned int cnum, u64 value); + + u64 (*read_pmd)(unsigned int cnum); + + void (*enable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + void (*disable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + + void (*irq_handler)(struct pt_regs *regs, struct pfm_context *ctx); + void (*get_ovfl_pmds)(struct pfm_context *ctx, + struct pfm_event_set *set); + + /* The following routines are optional. */ + void (*restore_pmcs)(struct pfm_event_set *set); + void (*restore_pmds)(struct pfm_event_set *set); + + int (*ctxswout_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + void (*ctxswin_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + int (*load_context)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task); + int (*unload_context)(struct pfm_context *ctx, + struct task_struct *task); +}; + +#ifdef CONFIG_PPC32 +#define PFM_ARCH_PMD_STK_ARG 6 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 6 /* conservative value */ +#else +#define PFM_ARCH_PMD_STK_ARG 8 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 8 /* conservative value */ +#endif + +static inline void pfm_arch_resend_irq(void) +{} + +static inline void pfm_arch_serialize(void) +{} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, + u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + BUG_ON(!arch_info->write_pmc); + + arch_info->write_pmc(cnum, value); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + value &= pfm_pmu_conf->ovfl_mask; + + BUG_ON(!arch_info->write_pmd); + + arch_info->write_pmd(cnum, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + BUG_ON(!arch_info->read_pmd); + + return arch_info->read_pmd(cnum); +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. 
On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val = pfm_arch_read_pmd(ctx, cnum); + + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +void pfm_arch_init_percpu(void); +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx, set); +} + +void powerpc_irq_handler(struct pt_regs *regs); + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + arch_info = pfm_pmu_conf->arch_info; + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, ctx->active_set); +} + +/* + * PowerPC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On PPC, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. 
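Stepping back for a moment (illustration only, not part of the patch): the struct pfm_arch_pmu_info callback table defined earlier is what a PMU description module fills in. A hypothetical POWER5 backend might look like the following; all names below are invented for the sketch:

	static void p5_write_pmc(unsigned int cnum, u64 value)
	{
		/* mtspr on the control SPR selected by cnum */
	}

	static void p5_write_pmd(unsigned int cnum, u64 value)
	{
		/* mtspr on the counter SPR selected by cnum */
	}

	static u64 p5_read_pmd(unsigned int cnum)
	{
		return 0;	/* mfspr on the counter SPR selected by cnum */
	}

	static struct pfm_arch_pmu_info p5_pmu_info = {
		.pmu_style	= PFM_POWERPC_PMU_POWER5,
		.write_pmc	= p5_write_pmc,
		.write_pmd	= p5_write_pmd,
		.read_pmd	= p5_read_pmd,
		/* enable_counters, disable_counters, irq_handler and
		 * get_ovfl_pmds must also be provided; the restore and
		 * load/unload hooks are optional, as noted in the
		 * structure definition. */
	};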
Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on x86 masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +static inline void pfm_arch_pmu_config_remove(void) +{} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + int rc = 0; + + if (arch_info->load_context) { + rc = arch_info->load_context(ctx, set, task); + } + + return rc; +} + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + int rc = 0; + + if (arch_info->unload_context) { + rc = arch_info->unload_context(ctx, task); + } + + return rc; +} + +/* + * not applicable to powerpc + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +static inline int pfm_arch_pmu_acquire(void) +{ + return reserve_pmc_hardware(powerpc_irq_handler); +} + +static inline void pfm_arch_pmu_release(void) +{ + release_pmc_hardware(); +} + +struct pfm_arch_context { + /* Cell: Most recent value of the pm_status + * register read by the interrupt handler. + * + * Interrupt handler sets last_read_updated if it + * just read and updated last_read_pm_status + */ + u32 last_read_pm_status; + u32 last_read_updated; +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_PERFMON_H_ */ Index: linux-2.6/include/asm-powerpc/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon_api.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_API_H_ +#define _ASM_POWERPC_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + + +#endif /* _ASM_POWERPC_PERFMON_API_H_ */ Index: linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h @@ -0,0 +1,106 @@ +/* + * Copyright IBM Corp 2007 + * + * Contributed by Carl Love + * and Kevin Corry + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ +#define __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ + +/** + * struct pfm_cell_hw_smpl_arg + * + * @buf_size: Size of full sampling-buffer (in bytes). + * @buf_flags: Sampling-buffer-specific flags. + * @reserved1: Pad to 128-bit boundary. + * + * Format specific parameters (passed at context creation). + **/ +struct pfm_cell_hw_smpl_arg { + __u64 buf_size; + __u32 buf_flags; + __u32 reserved1; +}; + +/** + * struct pfm_cell_hw_smpl_hdr + * + * @count: Number of valid sampling-buffer entries. + * @cur_offset: Offset (in bytes) from the top of the sampling-buffer to the + * next available space for a sampling-buffer entry. + * @overflows: Number of times the sampling-buffer has filled up. + * @buf_size: Total bytes in the sampling-buffer. + * @version: Sampling module version. + * @buf_flags: Copy of buf_flags from pfm_cell_hw_smpl_arg. + * @reserved1: Pad to 128-bit boundary. + * + * This header is at the beginning of the sampling-buffer returned to the user. + * It is directly followed by the first record. + **/ +struct pfm_cell_hw_smpl_hdr { + __u64 count; + __u64 cur_offset; + __u64 overflows; + __u64 buf_size; + __u32 version; + __u32 buf_flags; + __u64 reserved1; +}; + +/** + * struct pfm_cell_hw_smpl_entry_hdr + * + * @pid: Thread ID. + * @tgid: Thread group ID. For NPTL, this is getpid(). + * @cpu: CPU on which the overflow occurred. + * @set: Event-set that was active when the overflow occurred. + * @num_samples: Number of 128-bit trace-buffer samples in this entry. + * @entry_num: Sequence number of sampling-buffer entries. + * + * The header for each data entry in the sampling-buffer. 
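For illustration only (not part of the patch): since each trace-buffer line is 128 bits, i.e. 16 bytes, stepping from one entry of the structure defined just below to the next is simply:

	static inline void *next_entry(struct pfm_cell_hw_smpl_entry_hdr *ent)
	{
		/* header is followed by num_samples 16-byte trace-buffer lines */
		return (char *)(ent + 1) + ent->num_samples * 16;
	}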
The entry header + * is immediately followed by the contents of the trace-buffer. Each line in + * the trace-buffer is 128 bits wide. The entry header specifies the number + * of 128-bit trace-buffer lines that follow the header. + **/ +struct pfm_cell_hw_smpl_entry_hdr { + __u32 pid; + __u32 tgid; + __u16 cpu; + __u16 set; + __u16 num_samples; + __u16 entry_num; +}; + +/* The max size of each sampling-buffer entry is the size of the entry header + * plus the full size of the trace-buffer. + */ +#define PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE \ + (sizeof(struct pfm_cell_hw_smpl_entry_hdr) + \ + 2 * sizeof(u64) * CBE_PM_TRACE_BUF_MAX_COUNT) + +/* The sampling-buffer must be at least as large as the sampling-buffer header + * and the max size of one sampling-buffer entry. + */ +#define PFM_CELL_HW_SMPL_MIN_BUF_SIZE (sizeof(struct pfm_cell_hw_smpl_hdr) + \ + PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) + +#define PFM_CELL_HW_SMPL_VERSION 1 +#define PFM_CELL_HW_SMPL_NAME "perfmon_cell_hw_smpl" +#define PFM_CELL_HW_SMPL_OVFL_PMD (PFM_MAX_PMDS + 1) +#define PFM_MSG_CELL_HW_SMPL_BUF_FULL 99 + +#endif /* __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ */ Index: linux-2.6/include/asm-powerpc/systbl.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/systbl.h +++ linux-2.6/include/asm-powerpc/systbl.h @@ -312,3 +312,15 @@ COMPAT_SYS_SPU(signalfd) COMPAT_SYS_SPU(timerfd) SYSCALL_SPU(eventfd) COMPAT_SYS_SPU(sync_file_range2) +SYSCALL(pfm_create_context) +SYSCALL(pfm_write_pmcs) +SYSCALL(pfm_write_pmds) +SYSCALL(pfm_read_pmds) +SYSCALL(pfm_load_context) +SYSCALL(pfm_start) +SYSCALL(pfm_stop) +SYSCALL(pfm_restart) +SYSCALL(pfm_create_evtsets) +SYSCALL(pfm_getinfo_evtsets) +SYSCALL(pfm_delete_evtsets) +SYSCALL(pfm_unload_context) Index: linux-2.6/include/asm-powerpc/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/thread_info.h +++ linux-2.6/include/asm-powerpc/thread_info.h @@ -147,7 +147,7 @@ static inline struct thread_info *curren #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP) #define _TIF_USER_WORK_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ - _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) + _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK| _TIF_PERFMON_WORK) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) /* Bits in local_flags */ Index: linux-2.6/include/asm-powerpc/unistd.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/unistd.h +++ linux-2.6/include/asm-powerpc/unistd.h @@ -331,10 +331,22 @@ #define __NR_timerfd 306 #define __NR_eventfd 307 #define __NR_sync_file_range2 308 +#define __NR_pfm_create_context 309 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define __NR_syscalls 309 +#define __NR_syscalls 321 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls Index: 
linux-2.6/include/asm-x86_64/hw_irq.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/hw_irq.h +++ linux-2.6/include/asm-x86_64/hw_irq.h @@ -84,6 +84,7 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef +#define LOCAL_PERFMON_VECTOR 0xee /* * First APIC vector available to drivers: (vectors 0x30-0xee) Index: linux-2.6/include/asm-x86_64/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon.h @@ -0,0 +1 @@ +#include Index: linux-2.6/include/asm-x86_64/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon_api.h @@ -0,0 +1 @@ +#include Index: linux-2.6/include/asm-x86_64/perfmon_pebs_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon_pebs_smpl.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include Index: linux-2.6/include/asm-x86_64/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/thread_info.h +++ linux-2.6/include/asm-x86_64/thread_info.h @@ -123,6 +123,7 @@ static inline struct thread_info *stack_ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_PERFMON_CTXSW 24 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1< + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __LINUX_PERFMON_H__ +#define __LINUX_PERFMON_H__ + +#ifdef CONFIG_PERFMON + +/* + * include arch-specific constants and user visible definitions + */ +#include + +#define PFM_MAX_PMCS (PFM_ARCH_MAX_HW_PMCS+PFM_ARCH_MAX_SW_PMCS) +#define PFM_MAX_PMDS (PFM_ARCH_MAX_HW_PMDS+PFM_ARCH_MAX_SW_PMDS) + +/* + * number of elements for each type of bitvector + * all bitvectors use u64 fixed size type on all architectures. 
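A worked example of the sizing macro defined just below (editorial illustration, not part of the patch): with the per-architecture limits shown above (256 hardware plus 64 software registers),

	/* PFM_MAX_PMDS   = 256 + 64 = 320
	 * PFM_PMD_BV     = PFM_BVSIZE(320) = (320 + 63) / 64 = 5 u64 elements
	 * PFM_HW_PMD_BV  = PFM_BVSIZE(256) = (256 + 63) / 64 = 4 u64 elements
	 * so reg_smpl_pmds[], reg_reset_pmds[] and friends are 5-word bitmasks. */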
+ */ +#define PFM_BVSIZE(x) (((x)+(sizeof(u64)<<3)-1) / (sizeof(u64)<<3)) +#define PFM_HW_PMD_BV PFM_BVSIZE(PFM_ARCH_MAX_HW_PMDS) +#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS) +#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS) +/* + * PMC/PMD flags to use with pfm_write_pmds() or pfm_write_pmcs() + * + * reg_flags layout: + * bit 00-15 : generic flags + * bit 16-23 : arch-specific flags + * bit 24-31 : error codes + */ +#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on overflow */ +#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize sampling interval */ +#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation for counter */ + +/* + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags (see asm/perfmon.h) + */ +#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */ +#define PFM_SETFL_TIME_SWITCH 0x02 /* enable switch on timeout */ + +/* + * PMD/PMC return flags in case of error (ignored on input) + * + * reg_flags layout: + * bit 00-15 : generic flags + * bits[16-23] : arch-specific flags (see asm/perfmon.h) + * bit 24-31 : error codes + * + * Those flags are used on output and must be checked in case EINVAL is + * returned by a command accepting a vector of values and each has a flag + * field, such as pfarg_pmc or pfarg_pmd. + */ +#define PFM_REG_RETFL_NOTAVAIL (1<<31) /* not implemented or unaccessible */ +#define PFM_REG_RETFL_EINVAL (1<<30) /* entry is invalid */ +#define PFM_REG_RETFL_NOSET (1<<29) /* event set does not exist */ +#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\ + PFM_REG_RETFL_EINVAL|\ + PFM_REG_RETFL_NOSET) + +#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) + +/* + * argument to pfm_create_context() system call + * structure shared with user level + */ +struct pfarg_ctx { + __u32 ctx_flags; /* noblock/block/syswide */ + __u32 ctx_reserved1; /* ret arg: fd for context */ + __u64 ctx_reserved2[7]; /* for future use */ +}; + +/* + * context flags (ctx_flags) + * + */ +#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */ +#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ +#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */ +#define PFM_FL_MAP_SETS 0x10 /* event sets are remapped */ + +/* + * argument to pfm_write_pmcs() system call. + * structure shared with user level + */ +struct pfarg_pmc { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* input: flags, return: reg error */ + __u64 reg_value; /* pmc value */ + __u64 reg_reserved2[4]; /* for future use */ +}; + +/* + * argument to pfm_write_pmds() and pfm_read_pmds() system calls. 
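For illustration only (not part of the patch): the per-register return flags above are how a caller locates the entry that made a vector command fail. A user-level sketch, assuming a libpfm-style pfm_write_pmcs() wrapper:

	static int write_pmcs_checked(int ctx_fd, struct pfarg_pmc *pmcs, int count)
	{
		int i, ret;

		ret = pfm_write_pmcs(ctx_fd, pmcs, count);
		if (ret) {
			for (i = 0; i < count; i++)
				if (PFM_REG_HAS_ERROR(pmcs[i].reg_flags))
					break;	/* pmcs[i] was rejected */
		}
		return ret;
	}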
+ * structure shared with user level + */ +struct pfarg_pmd { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* input: flags, return: reg error */ + __u64 reg_value; /* initial pmc/pmd value */ + __u64 reg_long_reset; /* value to reload after notification */ + __u64 reg_short_reset; /* reset after counter overflow */ + __u64 reg_last_reset_val; /* return: PMD last reset value */ + __u64 reg_ovfl_switch_cnt; /* #overflows before switch */ + __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */ + __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */ + __u64 reg_smpl_eventid; /* opaque event identifier */ + __u64 reg_random_mask; /* bitmask used to limit random value */ + __u32 reg_random_seed; /* seed for randomization (OBSOLETE) */ + __u32 reg_reserved2[7]; /* for future use */ +}; + +/* + * optional argument to pfm_start() system call. Pass NULL if not needed. + * structure shared with user level + */ +struct pfarg_start { + __u16 start_set; /* event set to start with */ + __u16 start_reserved1; /* for future use */ + __u32 start_reserved2; /* for future use */ + __u64 reserved3[3]; /* for future use */ +}; + +/* + * argument to pfm_load_context() system call. + * structure shared with user level + */ +struct pfarg_load { + __u32 load_pid; /* thread or CPU to attach to */ + __u16 load_set; /* set to load first */ + __u16 load_reserved1; /* for future use */ + __u64 load_reserved2[3]; /* for future use */ +}; + +/* + * argument to pfm_create_evtsets() and pfm_delete_evtsets() system calls. + * structure shared with user level. + */ +struct pfarg_setdesc { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* input: flags, return: err flag */ + __u64 set_timeout; /* req/eff switch timeout in nsecs */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * argument to pfm_getinfo_evtsets() system call. + * structure shared with user level + */ +struct pfarg_setinfo { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* out:flags or error */ + __u64 set_ovfl_pmds[PFM_HW_PMD_BV]; /* out: last ovfl PMDs */ + __u64 set_runs; /* out: #times the set was active */ + __u64 set_timeout; /* out: effective/leftover switch timeout in nsecs */ + __u64 set_act_duration; /* out: time set was active in nsecs */ + __u64 set_avail_pmcs[PFM_PMC_BV];/* unavailable PMCs */ + __u64 set_avail_pmds[PFM_PMD_BV];/* unavailable PMDs */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * default value for the user and group security parameters in + * /proc/sys/kernel/perfmon/sys_group + * /proc/sys/kernel/perfmon/task_group + */ +#define PFM_GROUP_PERM_ANY -1 /* any user/group */ + +/* + * overflow notification message. + * structure shared with user level + */ +struct pfarg_ovfl_msg { + __u32 msg_type; /* message type: PFM_MSG_OVFL */ + __u32 msg_ovfl_pid; /* process id */ + __u64 msg_ovfl_pmds[PFM_HW_PMD_BV];/* overflowed PMDs */ + __u16 msg_active_set; /* active set at overflow */ + __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */ + __u32 msg_ovfl_tid; /* thread id */ + __u64 msg_ovfl_ip; /* IP on PMU intr */ +}; + +#define PFM_MSG_OVFL 1 /* an overflow happened */ +#define PFM_MSG_END 2 /* task to which context was attached ended */ + +/* + * generic notification message (union). 
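For illustration only (not part of the patch): in the perfmon2 design, notifications are consumed from user level by reading the context file descriptor. A minimal sketch using the union defined just below:

	#include <unistd.h>

	static void drain_messages(int ctx_fd)
	{
		union pfarg_msg msg;

		while (read(ctx_fd, &msg, sizeof(msg)) == sizeof(msg)) {
			if (msg.type == PFM_MSG_OVFL)
				;	/* msg.pfm_ovfl_msg.msg_ovfl_pmds[] holds the overflowed PMDs */
			else if (msg.type == PFM_MSG_END)
				break;	/* monitored task has exited */
		}
	}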
+ * union shared with user level + */ +union pfarg_msg { + __u32 type; + struct pfarg_ovfl_msg pfm_ovfl_msg; +}; + +/* + * perfmon version number + */ +#define PFM_VERSION_MAJ 2U +#define PFM_VERSION_MIN 5U +#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\ + (PFM_VERSION_MIN & 0xffff)) +#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) +#define PFM_VERSION_MINOR(x) ((x) & 0xffff) + +/* + * This part of the header file is meant for kernel level code only including + * kernel modules + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include + +/* + * perfmon context state + */ +#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ +#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ +#define PFM_CTX_MASKED 3 /* context is loaded, monitoring is masked */ +#define PFM_CTX_ZOMBIE 4 /* context lost owner but is still attached */ + +/* + * depth of message queue + */ +#define PFM_MSGS_ORDER 3 /* log2(number of messages) */ +#define PFM_MSGS_COUNT (1</proc/sys/kernel/printk_ratelimit + */ +#ifdef CONFIG_PERFMON_DEBUG +#define PFM_DBG(f, x...) \ + do { \ + if (unlikely(pfm_controls.debug >0 && printk_ratelimit())) { \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __FUNCTION__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + } \ + } while (0) + +#define PFM_DBG_ovfl(f, x...) \ + do { \ + if (unlikely(pfm_controls.debug_ovfl >0 && printk_ratelimit())) { \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __FUNCTION__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + } \ + } while (0) +#else +#define PFM_DBG(f, x...) do {} while(0) +#define PFM_DBG_ovfl(f, x...) do {} while(0) +#endif + +/* + * PMD information + */ +struct pfm_pmd { + u64 value; /* currnet 64-bit value */ + u64 lval; /* last reset value */ + u64 ovflsw_thres; /* #overflows left before switching */ + u64 long_reset; /* reset value on sampling overflow */ + u64 short_reset; /* reset value on overflow */ + u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */ + u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */ + u64 mask; /* mask for generator */ + u32 flags; /* notify/do not notify */ + u64 ovflsw_ref_thres; /* #overflows before switching to next set */ + u64 eventid; /* overflow event identifier */ +}; + +/* + * perfmon context: encapsulates all the state of a monitoring session + */ +struct pfm_event_set { + u16 id; + u16 id_next; /* which set to go to from this one */ + u32 flags; /* public set flags */ + u64 runs; /* number of activations */ + struct list_head list; /* next in the ordered list */ + struct pfm_event_set *sw_next; /* address of set to go to */ + u32 priv_flags; /* private flags */ + u32 npend_ovfls; /* number of pending PMD overflow */ + + u64 used_pmds[PFM_PMD_BV]; /* used PMDs */ + u64 povfl_pmds[PFM_HW_PMD_BV]; /* pending overflowed PMDs */ + u64 ovfl_pmds[PFM_HW_PMD_BV]; /* last overflowed PMDs */ + u64 reset_pmds[PFM_PMD_BV]; /* union of PMDs to reset */ + u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */ + u64 pmcs[PFM_MAX_PMCS]; /* PMC values */ + + u16 nused_pmds; /* max number of used PMDs */ + u16 nused_pmcs; /* max number of used PMCs */ + + struct pfm_pmd pmds[PFM_MAX_PMDS]; /* 64-bit SW PMDs */ + u64 timeout_sw_ref; /* switch timeout reference */ + u64 timeout_sw_left; /* timeout remaining */ + u64 timeout_sw_exp; /* timeout expiration jiffies */ + u64 duration_start; /* start ns */ + u64 duration; /* total active ns */ + u64 used_pmcs[PFM_PMC_BV]; /* used PMCs (keep for arbitration) */ +}; + +/* + * common private 
event set flags (priv_flags) + * + * upper 16 bits: for arch-specific use + * lower 16 bits: for common use + */ +#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */ +#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */ +#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */ +#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS | PFM_SETFL_PRIV_MOD_PMCS) + +/* + * context flags + */ +struct pfm_context_flags { + unsigned int block:1; /* task blocks on user notifications */ + unsigned int system:1; /* do system wide monitoring */ + unsigned int no_msg:1; /* no message sent on overflow */ + unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ + unsigned int switch_ovfl:1; /* switch set on counter ovfl */ + unsigned int switch_time:1; /* switch set on timeout */ + unsigned int started:1; /* pfm_start() issued */ + unsigned int work_type:2; /* type of work for pfm_handle_work */ + unsigned int mmap_nlock:1; /* no lock in pfm_release_buf_space */ + unsigned int reserved:20; /* for future use */ +}; + +/* + * values for work_type (TIF_PERFMON_WORK must be set) + */ +#define PFM_WORK_NONE 0 /* nothing to do */ +#define PFM_WORK_RESET 1 /* reset overflowed counters */ +#define PFM_WORK_BLOCK 2 /* block current thread */ +#define PFM_WORK_ZOMBIE 3 /* cleanup zombie context */ + +/* + * check_mask bitmask values for pfm_check_task_state() + */ +#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */ +#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */ +#define PFM_CMD_UNLOAD 0x04 /* command is unload */ + +#include +#include + +/* + * context: encapsulates all the state of a monitoring session + */ +struct pfm_context { + spinlock_t lock; /* context protection */ + + struct pfm_context_flags flags; /* flags */ + u32 state; /* state */ + struct task_struct *task; /* attached task */ + + struct completion restart_complete;/* block on notification */ + u64 last_act; /* last activation */ + u32 last_cpu; /* last CPU used (SMP only) */ + u32 cpu; /* cpu bound to context */ + + struct pfm_smpl_fmt *smpl_fmt; /* buffer format callbacks */ + void *smpl_addr; /* smpl buffer base */ + size_t smpl_size; + + wait_queue_head_t msgq_wait; /* used when flags.kapi=0 */ + union pfarg_msg msgq[PFM_MSGS_COUNT]; + int msgq_head; + int msgq_tail; + + struct fasync_struct *async_queue; + + u64 set_all_runs; /* total number of set activations */ + struct pfm_event_set *active_set; /* active set */ + struct list_head list; /* ordered list of sets */ + + /* + * save stack space by allocating temporary variables for + * pfm_overflow_handler() in pfm_context + */ + struct pfm_ovfl_arg ovfl_arg; + u64 ovfl_ovfl_notify[PFM_PMD_BV]; +}; + +static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c) +{ + return (struct pfm_arch_context *)(c+1); +} + +static inline void pfm_set_pmu_owner(struct task_struct *task, + struct pfm_context *ctx) +{ + BUG_ON(task && task->pid == 0); + __get_cpu_var(pmu_owner) = task; + __get_cpu_var(pmu_ctx) = ctx; +} + +static inline void pfm_inc_activation(void) +{ + __get_cpu_var(pmu_activation_number)++; +} + +static inline void pfm_set_activation(struct pfm_context *ctx) +{ + ctx->last_act = __get_cpu_var(pmu_activation_number); +} + +static inline void pfm_set_last_cpu(struct pfm_context *ctx, int cpu) +{ + ctx->last_cpu = cpu; +} + +static inline void pfm_retflag_set(u32 flags, u32 val) +{ + flags &= ~PFM_REG_RETFL_MASK; + flags |= (val); +} + +extern struct pfm_pmu_config *pfm_pmu_conf; +extern struct 
pfm_controls pfm_controls; +extern int perfmon_disabled; + +int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, + void **req, void **to_free); + +int pfm_get_task(struct pfm_context *ctx, pid_t pid, struct task_struct **task); +int pfm_get_smpl_arg(char __user *fmt_uname, void __user *uaddr, size_t usize, void **arg, + struct pfm_smpl_fmt **fmt); + +int pfm_alloc_fd(struct file **cfile); + +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count); +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat); +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count); +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, + struct task_struct *task); +int __pfm_unload_context(struct pfm_context *ctx, int *can_release); +int __pfm_stop(struct pfm_context *ctx); +int __pfm_restart(struct pfm_context *ctx, int *complete_needed); +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start); +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count); +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count); +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count); + +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx); + +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags); + +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, + int alloc); + +struct pfm_context *pfm_get_ctx(int fd); + +void pfm_context_free(struct pfm_context *ctx); +struct pfm_context *pfm_context_alloc(void); +int pfm_pmu_conf_get(int autoload); +void pfm_pmu_conf_put(void); + +int pfm_pmu_acquire(void); +void pfm_pmu_release(void); + +int pfm_reserve_session(int is_system, u32 cpu); +int pfm_release_session(int is_system, u32 cpu); + +int pfm_reserve_allcpus(void); +int pfm_release_allcpus(void); + +int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize); +int pfm_reserve_buf_space(size_t size); +void pfm_release_buf_space(struct pfm_context *ctx, size_t size); + +struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name); +void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt); + +int pfm_init_sysfs(void); +ssize_t pfm_sysfs_session_show(char *buf, size_t sz, int what); +int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu); +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt); +int pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt); + +int pfm_sysfs_add_cpu(int mycpu); +void pfm_sysfs_del_cpu(int mycpu); + +void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs); +void pfm_save_prev_context(struct pfm_context *ctxp); + +void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set, + int num_pmds, + int reset_mode); + +void __pfm_handle_switch_timeout(void); +int pfm_prepare_sets(struct pfm_context *ctx, struct pfm_event_set *act_set); +int pfm_sets_init(void); + +int pfm_mmap_set(struct pfm_context *ctx, struct vm_area_struct *vma, + size_t size); + +void pfm_free_sets(struct pfm_context *ctx); +void pfm_init_evtset(struct pfm_event_set *set); +void pfm_switch_sets_from_intr(struct pfm_context *ctx); +void pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart); + +union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx); +void pfm_save_pmds(struct pfm_context *ctx, 
struct pfm_event_set *set); +int pfm_notify_user(struct pfm_context *ctx); +int pfm_ovfl_notify_user(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip); +void pfm_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_init_fs(void); + +struct pfm_stats { + u64 ovfl_intr_replay_count; /* replayed ovfl interrupts */ + u64 ovfl_intr_regular_count; /* processed ovfl interrupts */ + u64 ovfl_intr_all_count; /* total ovfl interrupts */ + u64 ovfl_intr_ns; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase1; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase2; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase3; /* cycles in ovfl interrupts */ + u64 fmt_handler_calls; /* # calls smpl buffer handler */ + u64 fmt_handler_ns; /* cycle in smpl format handler */ + u64 set_switch_count; /* #set_switches on this CPU */ + u64 set_switch_ns; /* cycles for switching sets */ + u64 ctxsw_count; /* #context switches on this CPU */ + u64 ctxsw_ns; /* cycles for context switches */ + u64 handle_timeout_count; /* #of set timeouts handled */ + u64 ovfl_intr_nmi_count; /* number of NMI-base ovfl */ + u64 handle_work_count; /* calls to pfm_handle_work */ + u64 ovfl_notify_count; /* notification messages */ + u64 reset_pmds_count; /* calls to pfm_reset_pmds */ + u64 pfm_restart_count; /* calls to pfm_restart_count */ + u64 ccnt0; + u64 ccnt1; + u64 ccnt2; + u64 ccnt3; + u64 ccnt4; + u64 ccnt5; + u64 ccnt6; + struct kobject kobj; /* for sysfs internal use only */ +}; +#define to_stats(n) container_of(n, struct pfm_stats, kobj) + +/* + * include arch-specific kernel level only definitions + * (split with perfmon_api.h is necessary to avoid circular + * dependencies on certain data structures definitions) + */ +#include + +extern const struct file_operations pfm_file_ops; +/* + * max vector argument elements for local storage (no kmalloc/kfree) + * The PFM_ARCH_PM*_ARG should be defined in the arch specific perfmon.h + * file. If not, default (conservative) values are used + */ + +#ifndef PFM_ARCH_PMC_STK_ARG +#define PFM_ARCH_PMC_STK_ARG 1 +#endif + +#ifndef PFM_ARCH_PMD_STK_ARG +#define PFM_ARCH_PMD_STK_ARG 1 +#endif + +#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG +#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG + +#define PFM_BPL 64 +#define PFM_LBPL 6 /* log2(BPL) */ + +/* + * upper limit for count in calls that take vector arguments. This is used + * to prevent for multiplication overflow when we compute actual storage size + */ +#define PFM_MAX_ARG_COUNT(m) (INT_MAX/sizeof(*(m))) + +/* + * read a single PMD register. PMD register mapping is provided by PMU + * description module. Virtual PMD registers have special handler. 
+ */ +static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)) + return pfm_pmu_conf->pmd_sread(ctx, cnum); + + return pfm_arch_read_pmd(ctx, cnum); +} + +static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + /* + * PMD writes are ignored for read-only registers + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO) + return; + + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) { + pfm_pmu_conf->pmd_swrite(ctx, cnum, value); + return; + } + pfm_arch_write_pmd(ctx, cnum, value); +} + +#define cast_ulp(_x) ((unsigned long *)_x) + +#define PFM_NORMAL 0 +#define PFM_COMPAT 1 + +void __pfm_exit_thread(struct task_struct *task); +void __pfm_copy_thread(struct task_struct *task); +void pfm_ctxsw(struct task_struct *prev, struct task_struct *next); +void pfm_handle_work(struct pt_regs *regs); +void __pfm_handle_switch_timeout(void); +void __pfm_init_percpu (void *dummy); +void pfm_cpu_disable(void); + +static inline void pfm_exit_thread(struct task_struct *task) +{ + if (task->pfm_context) + __pfm_exit_thread(task); +} + +static inline void pfm_copy_thread(struct task_struct *task) +{ + /* + * context or perfmon TIF state is NEVER inherited + * in child task. Holds for per-thread and system-wide + */ + task->pfm_context = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + clear_tsk_thread_flag(task, TIF_PERFMON_WORK); +} + +static inline void pfm_handle_switch_timeout(void) +{ + unsigned long info; + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_TIME_SWITCH) + __pfm_handle_switch_timeout(); +} + +static inline void pfm_init_percpu(void) +{ + __pfm_init_percpu(NULL); +} + +#endif /* __KERNEL__ */ + +#else /* !CONFIG_PERFMON */ +#ifdef __KERNEL__ + +#define tsks_have_perfmon(p, n) (0) +#define pfm_cpu_disable() do { } while (0) +#define pfm_init_percpu() do { } while (0) +#define pfm_exit_thread(_t) do { } while (0) +#define pfm_handle_work(_t) do { } while (0) +#define pfm_copy_thread(_t) do { } while (0) +#define pfm_ctxsw(_p, _t) do { } while (0) +#define pfm_handle_switch_timeout() do { } while (0) +#define pfm_release_allcpus() do { } while (0) +#define pfm_reserve_allcpus() (0) +#ifdef __ia64__ +#define pfm_release_dbregs(_t) do { } while (0) +#define pfm_use_dbregs(_t) (0) +#endif + +#endif /* __KERNEL__ */ + +#endif /* CONFIG_PERFMON */ + +#endif /* __LINUX_PERFMON_H__ */ Index: linux-2.6/include/linux/perfmon_dfl_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_dfl_smpl.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new dfl sampling buffer format + * for perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_DFL_SMPL_H__ +#define __PERFMON_DFL_SMPL_H__ 1 + +/* + * format specific parameters (passed at context creation) + */ +struct pfm_dfl_smpl_arg { + __u64 buf_size; /* size of the buffer in bytes */ + __u32 buf_flags; /* buffer specific flags */ + __u32 reserved1; /* for future use */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * It is directly followed by the first record. + */ +struct pfm_dfl_smpl_hdr { + __u64 hdr_count; /* how many valid entries */ + __u64 hdr_cur_offs; /* current offset from top of buffer */ + __u64 hdr_overflows; /* #overflows for buffer */ + __u64 hdr_buf_size; /* bytes in the buffer */ + __u64 hdr_min_buf_space;/* minimal buffer size (internal use) */ + __u32 hdr_version; /* smpl format version */ + __u32 hdr_buf_flags; /* copy of buf_flags */ + __u64 hdr_reserved[10]; /* for future use */ +}; + +/* + * Entry header in the sampling buffer. The header is directly followed + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * on how the session was programmed. + * + * In the case where multiple counters overflow at the same time, multiple + * entries are written consecutively. + * + * last_reset_value member indicates the initial value of the overflowed PMD. + */ +struct pfm_dfl_smpl_entry { + __u32 pid; /* thread id (for NPTL, this is gettid()) */ + __u16 ovfl_pmd; /* index of overflowed PMD for this sample */ + __u16 reserved; /* for future use */ + __u64 last_reset_val; /* initial value of overflowed PMD */ + __u64 ip; /* where did the overflow interrupt happened */ + __u64 tstamp; /* overflow timetamp */ + __u16 cpu; /* cpu on which the overfow occurred */ + __u16 set; /* event set active when overflow ocurred */ + __u32 tgid; /* thread group id (for NPTL, this is getpid())*/ +}; + +#define PFM_DFL_SMPL_VERSION_MAJ 1U +#define PFM_DFL_SMPL_VERSION_MIN 0U +#define PFM_DFL_SMPL_VERSION (((PFM_DFL_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_DFL_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_DFL_SMPL_H__ */ Index: linux-2.6/include/linux/perfmon_fmt.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_fmt.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for custom sampling buffer format modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
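The default-format layout above (header, then entries, each followed by the sampled PMD values in increasing index order) can be walked from user level; a sketch, assuming buf points at the mapped buffer and n_smpl_pmds is the number of PMDs recorded per sample by this session:

/*
 * Illustrative sketch (not part of the patch): walk the default
 * sampling buffer. buf and n_smpl_pmds are caller-supplied
 * assumptions; the perfmon headers above are assumed included.
 */
#include <stdio.h>

void walk_dfl_buffer(void *buf, unsigned int n_smpl_pmds)
{
	struct pfm_dfl_smpl_hdr *hdr = buf;
	struct pfm_dfl_smpl_entry *ent;
	__u64 i, *pmds;

	ent = (struct pfm_dfl_smpl_entry *)(hdr + 1);
	for (i = 0; i < hdr->hdr_count; i++) {
		pmds = (__u64 *)(ent + 1);	/* PMD values follow the entry */
		printf("pid=%u pmd%u ip=0x%llx\n",
		       ent->pid, ent->ovfl_pmd,
		       (unsigned long long)ent->ip);
		/* the next entry starts right after this entry's PMD values */
		ent = (struct pfm_dfl_smpl_entry *)(pmds + n_smpl_pmds);
	}
}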
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_FMT_H__ +#define __PERFMON_FMT_H__ 1 + +#include + +struct pfm_ovfl_arg { + u16 ovfl_pmd; /* index of overflowed PMD */ + u16 active_set; /* set active at the time of the overflow */ + u32 ovfl_ctrl; /* control flags */ + u64 pmd_last_reset; /* last reset value of overflowed PMD */ + u64 smpl_pmds_values[PFM_MAX_PMDS]; /* values of other PMDs */ + u64 pmd_eventid; /* eventid associated with PMD */ + u16 num_smpl_pmds; /* number of PMDS in smpl_pmd_values */ +}; + +/* + * ovfl_ctrl bitmask of flags + */ +#define PFM_OVFL_CTRL_NOTIFY 0x1 /* notify user */ +#define PFM_OVFL_CTRL_RESET 0x2 /* reset overflowed pmds */ +#define PFM_OVFL_CTRL_MASK 0x4 /* mask monitoring */ + + +typedef int (*fmt_validate_t )(u32 flags, u16 npmds, void *arg); +typedef int (*fmt_getsize_t)(u32 flags, void *arg, size_t *size); +typedef int (*fmt_init_t)(struct pfm_context *ctx, void *buf, u32 flags, u16 nmpds, void *arg); +typedef int (*fmt_restart_t)(int is_active, u32 *ovfl_ctrl, void *buf); +typedef int (*fmt_exit_t)(void *buf); +typedef int (*fmt_handler_t)(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data); + +struct pfm_smpl_fmt { + char *fmt_name; /* name of the format (required) */ + size_t fmt_arg_size; /* size of fmt args for ctx create */ + u32 fmt_flags; /* format specific flags */ + u32 fmt_version; /* format version number */ + + fmt_validate_t fmt_validate; /* validate context flags */ + fmt_getsize_t fmt_getsize; /* get size for sampling buffer */ + fmt_init_t fmt_init; /* initialize buffer area */ + fmt_handler_t fmt_handler; /* overflow handler (required) */ + fmt_restart_t fmt_restart; /* restart after notification */ + fmt_exit_t fmt_exit; /* context termination */ + + struct list_head fmt_list; /* internal use only */ + + struct kobject kobj; /* sysfs internal use only */ + struct module *owner; /* pointer to module owner */ + u32 fmt_qdepth; /* Max notify queue depth (required) */ +}; +#define to_smpl_fmt(n) container_of(n, struct pfm_smpl_fmt, kobj) + +#define PFM_FMTFL_IS_BUILTIN 0x1 /* fmt is compiled in */ +/* + * we need to know whether the format is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_FMT_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_FMT_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_fmt_register(struct pfm_smpl_fmt *fmt); +int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt); +void pfm_sysfs_builtin_fmt_add(void); + +#endif /* __PERFMON_FMT_H__ */ Index: linux-2.6/include/linux/perfmon_pmu.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_pmu.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for PMU description modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
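A sampling-format module built on the interface above needs little more than the required overflow handler and a call to pfm_fmt_register(); a minimal sketch, with placeholder "demo" names and only the mandatory callbacks filled in (include set assumed):

/*
 * Illustrative sketch (not part of the patch): skeleton of a custom
 * sampling-format module. All "demo" names are placeholders.
 */
#include <linux/module.h>
#include <linux/perfmon.h>

static int demo_fmt_handler(void *buf, struct pfm_ovfl_arg *arg,
			    unsigned long ip, u64 stamp, void *data)
{
	/* ask the core to reset the overflowed PMD and keep monitoring */
	arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
	return 0;
}

static struct pfm_smpl_fmt demo_fmt = {
	.fmt_name    = "demo",
	.fmt_version = 0x10000,
	.fmt_handler = demo_fmt_handler,	/* required */
	.fmt_qdepth  = 1,			/* required: notify queue depth */
	.owner       = THIS_MODULE,
};

static int __init demo_fmt_init(void)
{
	return pfm_fmt_register(&demo_fmt);
}

static void __exit demo_fmt_exit(void)
{
	pfm_fmt_unregister(&demo_fmt);
}

module_init(demo_fmt_init);
module_exit(demo_fmt_exit);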
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_PMU_H__ +#define __PERFMON_PMU_H__ 1 + +/* + * generic information about a PMC or PMD register + */ +struct pfm_regmap_desc { + u16 type; /* role of the register */ + u16 reserved1; /* for future use */ + u32 reserved2; /* for future use */ + u64 dfl_val; /* power-on default value (quiescent) */ + u64 rsvd_msk; /* reserved bits: 1 means reserved */ + u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */ + unsigned long hw_addr; /* HW register address or index */ + struct kobject kobj; /* for internal use only */ + char *desc; /* HW register description string */ +}; +#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj) + +/* + * pfm_reg_desc declaration help macros + */ +#define PMC_D(t,d,v,r,n, h) \ + { .type = t, \ + .desc = d, \ + .dfl_val = v, \ + .rsvd_msk = r, \ + .no_emul64_msk = n, \ + .hw_addr = h \ + } + +#define PMD_D(t,d, h) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMX_NA \ + { .type = PFM_REG_NA } + +/* + * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type + */ +#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */ +#define PFM_REG_I 0x01 /* implemented */ +#define PFM_REG_WC 0x02 /* has write_checker */ +#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */ +#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */ +#define PFM_REG_V 0x10 /* PMD: virtual reg (provided by PMU description) */ +#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */ + +/* + * define some shortcuts for common types + */ +#define PFM_REG_W (PFM_REG_WC|PFM_REG_I) +#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_C (PFM_REG_C64|PFM_REG_I) +#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I) + +typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req); + +typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmd *req); + + +typedef u64 (*pfm_pmd_sread_t)(struct pfm_context *ctx, unsigned int cnum); +typedef void (*pfm_pmd_swrite_t)(struct pfm_context *ctx, unsigned int cnum, u64 val); + +/* + * registers description + */ +struct pfm_regdesc { + u64 pmcs[PFM_PMC_BV]; /* available PMC */ + u64 pmds[PFM_PMD_BV]; /* available PMD */ + u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */ + u64 cnt_pmds[PFM_PMD_BV]; /* available counting PMD (counters) */ + u16 max_pmc; /* highest+1 avail PMC */ + u16 max_pmd; /* highest+1 avail PMD */ + u16 max_rw_pmd; /* highest+1 avail RW PMD */ + u16 first_cnt_pmd; /* first counting PMD */ + u16 max_cnt_pmd; /* highest+1 avail counter */ + u16 num_rw_pmd; /* number of avail RW PMD */ + u16 num_pmcs; /* logical PMCS */ + u16 num_pmds; /* logical PMDS */ + u16 num_counters; /* PMC/PMD counter pairs */ +}; + +/* + * structure used by pmu description modules + * + * probe_pmu() routine return value: + * - 1 means recognized PMU + * - 0 means not recognized PMU + */ +struct pfm_pmu_config { + char *pmu_name; /* PMU family name */ + char *version; /* config module version number */ + + int counter_width; /* width of hardware counter */ + + struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */ + struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */ + + 
pfm_pmc_check_t pmc_write_check;/* PMC write checker callback (optional) */ + pfm_pmd_check_t pmd_write_check;/* PMD write checker callback (optional) */ + pfm_pmd_check_t pmd_read_check; /* PMD read checker callback (optional) */ + + pfm_pmd_sread_t pmd_sread; /* PMD model specific read (optional) */ + pfm_pmd_swrite_t pmd_swrite; /* PMD model specific write (optional) */ + + int (*probe_pmu)(void);/* probe PMU routine */ + + u16 num_pmc_entries;/* number of entries in pmc_desc */ + u16 num_pmd_entries;/* number of entries in pmd_desc */ + + void *arch_info; /* arch-specific information */ + u32 flags; /* set of flags */ + + struct module *owner; /* pointer to module struct */ + + /* + * fields computed internally, do not set in module + */ + struct pfm_regdesc regs; /* registers currently available */ + struct pfm_regdesc full_regs; /* registers presented by module */ + + u64 ovfl_mask; /* overflow mask */ + struct kobject kobj; /* for internal use only */ +}; +#define to_pmu(n) container_of(n, struct pfm_pmu_config, kobj) + +/* + * pfm_pmu_config flags + */ +#define PFM_PMUFL_IS_BUILTIN 0x1 /* pmu config is compiled in */ + +/* + * we need to know whether the PMU description is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_PMU_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_PMU_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_pmu_register(struct pfm_pmu_config *cfg); +void pfm_pmu_unregister(struct pfm_pmu_config *cfg); + +#endif /* __PERFMON_PMU_H__ */ Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -89,6 +89,7 @@ struct sched_param { struct exec_domain; struct futex_pi_state; struct bio; +struct pfm_context; /* * List of flags we want to share for kernel threads, @@ -1076,6 +1077,9 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif +#ifdef CONFIG_PERFMON + struct pfm_context *pfm_context; +#endif }; static inline pid_t process_group(struct task_struct *tsk) Index: linux-2.6/include/linux/syscalls.h =================================================================== --- linux-2.6.orig/include/linux/syscalls.h +++ linux-2.6/include/linux/syscalls.h @@ -29,6 +29,13 @@ struct msqid_ds; struct new_utsname; struct nfsctl_arg; struct __old_kernel_stat; +struct pfarg_ctx; +struct pfarg_pmc; +struct pfarg_pmd; +struct pfarg_start; +struct pfarg_load; +struct pfarg_setinfo; +struct pfarg_setdesc; struct pollfd; struct rlimit; struct rusage; @@ -613,4 +620,27 @@ asmlinkage long sys_eventfd(unsigned int int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, + void __user *uarg, size_t smpl_size); +asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, + int count); +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_delete_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); +asmlinkage long 
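Putting the pieces of pfm_pmu_config together, a PMU description module reduces to a pair of register tables, a probe routine, and a call to pfm_pmu_register(); a minimal sketch, in which the register names, hardware addresses and counter width are all placeholders:

/*
 * Illustrative sketch (not part of the patch): skeleton of a PMU
 * description module with one PMC/PMD pair. Names, MSR addresses and
 * the counter width are placeholders; include set assumed.
 */
#include <linux/module.h>
#include <linux/perfmon.h>

static struct pfm_regmap_desc demo_pmc_desc[] = {
	PMC_D(PFM_REG_W, "DEMO_EVTSEL0", 0x0, ~0xffffULL, 0, 0x186),
};

static struct pfm_regmap_desc demo_pmd_desc[] = {
	PMD_D(PFM_REG_C, "DEMO_CTR0", 0xc1),
};

static int demo_probe_pmu(void)
{
	return 1;	/* 1 = PMU recognized, 0 = not recognized */
}

static struct pfm_pmu_config demo_pmu_conf = {
	.pmu_name        = "Demo PMU",
	.counter_width   = 40,		/* placeholder hardware counter width */
	.pmc_desc        = demo_pmc_desc,
	.num_pmc_entries = 1,
	.pmd_desc        = demo_pmd_desc,
	.num_pmd_entries = 1,
	.probe_pmu       = demo_probe_pmu,
	.flags           = PFM_PMU_BUILTIN_FLAG,
	.owner           = THIS_MODULE,
};

static int __init demo_pmu_init(void)
{
	return pfm_pmu_register(&demo_pmu_conf);
}

static void __exit demo_pmu_exit(void)
{
	pfm_pmu_unregister(&demo_pmu_conf);
}

module_init(demo_pmu_init);
module_exit(demo_pmu_exit);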
sys_pfm_create_evtsets(int fd, + struct pfarg_setdesc __user *ureq, + int count); +asmlinkage long sys_pfm_getinfo_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); + #endif Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include Index: linux-2.6/kernel/sys_ni.c =================================================================== --- linux-2.6.orig/kernel/sys_ni.c +++ linux-2.6/kernel/sys_ni.c @@ -113,6 +113,19 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_pfm_create_context); +cond_syscall(sys_pfm_write_pmcs); +cond_syscall(sys_pfm_write_pmds); +cond_syscall(sys_pfm_read_pmds); +cond_syscall(sys_pfm_restart); +cond_syscall(sys_pfm_start); +cond_syscall(sys_pfm_stop); +cond_syscall(sys_pfm_load_context); +cond_syscall(sys_pfm_unload_context); +cond_syscall(sys_pfm_create_evtsets); +cond_syscall(sys_pfm_delete_evtsets); +cond_syscall(sys_pfm_getinfo_evtsets); + /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); Index: linux-2.6/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/perfmon/Makefile @@ -0,0 +1,8 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) = perfmon.o perfmon_rw.o perfmon_res.o perfmon_fmt.o \ + perfmon_pmu.o perfmon_sysfs.o perfmon_syscalls.o \ + perfmon_file.o perfmon_ctxsw.o perfmon_intr.o \ + perfmon_dfl_smpl.o perfmon_sets.o Index: linux-2.6/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon.c @@ -0,0 +1,1692 @@ +/* + * perfmon.c: perfmon2 core functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * internal variables + */ +static struct kmem_cache *pfm_ctx_cachep; + +/* + * external variables + */ +DEFINE_PER_CPU(u32, pfm_syst_info); +DEFINE_PER_CPU(struct task_struct *, pmu_owner); +DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); +DEFINE_PER_CPU(u64, pmu_activation_number); +DEFINE_PER_CPU(struct pfm_stats, pfm_stats); +EXPORT_PER_CPU_SYMBOL(pmu_ctx); + +#define PFM_INVALID_ACTIVATION ((u64)~0) + +int perfmon_disabled; /* >0 if perfmon is disabled */ + +/* + * Reset PMD register flags + */ +#define PFM_PMD_RESET_NONE 0 /* do not reset (pfm_switch_set) */ +#define PFM_PMD_RESET_SHORT 1 /* use short reset value */ +#define PFM_PMD_RESET_LONG 2 /* use long reset value */ + +/* + * get a new message slot from the queue. If the queue is full NULL + * is returned and monitoring stops. + */ +union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx) +{ + int next; + + next = ctx->msgq_head & PFM_MSGQ_MASK; + + if ((ctx->msgq_head - ctx->msgq_tail) == PFM_MSGS_COUNT) + return NULL; + + /* + * move to next possible slot + */ + ctx->msgq_head++; + + PFM_DBG_ovfl("head=%d tail=%d msg=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + next); + + return ctx->msgq+next; +} +EXPORT_SYMBOL(pfm_get_new_msg); + +void pfm_context_free(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + + pfm_arch_context_free(ctx); + + fmt = ctx->smpl_fmt; + + pfm_free_sets(ctx); + + if (ctx->smpl_addr) { + PFM_DBG("freeing sampling buffer @%p size=%zu", + ctx->smpl_addr, + ctx->smpl_size); + + pfm_release_buf_space(ctx, ctx->smpl_size); + + if (fmt->fmt_exit) + (*fmt->fmt_exit)(ctx->smpl_addr); + + vfree(ctx->smpl_addr); + } + + PFM_DBG("free ctx @%p", ctx); + kmem_cache_free(pfm_ctx_cachep, ctx); + /* + * decrease refcount on: + * - PMU description table + * - sampling format + */ + pfm_pmu_conf_put(); + pfm_smpl_fmt_put(fmt); + pfm_pmu_release(); +} + +/* + * only called in for the current task + */ +static int pfm_setup_smpl_fmt(struct pfm_smpl_fmt *fmt, void *fmt_arg, + struct pfm_context *ctx, u32 ctx_flags, + int mode, struct file *filp) +{ + size_t size = 0; + int ret = 0; + + /* + * validate parameters + */ + if (fmt->fmt_validate) { + ret = (*fmt->fmt_validate)(ctx_flags, pfm_pmu_conf->regs.num_pmds, + fmt_arg); + PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret); + if (ret) + goto error; + } + + /* + * check if buffer format wants to use perfmon + * buffer allocation/mapping service + */ + size = 0; + if (fmt->fmt_getsize) { + ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size); + if (ret) { + PFM_DBG("cannot get size ret=%d", ret); + goto error; + } + } + + if (size) { + if (mode == PFM_COMPAT) + ret = pfm_smpl_buffer_alloc_compat(ctx, size, filp); + else + ret = pfm_smpl_buffer_alloc(ctx, size); + if (ret) + goto error; + + } + + if (fmt->fmt_init) { + ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags, + pfm_pmu_conf->regs.num_pmds, + fmt_arg); + if (ret) + goto error_buffer; + } + return 0; + +error_buffer: + pfm_release_buf_space(ctx, ctx->smpl_size); + /* + * we do not call fmt_exit, if init has failed + */ + vfree(ctx->smpl_addr); +error: + return ret; +} + +/* + * interrupts are masked when entering this function. 
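/*
 * Illustrative note (not in the original patch): msgq_head and
 * msgq_tail are free-running counters. With PFM_MSGS_ORDER = 3 the
 * queue has PFM_MSGS_COUNT = 8 slots, a counter maps to a slot via
 * (counter & PFM_MSGQ_MASK) with the mask presumably defined as
 * PFM_MSGS_COUNT - 1, and the queue is full when
 * (msgq_head - msgq_tail) == PFM_MSGS_COUNT, as tested in
 * pfm_get_new_msg() above.
 */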
+ * context must be in MASKED state when calling. + */ +static void pfm_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + if (ctx->state != PFM_CTX_MASKED) + return; + + PFM_DBG_ovfl("unmasking monitoring"); + + /* + * must be done before calling + * pfm_arch_unmask_monitoring() + */ + ctx->state = PFM_CTX_LOADED; + + /* + * we need to restore the PMDs because they + * may have been modified by user while MASKED in which + * case the actual registers were not updated + * + * XXX: could be avoided in system-wide mode + */ + pfm_arch_restore_pmds(ctx, set); + + pfm_arch_unmask_monitoring(ctx, set); + + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + /* + * reset set duration timer + */ + set->duration_start = sched_clock(); +} + +/* + * called from pfm_smpl_buffer_alloc_old() (IA64-COMPAT) + * and pfm_setup_smpl_fmt() + * + * interrupts are enabled, context is not locked. + */ +int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize) +{ + void *addr; + size_t size; + int ret; + + might_sleep(); + + /* + * align page boundary + */ + size = PAGE_ALIGN(rsize); + + PFM_DBG("buffer req_size=%zu actual_size=%zu before", rsize, size); + + ret = pfm_reserve_buf_space(size); + if (ret) + return ret; + + PFM_DBG("buffer req_size=%zu actual_size=%zu after", rsize, size); + /* + * vmalloc can sleep. we do not hold + * any spinlock and interrupts are enabled + */ + addr = vmalloc(size); + if (!addr) { + PFM_DBG("cannot allocate sampling buffer"); + goto unres; + } + + memset(addr, 0, size); + + ctx->smpl_addr = addr; + ctx->smpl_size = size; + + PFM_DBG("kernel smpl buffer @%p", addr); + + return 0; +unres: + PFM_DBG("buffer req_size=%zu actual_size=%zu error", rsize, size); + pfm_release_buf_space(ctx, size); + return -ENOMEM; +} + +void pfm_reset_pmds(struct pfm_context *ctx, + struct pfm_event_set *set, + int num_pmds, + int reset_mode) +{ + u64 val, mask, new_seed; + struct pfm_pmd *reg; + unsigned int i, not_masked; + + not_masked = ctx->state != PFM_CTX_MASKED; + + PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d", + reset_mode == PFM_PMD_RESET_LONG ? "long" : "short", + (unsigned long long)set->reset_pmds[0], + not_masked); + + __get_cpu_var(pfm_stats).reset_pmds_count++; + + for (i = 0; num_pmds; i++) { + if (test_bit(i, cast_ulp(set->reset_pmds))) { + num_pmds--; + + reg = set->pmds + i; + + val = reset_mode == PFM_PMD_RESET_LONG ? reg->long_reset : reg->short_reset; + + if (reg->flags & PFM_REGFL_RANDOM) { + mask = reg->mask; + new_seed = random32(); + + /* construct a full 64-bit random value: */ + if ((unlikely(mask >> 32) != 0)) + new_seed |= (u64)random32() << 32; + + /* counter values are negative numbers! */ + val -= (new_seed & mask); + } + + set->pmds[i].value = val; + reg->lval = val; + + /* + * not all PMD to reset are necessarily + * counters + */ + if (not_masked) + pfm_write_pmd(ctx, i, val); + + PFM_DBG_ovfl("set%u pmd%u sval=0x%llx", + set->id, + i, + (unsigned long long)val); + } + } + + /* + * done with reset + */ + bitmap_zero(cast_ulp(set->reset_pmds), i); + + /* + * make changes visible + */ + if (not_masked) + pfm_arch_serialize(); +} + +/* + * called from pfm_handle_work() and __pfm_restart() + * for system-wide and per-thread context to resume + * monitoring after a user level notification. + * + * In both cases, the context is locked and interrupts + * are disabled. 
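/*
 * Illustrative note (not in the original patch): counters are loaded
 * with negative values, so a sampling period P is programmed as -P
 * (modulo the counter width) and the counter overflows after P events.
 * With PFM_REGFL_RANDOM, pfm_reset_pmds() above subtracts a random
 * value bounded by reg_random_mask from the reset value, so the
 * effective period varies between P and P + mask events.
 */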
+ */ +static void pfm_resume_after_ovfl(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + u32 rst_ctrl; + struct pfm_event_set *set; + u64 *reset_pmds; + void *hdr; + int state, ret; + + hdr = ctx->smpl_addr; + fmt = ctx->smpl_fmt; + state = ctx->state; + set = ctx->active_set; + ret = 0; + + if (hdr) { + rst_ctrl = 0; + prefetch(hdr); + } else + rst_ctrl= PFM_OVFL_CTRL_RESET; + + /* + * if using a sampling buffer format and it has a restart callback, + * then invoke it. hdr may be NULL, it the format does not use a + * perfmon buffer + */ + if (fmt && fmt->fmt_restart) + ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED, &rst_ctrl, hdr); + + reset_pmds = set->reset_pmds; + + PFM_DBG("restart=%d set=%u r_pmds=0x%llx switch=%d ctx_state=%d", + ret, + set->id, + (unsigned long long)reset_pmds[0], + (set->priv_flags & PFM_SETFL_PRIV_SWITCH), + state); + + if (!ret) { + /* + * switch set if needed + */ + if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) { + set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH; + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0); + set = ctx->active_set; + } else if (rst_ctrl & PFM_OVFL_CTRL_RESET) { + int nn; + nn = bitmap_weight(cast_ulp(set->reset_pmds), + pfm_pmu_conf->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_LONG); + } + + if (!(rst_ctrl & PFM_OVFL_CTRL_MASK)) + pfm_unmask_monitoring(ctx, set); + else + PFM_DBG("stopping monitoring?"); + ctx->state = PFM_CTX_LOADED; + } + ctx->flags.can_restart = 0; +} + +/* + * This function is always called after pfm_stop has been issued + */ +static void pfm_flush_pmds(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_event_set *set; + u64 ovfl_mask; + u64 *ovfl_pmds; + u32 num_ovfls; + u16 i, first_cnt_pmd; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + + set = ctx->active_set; + + /* + * save active set + * UP: + * if not current task and due to lazy, state may + * still be live + * for system-wide, guaranteed to run on correct CPU + */ + if (__get_cpu_var(pmu_ctx) == ctx) { + /* + * pending overflows have been saved by pfm_stop() + */ + pfm_save_pmds(ctx, set); + pfm_set_pmu_owner(NULL, NULL); + PFM_DBG("released ownership"); + } + + /* + * cleanup each set + */ + list_for_each_entry(set, &ctx->list, list) { + if (!set->npend_ovfls) + continue; + + /* + * take care of overflow + * no format handler is called here + */ + ovfl_pmds = set->povfl_pmds; + num_ovfls = set->npend_ovfls; + + PFM_DBG("set%u first=%u novfls=%u", + set->id, first_cnt_pmd, num_ovfls); + /* + * only look up to the last counting PMD register + */ + for (i = first_cnt_pmd; num_ovfls; i++) { + if (test_bit(i, cast_ulp(ovfl_pmds))) { + set->pmds[i].value += 1 + ovfl_mask; + num_ovfls--; + PFM_DBG("pmd%u overflowed", i); + } + PFM_DBG("pmd%u set=%u val=0x%llx", + i, + set->id, + (unsigned long long)set->pmds[i].value); + } + } +} + +/* + * This function is called when we need to perform asynchronous + * work on a context. This function is called ONLY when about to + * return to user mode (very much like with signals handling). + * + * There are several reasons why we come here: + * + * - per-thread mode, not self-monitoring, to reset the counters + * after a pfm_restart() by the thread controlling the context + * + * - because we are zombie and we need to cleanup our state + * + * - because we need to block after an overflow notification + * on a context with the PFM_OVFL_NOTIFY_BLOCK flag + * + * This function is never called for a system-wide context. 
+ * + * pfm_handle_work() can be called with interrupts enabled + * (TIF_NEED_RESCHED) or disabled. The down_interruptible + * call may sleep, therefore we must re-enable interrupts + * to avoid deadlocks. It is safe to do so because this function + * is called ONLY when returning to user level, in which case + * there is no risk of kernel stack overflow due to deep + * interrupt nesting. + */ +void pfm_handle_work(struct pt_regs *regs) +{ + struct pfm_context *ctx; + unsigned long flags, dummy_flags; + int type, ret, can_release; + +#ifdef CONFIG_PPC + /* + * This is just a temporary fix. Obviously we'd like to fix the powerpc + * code to make that check before calling __pfm_handle_work() to + * prevent the function call overhead, but the call is made from assembly + * code, so it will take a little while to figure out how to perform the + * check correctly. + */ + if (!test_thread_flag(TIF_PERFMON_WORK)) + return; +#endif + + if (!user_mode(regs)) + return; + + //BUG_ON(!irqs_disabled()); + + clear_thread_flag(TIF_PERFMON_WORK); + + __get_cpu_var(pfm_stats).handle_work_count++; + + ctx = current->pfm_context; + if (ctx == NULL) { + PFM_ERR("handle_work [%d] has no ctx", current->pid); + return; + } + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + type = ctx->flags.work_type; + ctx->flags.work_type = PFM_WORK_NONE; + + /* + * must be done before we check for simple reset mode + */ + if (type == PFM_WORK_ZOMBIE) + goto do_zombie; + + if (type == PFM_WORK_RESET) { + PFM_DBG("counter reset"); + goto skip_blocking; + } + + /* + * restore interrupt mask to what it was on entry. + * Could be enabled/disabled. + */ + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * force interrupt enable because of down_interruptible() + */ + local_irq_enable(); + + PFM_DBG("before block sleeping"); + + /* + * may go through without blocking on SMP systems + * if restart has been received already by the time we call down() + */ + ret = wait_for_completion_interruptible(&ctx->restart_complete); + + PFM_DBG("after block sleeping ret=%d", ret); + + /* + * lock context and mask interrupts again + * We save flags into a dummy because we may have + * altered interrupts mask compared to entry in this + * function. + */ + spin_lock_irqsave(&ctx->lock, dummy_flags); + + if (ctx->state == PFM_CTX_ZOMBIE) + goto do_zombie; + + /* + * in case of interruption of down() we don't restart anything + */ + if (ret < 0) + goto nothing_to_do; + +skip_blocking: + pfm_resume_after_ovfl(ctx); + +nothing_to_do: + /* + * restore flags as they were upon entry + */ + spin_unlock_irqrestore(&ctx->lock, flags); + return; + +do_zombie: + PFM_DBG("context is zombie, bailing out"); + + __pfm_unload_context(ctx, &can_release); + + /* + * keep the spinlock check happy + */ + spin_unlock(&ctx->lock); + + /* + * enable interrupt for vfree() + */ + local_irq_enable(); + + /* + * actual context free + */ + pfm_context_free(ctx); + + /* + * restore interrupts as they were upon entry + */ + local_irq_restore(flags); + + /* always true */ + if (can_release) + pfm_release_session(0, 0); +} + +int pfm_notify_user(struct pfm_context *ctx) +{ + if (ctx->state == PFM_CTX_ZOMBIE) { + PFM_DBG("ignoring overflow notification, owner is zombie"); + return 0; + } + PFM_DBG("waking up somebody"); + + wake_up_interruptible(&ctx->msgq_wait); + + /* + * it is safe to call kill_fasync() from an interrupt + * handler. kill_fasync() grabs two RW locks (fasync_lock, + * tasklist_lock) in read mode. 
There is conflict only in + * case the PMU interrupt occurs during a write mode critical + * section. This cannot happen because for both locks, the + * write mode is always using interrupt masking (write_lock_irq). + */ + kill_fasync (&ctx->async_queue, SIGIO, POLL_IN); + + return 0; +} +EXPORT_SYMBOL(pfm_notify_user); + +/* + * send a counter overflow notification message to + * user. First appends the message to the queue, then + * wake up ay waiter on the file descriptor + * + * context is locked and interrupts are disabled (no preemption). + */ +int pfm_ovfl_notify_user(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip) +{ + union pfarg_msg *msg = NULL; + int max_cnt_pmd; + u64 *ovfl_pmds; + + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + if (!ctx->flags.no_msg) { + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + /* + * when message queue fills up it is because the user + * did not extract the message, yet issued + * pfm_restart(). At this point, we stop sending + * notification, thus the user will not be able to get + * new samples when using the default format. + */ + PFM_DBG_ovfl("no more notification msgs"); + return -1; + } + + msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; + msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid; + msg->pfm_ovfl_msg.msg_active_set = set->id; + + ovfl_pmds = msg->pfm_ovfl_msg.msg_ovfl_pmds; + + bitmap_copy(cast_ulp(ovfl_pmds), cast_ulp(set->ovfl_pmds), + max_cnt_pmd); + + msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id(); + msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid; + msg->pfm_ovfl_msg.msg_ovfl_ip = ip; + + __get_cpu_var(pfm_stats).ovfl_notify_count++; + } + + PFM_DBG("ovfl msg: ip=0x%lx o_pmds=0x%llx", + ip, + (unsigned long long)set->ovfl_pmds[0]); + + return pfm_notify_user(ctx); +} + +/* + * In per-thread mode, when not self-monitoring, perfmon + * send a 'end' notification message when the monitored + * thread where the context is attached is exiting. + * + * This helper message alleviate the need to track the activity + * of the thread/process when it is not directly related, i.e., + * was attached vs was forked/execd. + * + * This function appends the 'end' notification message to the + * queue. + * + * the context must be locked and interrupts disabled. 
+ */ +static int pfm_end_notify_user(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + PFM_ERR("%s no more msgs", __FUNCTION__); + return -1; + } + /* no leak */ + memset(msg, 0, sizeof(*msg)); + + msg->type = PFM_MSG_END; + + PFM_DBG("end msg: msg=%p no_msg=%d", + msg, + ctx->flags.no_msg); + + return pfm_notify_user(ctx); +} + +/* + * called only from exit_thread(): task == current + * we come here only if current has a context + * attached (loaded or masked or zombie) + */ +void __pfm_exit_thread(struct task_struct *task) +{ + struct pfm_context *ctx; + unsigned long flags; + int free_ok = 0, can_release = 0; + + ctx = task->pfm_context; + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + PFM_DBG("state=%d", ctx->state); + + /* + * __pfm_unload_context() cannot fail + * in the context states we are interested in + */ + switch (ctx->state) { + case PFM_CTX_LOADED: + case PFM_CTX_MASKED: + __pfm_unload_context(ctx, &can_release); + pfm_end_notify_user(ctx); + break; + case PFM_CTX_ZOMBIE: + __pfm_unload_context(ctx, &can_release); + free_ok = 1; + break; + default: + BUG_ON(ctx->state != PFM_CTX_LOADED); + break; + } + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(0, 0); + + /* + * All memory free operations (especially for vmalloc'ed memory) + * MUST be done with interrupts ENABLED. + */ + if (free_ok) + pfm_context_free(ctx); +} + +/* + * CPU hotplug event nofication callback + * + * We use the callback to do manage the sysfs interface. + * Note that the actual shutdown of monitoring on the CPU + * is done in pfm_cpu_disable(), see comments there for more + * information. + */ +static int pfm_cpu_notify(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int ret = NOTIFY_OK; + + pfm_pmu_conf_get(0); + + switch (action) { + case CPU_ONLINE: + pfm_sysfs_add_cpu(cpu); + PFM_INFO("CPU%d is online", cpu); + break; + case CPU_UP_PREPARE: + PFM_INFO("CPU%d prepare online", cpu); + break; + case CPU_UP_CANCELED: + pfm_sysfs_del_cpu(cpu); + PFM_INFO("CPU%d is up canceled", cpu); + break; + case CPU_DOWN_PREPARE: + PFM_INFO("CPU%d prepare offline", cpu); + break; + case CPU_DOWN_FAILED: + PFM_INFO("CPU%d is down failed", cpu); + break; + case CPU_DEAD: + pfm_sysfs_del_cpu(cpu); + PFM_INFO("CPU%d is offline", cpu); + break; + } + pfm_pmu_conf_put(); + return ret; +} + +static struct notifier_block pfm_cpu_notifier ={ + .notifier_call = pfm_cpu_notify +}; + +/* + * called from cpu_init() and pfm_pmu_register() + */ +void __pfm_init_percpu(void *dummy) +{ + pfm_arch_init_percpu(); +} + +/* + * global initialization routine, executed only once + */ +int __init pfm_init(void) +{ + int ret; + + PFM_LOG("version %u.%u, compiled: " __DATE__ ", " __TIME__, + PFM_VERSION_MAJ, PFM_VERSION_MIN); + + pfm_ctx_cachep = kmem_cache_create("pfm_context", + sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE, + SLAB_HWCACHE_ALIGN, 0, NULL, NULL); + if (pfm_ctx_cachep == NULL) { + PFM_ERR("cannot initialize context slab"); + goto error_disable; + } + ret = pfm_sets_init(); + if (ret) + goto error_disable; + + if (pfm_init_fs()) + goto error_disable; + + if (pfm_init_sysfs()) + goto error_disable; + + /* + * one time, arch-specific global initialization + */ + if (pfm_arch_init()) + goto error_disable; + + /* + * register CPU hotplug event notifier + */ + ret = register_cpu_notifier(&pfm_cpu_notifier); + if (!ret) + return 0; + +error_disable: + 
PFM_INFO("perfmon is disabled due to initialization error"); + perfmon_disabled = 1; + return -1; +} + +/* + * must use subsys_initcall() to ensure that the perfmon2 core + * is initialized before any PMU description module when they are + * compiled in. + */ +subsys_initcall(pfm_init); + +/* + * function used to start monitoring. When operating in per-thread + * mode and when not self-monitoring, the monitored thread must be + * stopped. + * + * The pfarg_start argument is optional and may be used to designate + * the initial event set to activate. Wehn missing, the last active + * set is used. For the first activation, set0 is used. + * + * On some architectures, e.g., IA-64, it may be possible to start monitoring + * without calling this function under certain conditions (per-thread and self + * monitoring). + * + * the context is locked and interrupts are disabled. + */ +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start) +{ + struct task_struct *task, *owner_task; + struct pfm_event_set *new_set, *old_set; + u64 now; + int is_self; + + task = ctx->task; + + /* + * context must be loaded. + * we do not support starting while in MASKED state + * (mostly because of set switching issues) + */ + if (ctx->state != PFM_CTX_LOADED) + return -EINVAL; + + old_set = new_set = ctx->active_set; + + /* + * always the case for system-wide + */ + if (task == NULL) + task = current; + + is_self = task == current; + + /* + * argument is provided? + */ + if (start) { + /* + * find the set to load first + */ + new_set = pfm_find_set(ctx, start->start_set, 0); + if (new_set == NULL) { + PFM_DBG("event set%u does not exist", + start->start_set); + return -EINVAL; + } + } + + PFM_DBG("cur_set=%u req_set=%u", + old_set->id, + new_set->id); + + /* + * if we need to change the active set we need + * to check if we can access the PMU + */ + if (new_set != old_set) { + owner_task = __get_cpu_var(pmu_owner); + /* + * system-wide: must run on the right CPU + * per-thread : must be the owner of the PMU context + * + * pfm_switch_sets() returns with monitoring stopped + */ + if (is_self) { + pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1); + } else { + /* + * In a UP kernel, the PMU may contain the state + * of the task we want to operate on, yet the task + * may be switched out (lazy save). We need to save + * current state (old_set), switch active_set and + * mark it for reload. + */ + if (owner_task == task) + pfm_save_pmds(ctx, old_set); + ctx->active_set = new_set; + new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + } + } + /* + * mark as started, must be done before calling + * pfm_arch_start() + */ + ctx->flags.started = 1; + + now = sched_clock(); + + pfm_arch_start(task, ctx, new_set); + + /* + * we check whether we had a pending ovfl before restarting. + * If so we need to regenerate the interrupt to make sure we + * keep recorded samples. For non-self monitoring this check + * is done in the pfm_ctxswin_thread() routine. + */ + if (is_self && new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + /* + * we restart total duration even if context was + * already started. In that case, counts are simply + * reset. + * + * For per-thread, if not self-monitoring, the statement + * below will have no effect because thread is stopped. + * The field is reset of ctxsw in. + */ + new_set->duration_start = now; + + return 0; +} + +/* + * function used to stop monitoring. 
When operating in per-thread + * mode and when not self-monitoring, the monitored thread must be + * stopped. + * + * the context is locked and interrupts are disabled. + */ +int __pfm_stop(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + struct task_struct *task; + u64 now; + int state; + + now = sched_clock(); + state = ctx->state; + set = ctx->active_set; + + /* + * context must be attached (zombie cannot happen) + */ + if (state == PFM_CTX_UNLOADED) + return -EINVAL; + + task = ctx->task; + + PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d", + task ? task->pid : -1, + state, + ctx->flags.system); + + /* + * this happens for system-wide context + */ + if (task == NULL) + task = current; + + /* + * compute elapsed time + * don't update set duration if masked + */ + if (task == current && state == PFM_CTX_LOADED) + set->duration += now - set->duration_start; + + pfm_arch_stop(task, ctx, set); + + ctx->flags.started = 0; + /* + * starting now, in-flight PMU interrupt for this context + * are treated as spurious + */ + return 0; +} + +/* + * function called from sys_pfm_restart(). It is used when overflow + * notification is requested. For each notification received, the user + * must call pfm_restart() to indicate to the kernel that it is done + * processing the notification. + * + * When the caller is doing user level sampling, this function resets + * the overflowed counters and resumes monitoring which is normally stopped + * during notification (always the consequence of a counter overflow). + * + * When using a sampling format, the format restart() callback is invoked, + * overflowed PMDS may be reset based upon decision from sampling format. + * + * When operating in per-thread mode, and when not self-monitoring, the + * monitored thread DOES NOT need to be stopped, unlike for many other calls. + * + * This means that the effect of the restart may not necessarily be observed + * right when returning from the call. For instance, counters may not already + * be reset in the other thread. + * + * When operating in system-wide, the caller must be running on the monitored + * CPU. + * + * The context is locked and interrupts are disabled. + * + */ +int __pfm_restart(struct pfm_context *ctx, int *complete_needed) +{ + int state; + + + state = ctx->state; + + PFM_DBG("state=%d", state); + + *complete_needed = 0; + + if (state != PFM_CTX_MASKED && state != PFM_CTX_LOADED) { + PFM_DBG("invalid state=%d", state); + return -EBUSY; + } + + __get_cpu_var(pfm_stats).pfm_restart_count++; + /* + * at this point, the context is either LOADED or MASKED + */ + + if (ctx->task == current || ctx->flags.system) { + pfm_resume_after_ovfl(ctx); + return 0; + } + + /* + * restart another task + */ + + /* + * When PFM_CTX_MASKED, we cannot issue a restart before the previous + * one is seen by the task. + */ + if (state == PFM_CTX_MASKED) { + if (!ctx->flags.can_restart) { + PFM_DBG("cannot restart can_restart=%d", + ctx->flags.can_restart); + return -EBUSY; + } + /* + * prevent subsequent restart before this one is + * seen by the task + */ + ctx->flags.can_restart = 0; + } + + /* + * if blocking, then post the semaphore is PFM_CTX_MASKED, i.e. + * the task is blocked or on its way to block. That's the normal + * restart path. If the monitoring is not masked, then the task + * can be actively monitoring and we cannot directly intervene. + * Therefore we use the trap mechanism to catch the task and + * force it to reset the buffer/reset PMDs. 
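On the user side, the restart protocol described above amounts to reading a notification message from the context file descriptor and then issuing pfm_restart(); a minimal sketch, assuming fd is the context file descriptor, overflow notifications are enabled, read() on the descriptor returns one pfarg_msg at a time, and pfm_restart() is a thin wrapper around the system call:

/*
 * Illustrative sketch (not part of the patch): user-level notification
 * loop. fd, the read() semantics of the context descriptor and the
 * pfm_restart() wrapper are assumptions.
 */
#include <unistd.h>

void notification_loop(int fd)
{
	union pfarg_msg msg;

	for (;;) {
		if (read(fd, &msg, sizeof(msg)) != sizeof(msg))
			break;
		if (msg.type == PFM_MSG_END)
			break;			/* monitored task exited */
		/* PFM_MSG_OVFL: process the sample(s) ... */
		pfm_restart(fd);		/* resume monitoring */
	}
}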
+ * + * if non-blocking, then we ensure that the task will go into + * pfm_handle_work() before returning to user mode. + * + * We cannot explicitly reset another task, it MUST always + * be done by the task itself. This works for system wide because + * the tool that is controlling the session is logically doing + * "self-monitoring". + */ + if (ctx->flags.block && state == PFM_CTX_MASKED) { + PFM_DBG("unblocking [%d]", ctx->task->pid); + /* + * It is not possible to call complete() with the context locked + * otherwise we have a potential deadlock with the PMU context + * switch code due to a lock inversion between task_rq_lock() + * and the context lock. + * Instead we mark whether or not we need to issue the complete + * and we invoke the function once the context lock is released + * in sys_pfm_restart() + */ + *complete_needed = 1; + } else { + PFM_DBG("[%d] armed exit trap", ctx->task->pid); + ctx->flags.work_type = PFM_WORK_RESET; + set_tsk_thread_flag(ctx->task, TIF_PERFMON_WORK); + } + return 0; +} + +/* + * function used to attach a context to either a CPU or a thread. + * In per-thread mode, and when not self-monitoring, the thread must be + * stopped. In system-wide mode, the cpu specified in the pfarg_load.load_tgt + * argument must be the current CPU. + * + * The function must be called with the context locked and interrupts disabled. + */ +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, + struct task_struct *task) +{ + struct pfm_event_set *set; + struct pfm_context *old; + int mycpu; + int ret; + + mycpu = smp_processor_id(); + + /* + * system-wide: check we are running on the desired CPU + */ + if (ctx->flags.system && req->load_pid != mycpu) { + PFM_DBG("running on wrong CPU: %u vs. %u", + mycpu, req->load_pid); + return -EINVAL; + } + + /* + * locate first set to activate + */ + set = pfm_find_set(ctx, req->load_set, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", req->load_set); + return -EINVAL; + } + + /* + * assess sanity of event sets, initialize set state + */ + ret = pfm_prepare_sets(ctx, set); + if (ret) { + PFM_DBG("invalid next field pointers in the sets"); + return -EINVAL; + } + + PFM_DBG("load_pid=%d set=%u set_flags=0x%x", + req->load_pid, + set->id, + set->flags); + + /* + * per-thread: + * - task to attach to is checked in sys_pfm_load_context() to avoid + * locking issues. if found, and not self, task refcount was incremented. + */ + if (ctx->flags.system) { + ctx->cpu = mycpu; + ctx->task = NULL; + task = current; + } else { + old = cmpxchg(&task->pfm_context, NULL, ctx); + if (old != NULL) { + PFM_DBG("load_pid=%d has a context " + "old=%p new=%p cur=%p", + req->load_pid, + old, + ctx, + task->pfm_context); + return -EEXIST; + } + ctx->task = task; + ctx->cpu = -1; + } + + /* + * perform any architecture specific actions + */ + ret = pfm_arch_load_context(ctx, set, ctx->task); + if (ret) + goto error_noload; + + /* + * now reserve the session, before we can proceed with + * actually accessing the PMU hardware + */ + ret = pfm_reserve_session(ctx->flags.system, ctx->cpu); + if (ret) + goto error; + + /* + * commit active set + */ + ctx->set_all_runs = 1; + ctx->active_set = set; + + set->runs++; + + /* + * self-monitoring (incl. system-wide) + */ + if (task == current) { +#ifndef CONFIG_SMP + /* + * in UP mode, because of lazy save/restore + * there may already be valid state on the PMU. 
+ * We need to push it out before we can load the + * next state + */ + struct pfm_context *ctxp; + ctxp = __get_cpu_var(pmu_ctx); + if (ctxp) + pfm_save_prev_context(ctxp); +#endif + pfm_set_last_cpu(ctx, mycpu); + pfm_inc_activation(); + pfm_set_activation(ctx); + + /* + * we activate switch timeout callbacks to pfm_handle_switch_timeout() + * even though the interface guarantees monitoring is inactive at + * this point. The reason is that on some architectures (e.g., IA-64) + * it is possible to start monitoring directly from user level without + * the kernel knowing. In that case, the kernel would not be able to + * active switch timeout when monitoring starts + */ + if (set->flags & PFM_SETFL_TIME_SWITCH) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + /* + * load PMD from set + * load PMC from set + */ + pfm_arch_restore_pmds(ctx, set); + pfm_arch_restore_pmcs(ctx, set); + + /* + * set new ownership + */ + pfm_set_pmu_owner(ctx->task, ctx); + } else { + /* force a full reload */ + ctx->last_act = PFM_INVALID_ACTIVATION; + pfm_set_last_cpu(ctx, -1); + set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + PFM_DBG("context loaded next ctxswin for [%d]", task->pid); + } + + if (!ctx->flags.system) { + set_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + PFM_DBG("[%d] set TIF", task->pid); + } + + ctx->flags.work_type = PFM_WORK_NONE; + + /* + * reset message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + + ctx->state = PFM_CTX_LOADED; + + return 0; + +error: + pfm_arch_unload_context(ctx, task); +error_noload: + /* + * detach context + */ + if (!ctx->flags.system) + task->pfm_context = NULL; + + return ret; +} + +/* + * Function used to detach a context from either a CPU or a thread. + * In the per-thread case and when not self-monitoring, the thread must be + * stopped. After the call, the context is detached and monitoring is stopped. + * + * The function must be called with the context locked and interrupts disabled. + */ +int __pfm_unload_context(struct pfm_context *ctx, int *can_release) +{ + struct task_struct *task; + struct pfm_event_set *set; + int ret, is_self; + + PFM_DBG("ctx_state=%d task [%d]", ctx->state, ctx->task ? ctx->task->pid : -1); + + *can_release = 0; + + /* + * unload only when necessary + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + task = ctx->task; + set = ctx->active_set; + is_self = ctx->flags.system || task == current; + + /* + * stop monitoring + */ + ret = __pfm_stop(ctx); + if (ret) + return ret; + + ctx->state = PFM_CTX_UNLOADED; + ctx->flags.can_restart = 0; + + /* + * clear any leftover in pfm_syst_info. + * + * for non-self monitoring, + * this is done in pfm_ctxswout_thread. + */ + if (is_self) + __get_cpu_var(pfm_syst_info) = 0; + + /* + * save PMD registers + * release ownership + */ + pfm_flush_pmds(task, ctx); + + /* + * arch-specific unload operations + */ + pfm_arch_unload_context(ctx, task); + + /* + * per-thread: disconnect from monitored task + * syswide : keep ctx->cpu has it may be used after unload + * to release the session + */ + if (task) { + task->pfm_context = NULL; + ctx->task = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + } + + *can_release = 1; + + return 0; +} + +static inline int pfm_ctx_flags_sane(u32 ctx_flags) +{ + if (ctx_flags & PFM_FL_SYSTEM_WIDE) { + if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { + PFM_DBG("cannot use blocking mode in syswide mode"); + return -EINVAL; + } + } + return 0; +} + +/* + * check for permissions to create a context. 
+ * + * A sysadmin may decide to restrict creation of per-thread + * and/or system-wide context to a group of users using the + * group id via /sys/kernel/perfmon/task_group and + * /sys/kernel/perfmon/sys_group. + * + * Once we identify a user level package which can be used + * to grant/revoke Linux capabilites at login via PAM, we will + * be able to use capabilities. We would also need to increase + * the size of cap_t to support more than 32 capabilities (it + * is currently defined as u32 and 32 capabilities are alrady + * defined). + */ +static inline int pfm_ctx_permissions(u32 ctx_flags) +{ + if ( (ctx_flags & PFM_FL_SYSTEM_WIDE) + && pfm_controls.sys_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.sys_group)) { + PFM_DBG("user group not allowed to create a syswide ctx"); + return -EPERM; + } else if (pfm_controls.task_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.task_group)) { + PFM_DBG("user group not allowed to create a task context"); + return -EPERM; + } + return 0; +} + +/* + * function used to allocate a new context. A context is allocated along + * with the default event set. If a sampling format is used, the buffer + * may be allocated and initialized. + * + * The file descriptor identifying the context is allocated and returned + * to caller. + * + * This function operates with no locks and interrupts are enabled. + * return: + * >=0: the file descriptor to identify the context + * <0 : the error code + */ +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx) +{ + struct pfm_context *ctx; + struct pfm_event_set *set; + struct file *filp = NULL; + u32 ctx_flags; + int fd = 0, ret; + + ctx_flags = req->ctx_flags; + + /* Increase refcount on PMU description */ + ret = pfm_pmu_conf_get(1); + if (ret < 0) + goto error_conf; + + ret = pfm_ctx_flags_sane(ctx_flags); + if (ret < 0) + goto error_alloc; + + ret = pfm_ctx_permissions(ctx_flags); + if (ret < 0) + goto error_alloc; + + /* + * we can use GFP_KERNEL and potentially sleep because we do + * not hold any lock at this point. + */ + might_sleep(); + ret = -ENOMEM; + ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto error_alloc; + + /* + * needs to init event set list otherwise + * we could fail in pfm_free_sets + */ + INIT_LIST_HEAD(&ctx->list); + + ret = pfm_pmu_acquire(); + if (ret) + goto error_file; + /* + * link to format, must be done first for correct + * error handling in pfm_context_free() + */ + ctx->smpl_fmt = fmt; + + ret = -ENFILE; + fd = pfm_alloc_fd(&filp); + if (fd < 0) + goto error_file; + + /* + * context is unloaded + */ + ctx->state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + * must be done before pfm_find_set() + */ + ctx->flags.block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->flags.system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->flags.no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + + INIT_LIST_HEAD(&ctx->list); + + /* + * initialize arch-specific section + * must be done before fmt_init() + * + * XXX: fix dependency with fmt_init() + */ + ret = pfm_arch_context_create(ctx, ctx_flags); + if (ret) + goto error_set; + + ret = -ENOMEM; + /* + * create initial set + */ + if (pfm_find_set(ctx, 0, 1) == NULL) + goto error_set; + + set = list_entry(ctx->list.next, struct pfm_event_set, list); + + pfm_init_evtset(set); + + /* + * does the user want to sample? 
+ */ + if (fmt) { + ret = pfm_setup_smpl_fmt(fmt, fmt_arg, ctx, ctx_flags, + mode, filp); + if (ret) + goto error_set; + } + + filp->private_data = ctx; + + spin_lock_init(&ctx->lock); + init_completion(&ctx->restart_complete); + + ctx->last_act = PFM_INVALID_ACTIVATION; + pfm_set_last_cpu(ctx, -1); + + /* + * initialize notification message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + init_waitqueue_head(&ctx->msgq_wait); + + PFM_DBG("ctx=%p flags=0x%x system=%d notify_block=%d no_msg=%d" + " use_fmt=%d ctx_fd=%d mode=%d", + ctx, + ctx_flags, + ctx->flags.system, + ctx->flags.block, + ctx->flags.no_msg, + fmt != NULL, + fd, mode); + + *new_ctx = ctx; + + /* + * we defer the fd_install until we are certain the call succeeded + * to ensure we do not have to undo its effect. Neither put_filp() + * nor put_unused_fd() undoes the effect of fd_install(). + */ + fd_install(fd, filp); + + return fd; + +error_set: + put_filp(filp); + put_unused_fd(fd); +error_file: + /* + * calls the right *_put() functions + * calls pfm_release_pmu() + */ + pfm_context_free(ctx); + return ret; +error_alloc: + pfm_pmu_conf_put(); +error_conf: + pfm_smpl_fmt_put(fmt); + return ret; +} + +/* + * called from cpu_disable() to detach the perfmon context + * from the CPU going down. + * + * We cannot use the cpu hotplug notifier because we MUST run + * on the CPU that is going down to save the PMU state + */ +void pfm_cpu_disable(void) +{ + struct pfm_context *ctx; + unsigned long flags; + int is_system, can_release = 0; + u32 cpu; + + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) + return; + + is_system = ctx->flags.system; + cpu = ctx->cpu; + + /* + * context is LOADED or MASKED + * + * we unload from CPU. That stops monitoring and does + * all the bookeeping of saving values and updating duration + */ + spin_lock_irqsave(&ctx->lock, flags); + if (is_system) + __pfm_unload_context(ctx, &can_release); + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(is_system, cpu); +} Index: linux-2.6/perfmon/perfmon_ctxsw.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_ctxsw.c @@ -0,0 +1,333 @@ +/* + * perfmon_cxtsw.c: perfmon2 context switch code + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +/* + * used only in UP mode + */ +void pfm_save_prev_context(struct pfm_context *ctxp) +{ + struct pfm_event_set *set; + + /* + * in UP per-thread, due to lazy save + * there could be a context from another + * task. We need to push it first before + * installing our new state + */ + set = ctxp->active_set; + pfm_save_pmds(ctxp, set); + /* + * do not clear ownership because we rewrite + * right away + */ +} + +void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 val, ovfl_mask; + u64 *used_mask, *cnt_mask; + u16 i, num; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + num = set->nused_pmds; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + used_mask = set->used_pmds; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + val = pfm_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) + val = (set->pmds[i].value & ~ovfl_mask) | + (val & ovfl_mask); + set->pmds[i].value = val; + num--; + } + } +} + +/* + * interrupts are disabled (no preemption) + */ +static void __pfm_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + u64 cur_act; + struct pfm_event_set *set; + int reload_pmcs, reload_pmds; + int mycpu; + + mycpu = smp_processor_id(); + + /* + * we need to lock context because it could be accessed + * from another CPU + */ + spin_lock(&ctx->lock); + + cur_act = __get_cpu_var(pmu_activation_number); + + set = ctx->active_set; + + /* + * in case of zombie, we do not complete ctxswin of the + * PMU, and we force a call to pfm_handle_work() to finish + * cleanup, i.e., free context + smpl_buff. The reason for + * deferring to pfm_handle_work() is that it is not possible + * to vfree() with interrupts disabled. + */ + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) { + ctx->flags.work_type = PFM_WORK_ZOMBIE; + set_tsk_thread_flag(task, TIF_PERFMON_WORK); + spin_unlock(&ctx->lock); + return; + } + + if (set->flags & PFM_SETFL_TIME_SWITCH) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + /* + * if we were the last user of the PMU on that CPU, + * then nothing to do except restore psr + */ + if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) { + /* + * check for forced reload conditions + */ + reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS; + reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS; + } else { +#ifndef CONFIG_SMP + struct pfm_context *ctxp; + ctxp = __get_cpu_var(pmu_ctx); + if (ctxp) + pfm_save_prev_context(ctxp); +#endif + reload_pmcs = 1; + reload_pmds = 1; + } + /* consumed */ + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + if (reload_pmds) + pfm_arch_restore_pmds(ctx, set); + + /* + * need to check if we had an in-flight interrupt in + * pfm_ctxswout_thread(). If at least one bit is set, then we must replay + * the interrupt to avoid losing some important performance data. + * + * npend_ovfls is cleared in interrupt handler + */ + if (set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + if (reload_pmcs) + pfm_arch_restore_pmcs(ctx, set); + + /* + * record current activation for this context + */ + pfm_inc_activation(); + pfm_set_last_cpu(ctx, mycpu); + pfm_set_activation(ctx); + + /* + * establish new ownership. 
+ */ + pfm_set_pmu_owner(task, ctx); + + pfm_arch_ctxswin_thread(task, ctx, set); + /* + * set->duration does not count when context in MASKED state. + * set->duration_start is reset in unmask_monitoring() + */ + set->duration_start = now; + + spin_unlock(&ctx->lock); +} + +/* + * interrupts are masked, runqueue lock is held. + * + * In UP. we simply stop monitoring and leave the state + * in place, i.e., lazy save + */ +static void __pfm_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + struct pfm_event_set *set; + int need_save_pmds; + + /* + * we need to lock context because it could be accessed + * from another CPU + */ + spin_lock(&ctx->lock); + + set = ctx->active_set; + + /* + * stop monitoring and + * collect pending overflow information + * needed on ctxswin. We cannot afford to lose + * a PMU interrupt. + */ + need_save_pmds = pfm_arch_ctxswout_thread(task, ctx, set); + + /* + * accumulate only when set is actively monitoring, + */ + if (ctx->state == PFM_CTX_LOADED) + set->duration += now - set->duration_start; + +#ifdef CONFIG_SMP + /* + * in SMP, release ownership of this PMU. + * PMU interrupts are masked, so nothing + * can happen. + */ + pfm_set_pmu_owner(NULL, NULL); + + /* + * On some architectures, it is necessary to read the + * PMD registers to check for pending overflow in + * pfm_arch_ctxswout_thread(). In that case, saving of + * the PMDs may be done there and not here. + */ + if (need_save_pmds) + pfm_save_pmds(ctx, set); +#endif + /* + * clear cpuinfo, cpuinfo is used in + * per task mode with the set time switch flag. + */ + __get_cpu_var(pfm_syst_info) = 0; + + spin_unlock(&ctx->lock); +} + +/* + * no need to lock the context. To operate on a system-wide + * context, the task has to run on the monitored CPU. In the + * case of close issued on another CPU, an IPI is sent but + * this routine runs with interrupts masked, so we are + * protected + * + * On some architectures, such as IA-64, it may be necessary + * to intervene during the context even in system-wide mode + * to modify some machine state. + */ +static void __pfm_ctxsw_sys(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctx; + struct pfm_event_set *set; + + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) { + pr_info("prev=%d tif=%d ctx=%p next=%d tif=%d ctx=%p\n", + prev->pid, + test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW), + prev->pfm_context, + next->pid, + test_tsk_thread_flag(next, TIF_PERFMON_CTXSW), + next->pfm_context); + BUG_ON(!ctx); + } + + set = ctx->active_set; + + /* + * propagate TIF_PERFMON_CTXSW to ensure that: + * - previous task has TIF_PERFMON_CTXSW cleared, in case it is + * scheduled onto another CPU where there is syswide monitoring + * - next task has TIF_PERFMON_CTXSW set to ensure it will come back + * here when context switched out + */ + clear_tsk_thread_flag(prev, TIF_PERFMON_CTXSW); + set_tsk_thread_flag(next, TIF_PERFMON_CTXSW); + + /* + * nothing to do until actually started + * XXX: assumes no mean to start from user level + */ + if (!ctx->flags.started) + return; + + pfm_arch_ctxswout_sys(prev, ctx, set); + pfm_arch_ctxswin_sys(next, ctx, set); +} + +/* + * come here when either prev or next has TIF_PERFMON_CTXSW flag set + * Note that this is not because a task has TIF_PERFMON_CTXSW set that + * it has a context attached, e.g., in system-wide on certain arch. 
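+ * In system-wide mode, __pfm_ctxsw_sys() keeps propagating the flag from prev to next on every switch, so the incoming task also traps into this path even though it owns no per-thread context.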
+ */ +void pfm_ctxsw(struct task_struct *prev, struct task_struct *next) +{ + struct pfm_context *ctxp, *ctxn; + u64 now; + + now = sched_clock(); + + ctxp = prev->pfm_context; + ctxn = next->pfm_context; + + if (ctxp) + __pfm_ctxswout_thread(prev, ctxp, now); + + if (ctxn) + __pfm_ctxswin_thread(next, ctxn, now); + + /* + * given that prev and next can never be the same, this + * test is checking that ctxp == ctxn == NULL which is + * an indication we have an active system-wide session on + * this CPU that needs ctxsw intervention. Not all processors + * needs this, IA64 is one. + */ + if (ctxp == ctxn) + __pfm_ctxsw_sys(prev, next); + + __get_cpu_var(pfm_stats).ctxsw_count++; + __get_cpu_var(pfm_stats).ctxsw_ns += sched_clock() - now; +} Index: linux-2.6/perfmon/perfmon_dfl_smpl.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_dfl_smpl.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new default sampling buffer format + * for the perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("new perfmon default sampling format"); +MODULE_LICENSE("GPL"); + +static int pfm_dfl_fmt_validate(u32 ctx_flags, u16 npmds, void *data) +{ + struct pfm_dfl_smpl_arg *arg = data; + u64 min_buf_size; + + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * sanity check in case size_t is smaller then u64 + */ +#if BITS_PER_LONG == 4 +#define MAX_SIZE_T (1ULL<<(sizeof(size_t)<<3)) + if (sizeof(size_t) < sizeof(arg->buf_size)) { + if (arg->buf_size >= MAX_SIZE_T) + return -ETOOBIG; + } +#endif + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. + */ + min_buf_size = sizeof(struct pfm_dfl_smpl_hdr) + + (sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64))); + + PFM_DBG("validate ctx_flags=0x%x flags=0x%x npmds=%u " + "min_buf_size=%llu buf_size=%llu\n", + ctx_flags, + arg->buf_flags, + npmds, + (unsigned long long)min_buf_size, + (unsigned long long)arg->buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_dfl_fmt_get_size(u32 flags, void *data, size_t *size) +{ + struct pfm_dfl_smpl_arg *arg = data; + + /* + * size has been validated in default_validate + * we can never loose bits from buf_size. 
+ */ + *size = (size_t)arg->buf_size; + + return 0; +} + +static int pfm_dfl_fmt_init(struct pfm_context *ctx, void *buf, u32 ctx_flags, + u16 npmds, void *data) +{ + struct pfm_dfl_smpl_hdr *hdr; + struct pfm_dfl_smpl_arg *arg = data; + + hdr = buf; + + hdr->hdr_version = PFM_DFL_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_buf_flags = arg->buf_flags; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0; + hdr->hdr_count = 0; + hdr->hdr_min_buf_space = sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64)); + + PFM_DBG("buffer=%p buf_size=%llu hdr_size=%zu hdr_version=%u.%u " + "min_space=%llu npmds=%u", + buf, + (unsigned long long)hdr->hdr_buf_size, + sizeof(*hdr), + PFM_VERSION_MAJOR(hdr->hdr_version), + PFM_VERSION_MINOR(hdr->hdr_version), + (unsigned long long)hdr->hdr_min_buf_space, + npmds); + + return 0; +} + +/* + * called from pfm_overflow_handler() to record a new sample + * + * context is locked, interrupts are disabled (no preemption) + */ +static int pfm_dfl_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_dfl_smpl_hdr *hdr; + struct pfm_dfl_smpl_entry *ent; + void *cur, *last; + u64 *e; + size_t entry_size, min_size; + u16 npmds, i; + u16 ovfl_pmd; + + hdr = buf; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + min_size = hdr->hdr_min_buf_space; + + /* + * precheck for sanity + */ + if ((last - cur) < min_size) + goto full; + + npmds = arg->num_smpl_pmds; + + ent = (struct pfm_dfl_smpl_entry *)cur; + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (u64 *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%llu cur=%p last=%p free_bytes=%zu ovfl_pmd=%d " + "npmds=%u", + (unsigned long long)hdr->hdr_count, + cur, last, + (last-cur), + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is usually the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for(i=0; i < npmds; i++) { + *e++ = *val++; + } + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < min_size) + goto full; + + /* reset before returning from interrupt handler */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +full: + PFM_DBG_ovfl("sampling buffer full free=%zu, count=%llu", + last-cur, + (unsigned long long)hdr->hdr_count); + + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY| PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_dfl_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_dfl_smpl_hdr *hdr; + + hdr = buf; + + hdr->hdr_count = 0; + hdr->hdr_cur_offs = sizeof(*hdr); + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_dfl_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt dfl_fmt={ + .fmt_name = "default", + .fmt_version = 0x10000, + .fmt_arg_size = sizeof(struct pfm_dfl_smpl_arg), + .fmt_validate = pfm_dfl_fmt_validate, + .fmt_getsize = pfm_dfl_fmt_get_size, + .fmt_init = pfm_dfl_fmt_init, + .fmt_handler = pfm_dfl_fmt_handler, + .fmt_restart = pfm_dfl_fmt_restart, + .fmt_exit = pfm_dfl_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int pfm_dfl_fmt_init_module(void) +{ + return pfm_fmt_register(&dfl_fmt); +} + +static void pfm_dfl_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&dfl_fmt); +} + +module_init(pfm_dfl_fmt_init_module); +module_exit(pfm_dfl_fmt_cleanup_module); Index: linux-2.6/perfmon/perfmon_file.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_file.c @@ -0,0 +1,791 @@ +/* + * perfmon_file.c: perfmon2 file input/output functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ + +static inline int pfm_msgq_is_empty(struct pfm_context *ctx) +{ + return ctx->msgq_head == ctx->msgq_tail; +} + +static int pfmfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations pfmfs_dentry_operations = { + .d_delete = pfmfs_delete_dentry, +}; + +static union pfarg_msg *pfm_get_next_msg(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + PFM_DBG("in head=%d tail=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK); + + if (pfm_msgq_is_empty(ctx)) + return NULL; + + /* + * get oldest message + */ + msg = ctx->msgq + (ctx->msgq_tail & PFM_MSGQ_MASK); + + /* + * move tail forward + */ + ctx->msgq_tail++; + + PFM_DBG("out head=%d tail=%d type=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + msg->type); + + return msg; +} + +static struct page *pfm_buf_map_pagefault(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + void *kaddr; + struct pfm_context *ctx; + struct page *page; + size_t size; + + ctx = vma->vm_private_data; + if (ctx == NULL) { + PFM_DBG("no ctx"); + return NOPAGE_SIGBUS; + } + size = ctx->smpl_size; + + if ( (address < (unsigned long) vma->vm_start) || + (address >= (unsigned long) (vma->vm_start + size)) ) + return NOPAGE_SIGBUS; + + kaddr = ctx->smpl_addr + (address - vma->vm_start); + + if (type) + *type = VM_FAULT_MINOR; + + page = vmalloc_to_page(kaddr); + get_page(page); + + PFM_DBG("[%d] start=%p ref_count=%d", + current->pid, + kaddr, page_count(page)); + + return page; +} +/* + * we need to determine whether or not we are closing the last reference + * to the file and thus are going to end up in pfm_close() which eventually + * calls pfm_release_buf_space(). In that function, we update the accounting + * for locked_vm given that we are actually freeing the sampling buffer. The + * issue is that there are multiple paths leading to pfm_release_buf_space(), + * from exit(), munmap(), close(). The path coming from munmap() is problematic + * because do_munmap() grabs mmap_sem in write-mode which is also what + * pfm_release_buf_space does. To avoid deadlock, we need to determine where + * we are calling from and skip the locking. The vm_ops->close() callback + * is invoked for each remove_vma() independently of the number of references + * left on the file descriptor, therefore a simple reference counter does not + * work. We need to determine if this is the last call, and then set a flag + * to skip the locking. + */ +static void pfm_buf_map_close(struct vm_area_struct *vma) +{ + struct file *file; + struct pfm_context *ctx; + + file = vma->vm_file; + ctx = vma->vm_private_data; + + /* + * if file is going to close, then pfm_close() will + * be called, do not lock in pfm_release_buf + */ + if (atomic_read(&file->f_count) == 1) + ctx->flags.mmap_nlock = 1; +} + +/* + * we do not have a close callback because the locked + * memory accounting must be done when the actual buffer + * is freed. Munmap does not free the pages backing the vma + * because they may still be in use by the PMU interrupt handler. 
+ */ +struct vm_operations_struct pfm_buf_map_vm_ops = { + .nopage = pfm_buf_map_pagefault, + .close = pfm_buf_map_close +}; + +static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma, + size_t size) +{ + if (ctx->smpl_addr == NULL) { + PFM_DBG("no sampling buffer to map"); + return -EINVAL; + } + + if (size > ctx->smpl_size) { + PFM_DBG("mmap size=%zu >= actual buf size=%zu", + size, + ctx->smpl_size); + return -EINVAL; + } + + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_private_data = ctx; + + return 0; +} + +static int pfm_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size; + struct pfm_context *ctx; + unsigned long flags; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = file->private_data; + size = (vma->vm_end - vma->vm_start); + + if (ctx == NULL) + return -EINVAL; + + ret = -EINVAL; + + spin_lock_irqsave(&ctx->lock, flags); + + if (vma->vm_flags & VM_WRITE) { + PFM_DBG("cannot map buffer for writing"); + goto done; + } + + PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx", + vma->vm_pgoff, + size, + vma->vm_start); + + ret = pfm_mmap_buffer(ctx, vma, size); + if (ret == 0) + vma->vm_flags |= VM_RESERVED; + + PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu", + ret, + vma->vm_flags, + vma->vm_start, + vma->vm_end-vma->vm_start); +done: + spin_unlock_irqrestore(&ctx->lock, flags); + + return ret; +} + +/* + * Extract one message from queue. + * + * return: + * -EAGAIN: when non-blocking and nothing is* in the queue. + * -ERESTARTSYS: when blocking and signal is pending + * Otherwise returns size of message (sizeof(pfarg_msg)) + */ +ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block) +{ + union pfarg_msg *msg; + ssize_t ret = 0; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + + /* + * we must masks interrupts to avoid a race condition + * with the PMU interrupt handler. + */ + spin_lock_irqsave(&ctx->lock, flags); + + while (pfm_msgq_is_empty(ctx)) { + + /* + * handle non-blocking reads + * return -EAGAIN + */ + ret = -EAGAIN; + if (non_block) + break; + + add_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock_irqrestore(&ctx->lock, flags); + + schedule(); + + /* + * during this window, another thread may call + * pfm_read() and steal our message + */ + + spin_lock_irqsave(&ctx->lock, flags); + + remove_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_RUNNING); + + /* + * check for pending signals + * return -ERESTARTSYS + */ + ret = -ERESTARTSYS; + if(signal_pending(current)) + break; + + /* + * we may have a message + */ + ret = 0; + } + + /* + * extract message + */ + if (ret == 0) { + msg = pfm_get_next_msg(ctx); + BUG_ON(msg == NULL); + + /* + * we must make a local copy before we unlock + * to ensure that the message queue cannot fill + * (overwriting our message) up before + * we do copy_to_user() which cannot be done + * with interrupts masked. + */ + *msg_buf = *msg; + + ret = sizeof(*msg); + + PFM_DBG("extracted type=%d", msg->type); + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("blocking=%d ret=%zd", non_block, ret); + + return ret; +} + +static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, + loff_t *ppos) +{ + struct pfm_context *ctx; + union pfarg_msg msg_buf; + int non_block, ret; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx for pfm_read"); + return -EINVAL; + } + + /* + * cannot extract partial messages. 
+ * check even when there is no message + * + * cannot extract more than one message per call. Bytes + * above sizeof(msg) are ignored. + */ + if (size < sizeof(msg_buf)) { + PFM_DBG("message is too small size=%zu must be >=%zu)", + size, + sizeof(msg_buf)); + return -EINVAL; + } + + non_block = filp->f_flags & O_NONBLOCK; + + ret = __pfm_read(ctx, &msg_buf, non_block); + if (ret > 0) { + if(copy_to_user(buf, &msg_buf, sizeof(msg_buf))) + ret = -EFAULT; + } + return ret; +} + +static ssize_t pfm_write(struct file *file, const char __user *ubuf, + size_t size, loff_t *ppos) +{ + PFM_DBG("pfm_write called"); + return -EINVAL; +} + +static unsigned int pfm_poll(struct file *filp, poll_table *wait) +{ + struct pfm_context *ctx; + unsigned long flags; + unsigned int mask = 0; + + PFM_DBG("pfm_file_ops"); + + if (filp->f_op != &pfm_file_ops) { + PFM_ERR("pfm_poll bad magic"); + return 0; + } + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_poll no ctx"); + return 0; + } + + PFM_DBG("before poll_wait"); + + poll_wait(filp, &ctx->msgq_wait, wait); + + /* + * pfm_msgq_is_empty() is non-atomic + * + * filp is protected by fget() at upper level + * context cannot be closed by another thread. + * + * There may be a race with a PMU interrupt adding + * messages to the queue. But we are interested in + * queue not empty, so adding more messages should + * not really be a problem. + * + * There may be a race with another thread issuing + * a read() and stealing messages from the queue thus + * may return the wrong answer. This could potentially + * lead to a blocking read, because nothing is + * available in the queue + */ + spin_lock_irqsave(&ctx->lock, flags); + + if (!pfm_msgq_is_empty(ctx)) + mask = POLLIN | POLLRDNORM; + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("after poll_wait mask=0x%x", mask); + + return mask; +} + +static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + PFM_DBG("pfm_ioctl called"); + return -EINVAL; +} + +/* + * interrupt cannot be masked when entering this function + */ +static inline int __pfm_fasync(int fd, struct file *filp, + struct pfm_context *ctx, int on) +{ + int ret; + + ret = fasync_helper (fd, filp, on, &ctx->async_queue); + + PFM_DBG("fd=%d on=%d async_q=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +static int pfm_fasync(int fd, struct file *filp, int on) +{ + struct pfm_context *ctx; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_fasync no ctx"); + return -EBADF; + } + + /* + * we cannot mask interrupts during this call because this + * may go to sleep if memory is not readily available. + * + * We are protected from the context disappearing by the + * get_fd()/put_fd() done in caller. Serialization of this function + * is ensured by caller. + */ + ret = __pfm_fasync(fd, filp, ctx, on); + + PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +#ifdef CONFIG_SMP +static void __pfm_close_remote_cpu(void *info) +{ + struct pfm_context *ctx = info; + int can_release; + + BUG_ON(ctx != __get_cpu_var(pmu_ctx)); + + /* + * we are in IPI interrupt handler which always has higher + * priority than PMU interrupt, therefore we do not need to + * mask interrupts. context locking is not needed because we + * are in close(), no more user references. 
+ * + * can_release is ignored, release done on calling CPU + */ + __pfm_unload_context(ctx, &can_release); + + /* + * we cannot free context here because we are in_interrupt(). + * we free on the calling CPU + */ +} + +static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx) +{ + BUG_ON(irqs_disabled()); + return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 0, 1); +} +#endif /* CONFIG_SMP */ + +/* + * called either on explicit close() or from exit_files(). + * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. + */ +int __pfm_close(struct pfm_context *ctx, struct file *filp) +{ + unsigned long flags; + int state; + int can_free = 1, can_unload = 1; + int is_system, can_release = 0; + u32 cpu; + + spin_lock_irqsave(&ctx->lock, flags); + + state = ctx->state; + is_system = ctx->flags.system; + cpu = ctx->cpu; + + PFM_DBG("state=%d", state); + + /* + * check if unload is needed + */ + if (state == PFM_CTX_UNLOADED) + goto doit; + +#ifdef CONFIG_SMP + /* + * we need to release the resource on the ORIGINAL cpu. + * we need to release the context lock to avoid deadlocks + * on the original CPU, especially in the context switch + * routines. It is safe to unlock because we are in close(), + * in other words, there is no more access from user level. + * we can also unmask interrupts on this CPU because the + * context is running on the original CPU. Context will be + * unloaded and the session will be released on the original + * CPU. Upon return, the caller is guaranteed that the context + * is gone from original CPU. + */ + if (is_system && cpu != smp_processor_id()) { + spin_unlock_irqrestore(&ctx->lock, flags); + pfm_close_remote_cpu(cpu, ctx); + can_release = 1; + goto free_it; + } + + if (!is_system && ctx->task != current) { + /* + * switch context to zombie state + */ + ctx->state = PFM_CTX_ZOMBIE; + + PFM_DBG("zombie ctx for [%d]", ctx->task->pid); + /* + * must check if other thread is using block overflow + * notification mode. If so make sure it will not block + * because there will not be any pfm_restart() issued. 
+ * When the thread notices the ZOMBIE state, it will clean + * up what is left of the context + */ + if (state == PFM_CTX_MASKED && ctx->flags.block) { + /* + * force task to wake up from MASKED state + */ + PFM_DBG("waking up [%d]", ctx->task->pid); + + complete(&ctx->restart_complete); + } + /* + * PMU session will be release by monitored task when it notices + * ZOMBIE state as part of pfm_unload_context() + */ + can_unload = can_free = 0; + } +#endif + if (can_unload) + __pfm_unload_context(ctx, &can_release); +doit: + spin_unlock_irqrestore(&ctx->lock, flags); + +#ifdef CONFIG_SMP +free_it: +#endif + if (can_release) + pfm_release_session(is_system, cpu); + + if (can_free) + pfm_context_free(ctx); + + return 0; +} + +static int pfm_close(struct inode *inode, struct file *filp) +{ + struct pfm_context *ctx; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx"); + return -EBADF; + } + return __pfm_close(ctx, filp); +} + +static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) +{ + PFM_DBG("pfm_file_ops"); + + return -ENXIO; +} + +/* + * pfm_flush() is called from filp_close() on every call to + * close(). pfm_close() is only invoked when the last user + * calls close(). pfm_close() is never invoked without + * pfm_flush() being invoked first. + * + * Partially free resources: + * - remove from fasync queue + */ +static int pfm_flush(struct file *filp, fl_owner_t id) +{ + struct pfm_context *ctx; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_flush no ctx"); + return -EBADF; + } + + /* + * remove our file from the async queue, if we use this mode. + * This can be done without the context being protected. We come + * here when the context has become unreacheable by other tasks. + * + * We may still have active monitoring at this point and we may + * end up in pfm_overflow_handler(). However, fasync_helper() + * operates with interrupts disabled and it cleans up the + * queue. If the PMU handler is called prior to entering + * fasync_helper() then it will send a signal. If it is + * invoked after, it will find an empty queue and no + * signal will be sent. In both case, we are safe + */ + if (filp->f_flags & FASYNC) { + PFM_DBG("cleaning up async_queue=%p", ctx->async_queue); + __pfm_fasync (-1, filp, ctx, 0); + } + return 0; +} + +const struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .poll = pfm_poll, + .ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open to disallow open via /proc */ + .fasync = pfm_fasync, + .release = pfm_close, + .flush= pfm_flush, + .mmap = pfm_mmap +}; + +static int pfmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); +} + +static struct file_system_type pfm_fs_type = { + .name = "pfmfs", + .get_sb = pfmfs_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * pfmfs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pfm: will go nicely and kill the special-casing in procfs. 
+ */ +static struct vfsmount *pfmfs_mnt; + +int __init pfm_init_fs(void) +{ + int err = register_filesystem(&pfm_fs_type); + if (!err) { + pfmfs_mnt = kern_mount(&pfm_fs_type); + err = PTR_ERR(pfmfs_mnt); + if (IS_ERR(pfmfs_mnt)) + unregister_filesystem(&pfm_fs_type); + else + err = 0; + } + return err; +} + +static void __exit exit_pfm_fs(void) +{ + unregister_filesystem(&pfm_fs_type); + mntput(pfmfs_mnt); +} + +int pfm_alloc_fd(struct file **cfile) +{ + int fd, ret = 0; + struct file *file = NULL; + struct inode * inode; + char name[32]; + struct qstr this; + + fd = get_unused_fd(); + if (fd < 0) + return -ENFILE; + + ret = -ENFILE; + + file = get_empty_filp(); + if (!file) + goto out; + + /* + * allocate a new inode + */ + inode = new_inode(pfmfs_mnt->mnt_sb); + if (!inode) + goto out; + + PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode); + + inode->i_sb = pfmfs_mnt->mnt_sb; + inode->i_mode = S_IFCHR|S_IRUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.hash = inode->i_ino; + this.len = strlen(name); + + ret = -ENOMEM; + + /* + * allocate a new dcache entry + */ + file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) + goto out; + + file->f_dentry->d_op = &pfmfs_dentry_operations; + + d_add(file->f_dentry, inode); + file->f_vfsmnt = mntget(pfmfs_mnt); + file->f_mapping = inode->i_mapping; + + file->f_op = &pfm_file_ops; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->f_pos = 0; + + *cfile = file; + + return fd; +out: + if (file) + put_filp(file); + put_unused_fd(fd); + return ret; +} Index: linux-2.6/perfmon/perfmon_fmt.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_fmt.c @@ -0,0 +1,218 @@ +/* + * perfmon_fmt.c: perfmon2 sampling buffer format management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_smpl_fmt_lock); +static LIST_HEAD(pfm_smpl_fmt_list); + +static inline int fmt_is_mod(struct pfm_smpl_fmt *f) +{ + return !(f->fmt_flags & PFM_FMTFL_IS_BUILTIN); +} + +static struct pfm_smpl_fmt *pfm_find_fmt(char *name) +{ + struct pfm_smpl_fmt * entry; + + list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { + if (!strcmp(entry->fmt_name, name)) + return entry; + } + return NULL; +} +/* + * find a buffer format based on its name + */ +struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name) +{ + struct pfm_smpl_fmt * fmt; + + spin_lock(&pfm_smpl_fmt_lock); + + fmt = pfm_find_fmt(name); + + /* + * increase module refcount + */ + if (fmt && fmt_is_mod(fmt) && !try_module_get(fmt->owner)) + fmt = NULL; + + spin_unlock(&pfm_smpl_fmt_lock); + + return fmt; +} + +void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt) +{ + if (fmt == NULL || !fmt_is_mod(fmt)) + return; + BUG_ON(fmt->owner == NULL); + + spin_lock(&pfm_smpl_fmt_lock); + module_put(fmt->owner); + spin_unlock(&pfm_smpl_fmt_lock); +} + +int pfm_fmt_register(struct pfm_smpl_fmt *fmt) +{ + int ret = 0; + + if (perfmon_disabled) { + PFM_INFO("perfmon disabled, cannot add sampling format"); + return -ENOSYS; + } + + /* some sanity checks */ + if (fmt == NULL) { + PFM_INFO("perfmon: NULL format for register"); + return -EINVAL; + } + + if (fmt->fmt_name == NULL) { + PFM_INFO("perfmon: format has no name"); + return -EINVAL; + } + + if (fmt->fmt_qdepth > PFM_MSGS_COUNT) { + PFM_INFO("perfmon: format %s requires %u msg queue depth (max %d)", + fmt->fmt_name, + fmt->fmt_qdepth, + PFM_MSGS_COUNT); + return -EINVAL; + } + + /* + * fmt is missing the initialization of .owner = THIS_MODULE + * this is only valid when format is compiled as a module + */ + if (fmt->owner == NULL && fmt_is_mod(fmt)) { + PFM_INFO("format %s has no module owner", fmt->fmt_name); + return -EINVAL; + } + /* + * we need at least a handler + */ + if (fmt->fmt_handler == NULL) { + PFM_INFO("format %s has no handler", fmt->fmt_name); + return -EINVAL; + } + + /* + * format argument size cannot be bigger than PAGE_SIZE + */ + if (fmt->fmt_arg_size > PAGE_SIZE) { + PFM_INFO("format %s arguments too big", fmt->fmt_name); + return -EINVAL; + } + + spin_lock(&pfm_smpl_fmt_lock); + + /* + * because of sysfs, we cannot have two formats with the same name + */ + if (pfm_find_fmt(fmt->fmt_name)) { + PFM_INFO("format %s already registered", fmt->fmt_name); + ret = -EBUSY; + goto out; + } + + ret = pfm_sysfs_add_fmt(fmt); + if (ret) { + PFM_INFO("sysfs cannot add format entry for %s", fmt->fmt_name); + goto out; + } + + list_add(&fmt->fmt_list, &pfm_smpl_fmt_list); + + PFM_INFO("added sampling format %s", fmt->fmt_name); +out: + spin_unlock(&pfm_smpl_fmt_lock); + + return ret; +} +EXPORT_SYMBOL(pfm_fmt_register); + +int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt) +{ + struct pfm_smpl_fmt *fmt2; + int ret = 0; + + if (!fmt || !fmt->fmt_name) { + PFM_DBG("invalid fmt"); + return -EINVAL; + } + + spin_lock(&pfm_smpl_fmt_lock); + + fmt2 = pfm_find_fmt(fmt->fmt_name); + if (!fmt2) { + PFM_INFO("unregister failed, format not registered"); + ret = -EINVAL; + goto out; + } + list_del_init(&fmt->fmt_list); + + pfm_sysfs_remove_fmt(fmt); + + PFM_INFO("removed sampling format: %s", fmt->fmt_name); + +out: + 
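+ /* common exit: drop the format list lock whether or not the format was found */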
spin_unlock(&pfm_smpl_fmt_lock); + return ret; + +} +EXPORT_SYMBOL(pfm_fmt_unregister); + +/* + * we defer adding the builtin formats to /sys/kernel/perfmon/formats + * until after the pfm sysfs subsystem is initialized. This function + * is called from pfm_sysfs_init() + */ +void pfm_sysfs_builtin_fmt_add(void) +{ + struct pfm_smpl_fmt * entry; + + /* + * locking not needed, kernel not fully booted + * when called + */ + list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { + pfm_sysfs_add_fmt(entry); + } +} Index: linux-2.6/perfmon/perfmon_intr.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_intr.c @@ -0,0 +1,565 @@ +/* + * perfmon_intr.c: perfmon2 interrupt handling + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +void pfm_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 now; + + PFM_DBG_ovfl("masking monitoring"); + + now = sched_clock(); + + /* + * we save the PMD values such that we can read them while + * MASKED without having to have the thread stopped which + * is uncessary because monitoring is stopped + * + * XXX: could be avoided in system-wide + */ + pfm_save_pmds(ctx, set); + pfm_arch_mask_monitoring(ctx, set); + /* + * accumulate the set duration up to this point + */ + set->duration += now - set->duration_start; +} +EXPORT_SYMBOL(pfm_mask_monitoring); + +/* + * main overflow processing routine. + * + * set->num_ovfl_pmds is 0 when returning from this function even though + * set->ovfl_pmds[] may have bits set. When leaving set->num_ovfl_pmds + * must never be used to determine if there was a pending overflow. 
+ */ +static void pfm_overflow_handler(struct pfm_context *ctx, struct pfm_event_set *set, + unsigned long ip, + struct pt_regs *regs) +{ + struct pfm_ovfl_arg *ovfl_arg; + struct pfm_event_set *set_orig; + void *hdr; + u64 old_val, ovfl_mask, new_val, ovfl_thres; + u64 *ovfl_notify, *ovfl_pmds, *pend_ovfls; + u64 *smpl_pmds, *reset_pmds; + u64 now, t0, t1; + u32 ovfl_ctrl, num_ovfl, num_ovfl_orig; + u16 i, max_pmd, max_cnt_pmd, first_cnt_pmd; + u8 must_switch, has_64b_ovfl; + u8 ctx_block, has_notify, has_ovfl_sw; + + now = t0 = sched_clock(); + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + max_pmd = pfm_pmu_conf->regs.max_pmd; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + ovfl_pmds = set->ovfl_pmds; + num_ovfl = num_ovfl_orig = set->npend_ovfls; + pend_ovfls = set->povfl_pmds; + has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH; + set_orig = set; + + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) + goto stop_monitoring; + + must_switch = has_64b_ovfl = 0; + + hdr = ctx->smpl_addr; + + PFM_DBG_ovfl("ovfl_pmds=0x%llx npend=%u ip=%p, blocking=%d " + "u_pmds=0x%llx use_fmt=%u", + (unsigned long long)pend_ovfls[0], + num_ovfl, + (void *)ip, + ctx->flags.block, + (unsigned long long)set->used_pmds[0], + ctx->smpl_fmt != NULL); + + /* + * initialize temporary bitvectors + * we allocate bitvectors in the context + * rather than on the stack to minimize stack + * space consumption. PMU interrupt is very high + * which implies possible deep nesting of interrupt + * hence limited kernel stack space. + * + * This is safe because a context can only be in the + * overflow handler once at a time + */ + reset_pmds = set->reset_pmds; + ovfl_notify = ctx->ovfl_ovfl_notify; + + bitmap_zero(cast_ulp(reset_pmds), max_pmd); + + /* + * first we update the virtual counters + * + * we leverage num_ovfl to minimize number of + * iterations of the loop. + * + * The i < max_cnt_pmd is just a sanity check + */ + for (i = first_cnt_pmd; num_ovfl && i < max_cnt_pmd; i++) { + /* + * skip pmd which did not overflow + */ + if (!test_bit(i, cast_ulp(pend_ovfls))) + continue; + + num_ovfl--; + + /* + * Note that the pmd is not necessarily 0 at this point as + * qualified events may have happened before the PMU was + * frozen. 
The residual count is not taken into consideration + * here but will be with any read of the pmd + */ + old_val = new_val = set->pmds[i].value; + ovfl_thres = set->pmds[i].ovflsw_thres; + new_val += 1 + ovfl_mask; + set->pmds[i].value = new_val; + + /* + * check for overflow condition + */ + if (likely(old_val > new_val)) { + PFM_DBG_ovfl("64 bit overflow of PMD%d", i); + has_64b_ovfl = 1; + if (has_ovfl_sw && ovfl_thres > 0) { + if (ovfl_thres == 1) + must_switch = 1; + set->pmds[i].ovflsw_thres = ovfl_thres - 1; + } + + /* + * what to reset because of this overflow + */ + __set_bit(i, cast_ulp(reset_pmds)); + + bitmap_or(cast_ulp(reset_pmds), + cast_ulp(reset_pmds), + cast_ulp(set->pmds[i].reset_pmds), + max_pmd); + + } else { + PFM_DBG_ovfl("Hardware counter overflow of PMD%d=0x%04Lx", + i, + (unsigned long long)new_val); + /* only keep track of 64-bit overflows */ + __clear_bit(i, cast_ulp(pend_ovfls)); + /* + * on some PMU, it may be necessary to re-arm the PMD + */ + pfm_arch_ovfl_reset_pmd(ctx, i); + } + + PFM_DBG_ovfl("pmd%u=0x%llx old_val=0x%llx " + "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u " + "o_thres=%llu o_thres_ref=%llu", + i, + (unsigned long long)new_val, + (unsigned long long)old_val, + (unsigned long long)pfm_read_pmd(ctx, i), + (unsigned long long)ovfl_pmds[0], + must_switch, + (unsigned long long)set->pmds[i].ovflsw_thres, + (unsigned long long)set->pmds[i].ovflsw_ref_thres); + } + + /* + * mark the overflow as consumed + */ + set->npend_ovfls = 0; + + ctx_block = ctx->flags.block; + + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt0 += t1 - t0; + t0 = t1; + + /* + * there was no 64-bit overflow, nothing else to do + */ + if (!has_64b_ovfl) + return; + + /* + * copy pending_ovfls to ovfl_pmd. It is used in + * the notification message or getinfo_evtsets(). + * + * pend_ovfls modified to reflect only 64-bit overflows + */ + bitmap_copy(cast_ulp(ovfl_pmds), + cast_ulp(pend_ovfls), + max_cnt_pmd); + + /* + * build ovfl_notify bitmask from ovfl_pmds + */ + bitmap_and(cast_ulp(ovfl_notify), + cast_ulp(pend_ovfls), + cast_ulp(set->ovfl_notify), + max_cnt_pmd); + + has_notify = !bitmap_empty(cast_ulp(ovfl_notify), max_cnt_pmd); + /* + * must reset for next set of overflows + */ + bitmap_zero(cast_ulp(pend_ovfls), max_cnt_pmd); + + /* + * check for format + */ + if (likely(ctx->smpl_fmt)) { + u64 start_cycles, end_cycles; + u64 *cnt_pmds; + int j, k, ret = 0; + + ovfl_ctrl = 0; + num_ovfl = num_ovfl_orig; + ovfl_arg = &ctx->ovfl_arg; + cnt_pmds = pfm_pmu_conf->regs.cnt_pmds; + + ovfl_arg->active_set = set->id; + + for (i = first_cnt_pmd; num_ovfl && !ret; i++) { + + if (!test_bit(i, cast_ulp(ovfl_pmds))) + continue; + + num_ovfl--; + + ovfl_arg->ovfl_pmd = i; + ovfl_arg->ovfl_ctrl = 0; + + ovfl_arg->pmd_last_reset = set->pmds[i].lval; + ovfl_arg->pmd_eventid = set->pmds[i].eventid; + + /* + * copy values of pmds of interest. 
+ * Sampling format may use them + * We do not initialize the unused smpl_pmds_values + */ + k = 0; + smpl_pmds = set->pmds[i].smpl_pmds; + if (!bitmap_empty(cast_ulp(smpl_pmds), max_pmd)) { + + for (j = 0; j < max_pmd; j++) { + + if (!test_bit(j, cast_ulp(smpl_pmds))) + continue; + + new_val = pfm_read_pmd(ctx, j); + + /* for counters, build 64-bit value */ + if (test_bit(j, cast_ulp(cnt_pmds))) { + new_val = (set->pmds[j].value & ~ovfl_mask) + | (new_val & ovfl_mask); + } + ovfl_arg->smpl_pmds_values[k++] = new_val; + + PFM_DBG_ovfl("s_pmd_val[%u]=" + "pmd%u=0x%llx", + k, j, + (unsigned long long)new_val); + } + } + ovfl_arg->num_smpl_pmds = k; + + __get_cpu_var(pfm_stats).fmt_handler_calls++; + + start_cycles = sched_clock(); + + /* + * call custom buffer format record (handler) routine + */ + ret = (*ctx->smpl_fmt->fmt_handler)(hdr, + ovfl_arg, + ip, + now, + regs); + + end_cycles = sched_clock(); + + /* + * for PFM_OVFL_CTRL_MASK and PFM_OVFL_CTRL_NOTIFY + * we take the union + * + * The reset_pmds mask is constructed automatically + * on overflow. When the actual reset takes place + * depends on the masking, switch and notification + * status. It may be deferred until pfm_restart(). + */ + ovfl_ctrl |= ovfl_arg->ovfl_ctrl; + + __get_cpu_var(pfm_stats).fmt_handler_ns += end_cycles + - start_cycles; + } + /* + * when the format cannot handle the rest of the overflow, + * we abort right here + */ + if (ret) { + PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", + i, ret); + } + } else { + /* + * When no sampling format is used, the default + * is: + * - mask monitoring + * - notify user if requested + * + * If notification is not requested, monitoring is masked + * and overflowed counters are not reset (saturation). + * This mimics the behavior of the default sampling format. + */ + ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY; + + if (!must_switch || has_notify) + ovfl_ctrl |= PFM_OVFL_CTRL_MASK; + } + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt1 += t1 - t0; + t0 = t1; + + PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx " + "r_pmds=0x%llx ovfl_ctrl=0x%x", + set->id, + (unsigned long long)ovfl_notify[0], + (unsigned long long)ovfl_pmds[0], + (unsigned long long)reset_pmds[0], + ovfl_ctrl); + + /* + * we only reset (short reset) when we are not masking. Otherwise + * the reset is postponed until restart. 
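+ *
+ * Recap of how ovfl_ctrl is acted upon below (a summary of the code that
+ * follows, not an additional rule):
+ *
+ *   !CTRL_MASK && must_switch  -> pfm_switch_sets_from_intr()
+ *   !CTRL_MASK && CTRL_RESET   -> short reset of reset_pmds now
+ *    CTRL_MASK                 -> pfm_mask_monitoring(), state = MASKED,
+ *                                 reset deferred to pfm_restart()
+ *    CTRL_NOTIFY && has_notify -> notification queued last (blocking
+ *                                 only if the context requested it)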
+ */ + if (likely(!(ovfl_ctrl & PFM_OVFL_CTRL_MASK))) { + if (must_switch) { + /* + * pfm_switch_sets() takes care + * of resetting new set if needed + */ + pfm_switch_sets_from_intr(ctx); + + /* + * update our view of the active set + */ + set = ctx->active_set; + + must_switch = 0; + } else if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) { + u16 nn; + t0 = sched_clock(); + nn = bitmap_weight(cast_ulp(reset_pmds), max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT); + __get_cpu_var(pfm_stats).ccnt5 += sched_clock() - t0; + } + /* + * do not block if not masked + */ + ctx_block = 0; + } else { + pfm_mask_monitoring(ctx, set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + } + /* + * if we have not switched here, then remember for the + * time monitoring is restarted + */ + if (must_switch) + set->priv_flags |= PFM_SETFL_PRIV_SWITCH; + + /* + * block only if CTRL_NOTIFY+CTRL_MASK and requested by user + * + * Defer notification until last operation in the handler + * to avoid spinlock contention + */ + if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) { + if (ctx_block) { + ctx->flags.work_type = PFM_WORK_BLOCK; + set_thread_flag(TIF_PERFMON_WORK); + } + /* + * if message queue is full, then mask monitoring + * and wait for pfm_restart() + */ + if (pfm_ovfl_notify_user(ctx, set_orig, ip)) { + pfm_mask_monitoring(ctx, set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + } + } + + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt2 += t1 - t0; + + return; + +stop_monitoring: + /* + * Does not happen for a system-wide context nor for a + * self-monitored context. We cannot attach to kernel-only + * thread, thus it is safe to set TIF bits, i.e., the thread + * will eventually leave the kernel or die and either we will + * catch the context and clean it up in pfm_handler_work() or + * pfm_exit_thread(). + */ + PFM_DBG_ovfl("ctx is zombie, converted to spurious"); + + __pfm_stop(ctx); + ctx->flags.work_type = PFM_WORK_ZOMBIE; + set_thread_flag(TIF_PERFMON_WORK); +} + +/* + * interrupts are masked + * + * Context locking necessary to avoid concurrent accesses from other CPUs + * - For per-thread, we must prevent pfm_restart() which works when + * context is LOADED or MASKED + */ +static void __pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs) +{ + struct task_struct *task; + struct pfm_context *ctx; + struct pfm_event_set *set; + u64 t0; + + t0 = sched_clock(); + __get_cpu_var(pfm_stats).ovfl_intr_all_count++; + + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + if (unlikely(ctx == NULL)) { + PFM_DBG_ovfl("no ctx"); + goto spurious; + } + + spin_lock(&ctx->lock); + + set = ctx->active_set; + + /* + * For SMP per-thread, it is not possible to have + * owner != NULL && task != current. + * + * For UP per-thread, because of lazy save, it + * is possible to receive an interrupt in another task + * which is not using the PMU. This means + * that the interrupt was in-flight at the + * time of pfm_ctxswout_thread(). In that + * case it will be replayed when the task + * is scheduled again. Hence we convert to spurious. + * + * The basic rule is that an overflow is always + * processed in the context of the task that + * generated it for all per-thread contexts. 
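+ *
+ * Concrete UP example of the case handled below: task A owns the PMU, an
+ * overflow interrupt is still in flight when A is switched out (lazy save
+ * leaves the PMU state live) and task B becomes current; the handler then
+ * sees current->pfm_context != ctx, treats the interrupt as spurious and
+ * leaves it to be replayed when A runs again.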
+ * + * for system-wide, task is always NULL + */ +#ifndef CONFIG_SMP + if (unlikely((task && current->pfm_context != ctx))) { + PFM_DBG_ovfl("spurious: not owned by current task"); + goto spurious; + } +#endif + if (unlikely(!pfm_arch_is_active(ctx))) { + PFM_DBG_ovfl("spurious: monitoring non active"); + goto spurious; + } + + /* + * freeze PMU and collect overflowed PMD registers + * into set->povfl_pmds. Number of overflowed PMDs reported + * in set->npend_ovfls + */ + pfm_arch_intr_freeze_pmu(ctx, set); + if (unlikely(!set->npend_ovfls)) { + PFM_DBG_ovfl("no npend_ovfls"); + goto spurious; + } + + __get_cpu_var(pfm_stats).ovfl_intr_regular_count++; + + __get_cpu_var(pfm_stats).ccnt3 += sched_clock() - t0; + + pfm_overflow_handler(ctx, set, iip, regs); + + pfm_arch_intr_unfreeze_pmu(ctx); + + __get_cpu_var(pfm_stats).ccnt4 += sched_clock() - t0; + + spin_unlock(&ctx->lock); + + return; + +spurious: + /* ctx may be NULL */ + pfm_arch_intr_unfreeze_pmu(ctx); + if (ctx) + spin_unlock(&ctx->lock); +} + +void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs) +{ + u64 start; + + BUG_ON(!irqs_disabled()); + + start = sched_clock(); + + __pfm_interrupt_handler(iip, regs); + + __get_cpu_var(pfm_stats).ovfl_intr_ns += sched_clock() - start; +} +EXPORT_SYMBOL(pfm_interrupt_handler); + Index: linux-2.6/perfmon/perfmon_pmu.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_pmu.c @@ -0,0 +1,567 @@ +/* + * perfmon_pmu.c: perfmon2 PMU configuration management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +#ifndef CONFIG_MODULE_UNLOAD +#define module_refcount(n) 1 +#endif + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock); +static __cacheline_aligned_in_smp int request_mod_in_progress; +/* + * perfmon core must acces PMU information ONLY through pfm_pmu_conf + * if pfm_pmu_conf is NULL, then no description is registered + */ +struct pfm_pmu_config *pfm_pmu_conf; +EXPORT_SYMBOL(pfm_pmu_conf); + +static inline int pmu_is_module(struct pfm_pmu_config *c) +{ + return !(c->flags & PFM_PMUFL_IS_BUILTIN); +} + +/* + * compute the following: + * - max_pmc, num_pmcs + * - max_pmd, num_pmds, first_cnt_pmd, max_rw_pmd + * based on existing regdesc and valid pmc_desc and pmd_desc + */ +static void pfm_pmu_regdesc_calc_limits(struct pfm_regdesc *d) +{ + u16 n, n2, n_counters, i; + int max1, max2, max3, first_cnt, first_i; + + n = 0; + max1 = max2 = -1; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++) { + if (!test_bit(i, cast_ulp(d->pmcs))) + continue; + max1 = i; + n++; + } + d->max_pmc = max1 + 1; + d->num_pmcs = n; + + n = n_counters = n2 = 0; + max1 = max2 = max3 = first_cnt = first_i = -1; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++) { + if (!test_bit(i, cast_ulp(d->pmds))) + continue; + + if (first_i == -1) + first_i = i; + + max1 = i; + n++; + + /* + * read-write registers + */ + if (!(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) { + max3 = i; + n2++; + } + + /* + * counters registers + */ + if (pfm_pmu_conf->pmd_desc[i].type & PFM_REG_C64) { + max2 = i; + n_counters++; + if (first_cnt == -1) + first_cnt = i; + } + } + d->max_pmd = max1 + 1; + d->first_cnt_pmd = first_cnt == -1 ? first_i : first_cnt; + + /* guaranteed to be <= PFM_MAX_HW_PMDS */ + d->max_cnt_pmd = max2 + 1; + + d->num_counters = n_counters; + d->num_pmds = n; + d->max_rw_pmd = max3 + 1; + d->num_rw_pmd = n2; +} + +static int pfm_regdesc_init(struct pfm_regdesc *d, struct pfm_pmu_config *cfg) +{ + u16 n, n2, n_counters, i; + int max1, max2, max3, first_cnt, first_i; + + memset(d, 0 , sizeof(*d)); + /* + * compute the number of implemented PMC from the + * description tables + * + * We separate actual PMC registers from virtual + * PMC registers. Needed for PMC save/restore routines. 
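+ *
+ * Worked example of the limits computed below (register indices are made
+ * up for illustration): with PFM_REG_I set for PMDs {0, 1, 3, 17} and
+ * PFM_REG_C64 set only for {0, 1, 3}:
+ *
+ *   num_pmds = 4, max_pmd = 18 (highest implemented index + 1)
+ *   num_counters = 3, max_cnt_pmd = 4, first_cnt_pmd = 0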
+ */ + n = 0; + max1 = max2 = -1; + for (i = 0; i < cfg->num_pmc_entries; i++) { + if (!(cfg->pmc_desc[i].type & PFM_REG_I)) + continue; + + __set_bit(i, cast_ulp(d->pmcs)); + + max1 = i; + n++; + } + + if (!n) { + PFM_INFO("%s PMU description has no PMC registers", + cfg->pmu_name); + return -EINVAL; + } + + d->max_pmc = max1 + 1; + d->num_pmcs = n; + + n = n_counters = n2 = 0; + max1 = max2 = max3 = first_cnt = first_i = -1; + for (i = 0; i < cfg->num_pmd_entries; i++) { + if (!(cfg->pmd_desc[i].type & PFM_REG_I)) + continue; + + if (first_i == -1) + first_i = i; + + __set_bit(i, cast_ulp(d->pmds)); + max1 = i; + n++; + + /* + * read-write registers + */ + if (!(cfg->pmd_desc[i].type & PFM_REG_RO)) { + __set_bit(i, cast_ulp(d->rw_pmds)); + max3 = i; + n2++; + } + + /* + * counters registers + */ + if (cfg->pmd_desc[i].type & PFM_REG_C64) { + __set_bit(i, cast_ulp(d->cnt_pmds)); + max2 = i; + n_counters++; + if (first_cnt == -1) + first_cnt = i; + } + } + + if (!n) { + PFM_INFO("%s PMU description has no PMD registers", + cfg->pmu_name); + return -EINVAL; + } + + d->max_pmd = max1 + 1; + d->first_cnt_pmd = first_cnt == -1 ? first_i : first_cnt; + + /* guaranteed to be <= PFM_MAX_HW_PMDS */ + d->max_cnt_pmd = max2 + 1; + + d->num_counters = n_counters; + d->num_pmds = n; + d->max_rw_pmd = max3 + 1; + d->num_rw_pmd = n2; + + return 0; +} + +/* + * initialize PMU configuration from PMU config descriptor + */ +static int pfm_pmu_config_init(struct pfm_pmu_config *cfg) +{ + int ret; + + /* we build the register description using the full mapping + * table as defined by the module. For the first user, we update + * the current description (regs) based on local constraints, + * such as some register used by other subsystems + */ + ret = pfm_regdesc_init(&cfg->full_regs, cfg); + if (ret) + return ret; + + if (!cfg->version) + cfg->version = "0.0"; + + pfm_pmu_conf = cfg; + pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) -1; + + PFM_INFO("%s PMU detected, %u PMCs, %u PMDs, %u counters (%u bits)", + pfm_pmu_conf->pmu_name, + pfm_pmu_conf->full_regs.num_pmcs, + pfm_pmu_conf->full_regs.num_pmds, + pfm_pmu_conf->full_regs.num_counters, + pfm_pmu_conf->counter_width); + + return 0; +} + +int pfm_pmu_register(struct pfm_pmu_config *cfg) +{ + u16 i, nspec, nspec_ro, num_pmcs, num_pmds, num_wc = 0; + int type, ret = -EBUSY; + + if (perfmon_disabled) { + PFM_INFO("perfmon disabled, cannot add PMU description"); + return -ENOSYS; + } + + nspec = nspec_ro = num_pmds = num_pmcs = 0; + + /* some sanity checks */ + if (cfg == NULL || cfg->pmu_name == NULL) { + PFM_INFO("PMU config descriptor is invalid"); + return -EINVAL; + } + + /* must have a probe */ + if (cfg->probe_pmu == NULL) { + PFM_INFO("PMU config has no probe routine"); + return -EINVAL; + } + + /* + * execute probe routine before anything else as it + * may update configuration tables + */ + if ((*cfg->probe_pmu)() == -1) { + PFM_INFO("%s PMU detection failed", cfg->pmu_name); + return -EINVAL; + } + + if (!(cfg->flags & PFM_PMUFL_IS_BUILTIN) && cfg->owner == NULL) { + PFM_INFO("PMU config %s is missing owner", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmd_entries) { + PFM_INFO("%s needs to define num_pmd_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmc_entries) { + PFM_INFO("%s needs to define num_pmc_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->counter_width) { + PFM_INFO("PMU config %s, zero width counters", cfg->pmu_name); + return -EINVAL; + } + + /* + * REG_RO, REG_V not supported on 
PMC registers + */ + for (i = 0; i < cfg->num_pmc_entries; i++) { + + type = cfg->pmc_desc[i].type; + + if (type & PFM_REG_I) + num_pmcs++; + + if (type & PFM_REG_WC) + num_wc++; + + if (type & PFM_REG_V) { + PFM_INFO("PFM_REG_V is not supported on " + "PMCs (PMC%d)", i); + return -EINVAL; + } + if (type & PFM_REG_RO) { + PFM_INFO("PFM_REG_RO meaningless on " + "PMCs (PMC%u)", i); + return -EINVAL; + } + } + + if (num_wc && cfg->pmc_write_check == NULL) { + PFM_INFO("PMC have write-checker but no callback provided\n"); + return -EINVAL; + } + + /* + * check virtual PMD registers + */ + num_wc= 0; + for (i = 0; i < cfg->num_pmd_entries; i++) { + + type = cfg->pmd_desc[i].type; + + if (type & PFM_REG_I) + num_pmds++; + + if (type & PFM_REG_V) { + nspec++; + if (type & PFM_REG_RO) + nspec_ro++; + } + + if (type & PFM_REG_WC) + num_wc++; + + /* + * only up to HW_PMD can overflow, SW PMD cannot + */ + if (type & PFM_REG_C64 && i >= PFM_ARCH_MAX_HW_PMDS) { + PFM_INFO("overflowing PMD counters must be < %d", + PFM_ARCH_MAX_HW_PMDS); + return -EINVAL; + } + } + + if (num_wc && cfg->pmd_write_check == NULL) { + PFM_INFO("PMD have write-checker but no callback provided\n"); + return -EINVAL; + } + + if (nspec && cfg->pmd_sread == NULL) { + PFM_INFO("PMU config is missing pmd_sread()"); + return -EINVAL; + } + + nspec = nspec - nspec_ro; + if (nspec && cfg->pmd_swrite == NULL) { + PFM_INFO("PMU config is missing pmd_swrite()"); + return -EINVAL; + } + + if (num_pmcs >= PFM_MAX_PMCS) { + PFM_INFO("%s PMCS registers exceed name space [0-%u]", + cfg->pmu_name, + PFM_MAX_PMCS); + return -EINVAL; + } + + spin_lock(&pfm_pmu_conf_lock); + + if (pfm_pmu_conf) + goto unlock; + + ret = pfm_pmu_config_init(cfg); + if (ret) + goto unlock; + + ret = pfm_arch_pmu_config_init(pfm_pmu_conf); + if (ret) + goto unlock; + + ret = pfm_sysfs_add_pmu(pfm_pmu_conf); + if (ret) { + pfm_arch_pmu_config_remove(); + pfm_pmu_conf = NULL; + } + +unlock: + spin_unlock(&pfm_pmu_conf_lock); + + if (ret) { + PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret); + } else { + PFM_INFO("%s PMU installed", cfg->pmu_name); + /* + * (re)initialize PMU on each PMU now that we have a description + */ + on_each_cpu(__pfm_init_percpu, cfg, 0, 0); + } + return ret; +} +EXPORT_SYMBOL(pfm_pmu_register); + +/* + * remove PMU description. Caller must pass address of current + * configuration. This is mostly for sanity checking as only + * one config can exist at any time. + * + * We are using the module refcount mechanism to protect against + * removal while the configuration is being used. As long as there is + * one context, a PMU configuration cannot be removed. The protection is + * managed in module logic. 
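+ *
+ * The pairing that provides this protection looks like (caller-side
+ * sketch; the actual call sites are in the context code elsewhere in
+ * this patch):
+ *
+ *   pfm_pmu_conf_get(autoload);  /* try_module_get() on the description  */
+ *   ... context uses pfm_pmu_conf ...
+ *   pfm_pmu_conf_put();          /* module_put(), unregister possible    */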
+ */ +void pfm_pmu_unregister(struct pfm_pmu_config *cfg) +{ + if (!(cfg ||pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + + BUG_ON(module_refcount(pfm_pmu_conf->owner)); + + if (cfg->owner == pfm_pmu_conf->owner) { + pfm_arch_pmu_config_remove(); + pfm_sysfs_remove_pmu(pfm_pmu_conf); + pfm_pmu_conf = NULL; + } + + spin_unlock(&pfm_pmu_conf_lock); +} +EXPORT_SYMBOL(pfm_pmu_unregister); + +static int pfm_pmu_request_module(void) +{ + char *mod_name; + int ret; + + mod_name = pfm_arch_get_pmu_module_name(); + if (mod_name == NULL) + return -ENOSYS; + + ret = request_module(mod_name); + + PFM_DBG("mod=%s ret=%d\n", mod_name, ret); + return ret; +} + +/* + * autoload: + * 0 : do not try to autoload the PMU description module + * not 0 : try to autoload the PMU description module + */ +int pfm_pmu_conf_get(int autoload) +{ + int ret; + + spin_lock(&pfm_pmu_conf_lock); + + if (request_mod_in_progress) { + ret = -ENOSYS; + goto skip; + } + + if (autoload && pfm_pmu_conf == NULL) { + + request_mod_in_progress = 1; + + spin_unlock(&pfm_pmu_conf_lock); + + pfm_pmu_request_module(); + + spin_lock(&pfm_pmu_conf_lock); + + request_mod_in_progress = 0; + + /* + * request_module() may succeed but the module + * may not have registered properly so we need + * to check + */ + } + + ret = pfm_pmu_conf == NULL ? -ENOSYS : 0; + if (!ret && pmu_is_module(pfm_pmu_conf) + && !try_module_get(pfm_pmu_conf->owner)) + ret = -ENOSYS; +skip: + spin_unlock(&pfm_pmu_conf_lock); + + return ret; +} + +void pfm_pmu_conf_put(void) +{ + if (pfm_pmu_conf == NULL || !pmu_is_module(pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + module_put(pfm_pmu_conf->owner); + spin_unlock(&pfm_pmu_conf_lock); +} + +static __cacheline_aligned_in_smp atomic_t pfm_pmu_acquired; + +int pfm_pmu_acquire(void) +{ + int ret; + + PFM_DBG("before pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + + if (atomic_inc_return(&pfm_pmu_acquired) > 1) + return 0; + + PFM_DBG("after pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + /* + * copy full description and then check if arch-specific + * layer needs some adjustments + */ + pfm_pmu_conf->regs = pfm_pmu_conf->full_regs; + + ret = pfm_arch_pmu_acquire(); + if (ret) { + atomic_dec(&pfm_pmu_acquired); + return ret; + } + + /* + * calculate new limits (num, max) + */ + pfm_pmu_regdesc_calc_limits(&pfm_pmu_conf->regs); + + /* available PMU ressources */ + PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters", + pfm_pmu_conf->regs.num_pmcs, + pfm_pmu_conf->regs.num_pmds, + pfm_pmu_conf->regs.num_counters); + + if (!(pfm_pmu_conf->regs.num_pmcs && pfm_pmu_conf->regs.num_pmcs)) { + PFM_DBG("no usable PMU registers"); + pfm_arch_pmu_release(); + atomic_dec(&pfm_pmu_acquired); + return -EBUSY; + } + return 0; +} + +void pfm_pmu_release(void) +{ + PFM_DBG("pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + + if (!atomic_dec_and_test(&pfm_pmu_acquired)) + return; + + pfm_arch_pmu_release(); + memset(&pfm_pmu_conf->regs, 0, sizeof(pfm_pmu_conf->regs)); + PFM_DBG("PMU released"); +} Index: linux-2.6/perfmon/perfmon_res.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_res.c @@ -0,0 +1,419 @@ +/* + * perfmon_res.c: perfmon2 resource allocations + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. 
+ * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * global information about all sessions + * mostly used to synchronize between system wide and per-process + */ +struct pfm_sessions { + u32 pfs_task_sessions;/* #num loaded per-thread sessions */ + size_t pfs_smpl_buffer_mem_cur; /* current smpl buf mem usage */ + cpumask_t pfs_sys_cpumask; /* bitmask of used cpus (system-wide) */ +}; + +static struct pfm_sessions pfm_sessions; + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_sessions_lock); + +/* + * sampling buffer allocated by perfmon must be + * checked against max usage thresholds for security + * reasons. + * + * The first level check is against the system wide limit + * as indicated by the system administrator in /proc/sys/kernel/perfmon + * + * The second level check is on a per-process basis using + * RLIMIT_MEMLOCK limit. + * + * Operating on the current task only. + */ +int pfm_reserve_buf_space(size_t size) +{ + struct mm_struct *mm; + unsigned long locked; + unsigned long buf_mem, buf_mem_max; + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + /* + * check against global buffer limit + */ + buf_mem_max = pfm_controls.smpl_buffer_mem_max; + buf_mem = pfm_sessions.pfs_smpl_buffer_mem_cur + size; + + if (buf_mem <= buf_mem_max) { + pfm_sessions.pfs_smpl_buffer_mem_cur = buf_mem; + + PFM_DBG("buf_mem_max=%lu current_buf_mem=%lu", + buf_mem_max, + buf_mem); + } + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + if (buf_mem > buf_mem_max) { + PFM_DBG("smpl buffer memory threshold reached"); + return -ENOMEM; + } + + /* + * check against RLIMIT_MEMLOCK + */ + mm = get_task_mm(current); + + down_write(&mm->mmap_sem); + + locked = mm->locked_vm << PAGE_SHIFT; + locked += size; + + if (locked > current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) { + + PFM_DBG("RLIMIT_MEMLOCK reached ask_locked=%lu rlim_cur=%lu", + locked, + current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur); + + up_write(&mm->mmap_sem); + mmput(mm); + goto unres; + } + + mm->locked_vm = locked >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + + mmput(mm); + + return 0; + +unres: + /* + * remove global buffer memory allocation + */ + spin_lock_irqsave(&pfm_sessions_lock, flags); + + pfm_sessions.pfs_smpl_buffer_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return -ENOMEM; +} +/* + *There exist multiple paths leading to this function. 
We need to + * be very careful withlokcing on the mmap_sem as it may already be + * held by the time we come here. + * The following paths exist: + * + * exit path: + * sys_exit_group + * do_group_exit + * do_exit + * exit_mm + * mmput + * exit_mmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * munmap path: + * sys_munmap + * do_munmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * close path: + * sys_close + * filp_close + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * The issue is that on the munmap() path, the mmap_sem is already held + * in write-mode by the time we come here. To avoid the deadlock, we need + * to know where we are coming from and skip down_write(). If is fairly + * difficult to know this because of the lack of good hooks and + * the fact that, there may not have been any mmap() of the sampling buffer + * (i.e. create_context() followed by close() or exit()). + * + * We use a set flag ctx->flags.mmap_nlock which is toggle in the vm_ops + * callback in remove_vma() which is called systematically for the call, so + * on all but the pure close() path. The exit path does not already hold + * the lock but this is exit so there is no task->mm by the time we come here. + * + * The mmap_nlock is set only when unmapping and this is the LAST reference + * to the file (i.e., close() followed by munmap()). + */ +void pfm_release_buf_space(struct pfm_context *ctx, size_t size) +{ + unsigned long flags; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (mm) { + if (ctx->flags.mmap_nlock == 0) { + PFM_DBG("doing down_write"); + down_write(&mm->mmap_sem); + } + + mm->locked_vm -= size >> PAGE_SHIFT; + + PFM_DBG("locked_vm=%lu size=%zu", mm->locked_vm, size); + + if (ctx->flags.mmap_nlock == 0) + up_write(&mm->mmap_sem); + + mmput(mm); + } + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + pfm_sessions.pfs_smpl_buffer_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); +} + +int pfm_reserve_session(int is_system, u32 cpu) +{ + unsigned long flags; + u32 nsys_cpus; + int ret = 0; + + /* + * validy checks on cpu_mask have been done upstream + */ + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions, + is_system, + cpu); + + if (is_system) { + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_sessions.pfs_task_sessions > 0) { + PFM_DBG("%u conflicting task_sessions", + pfm_sessions.pfs_task_sessions); + ret = -EBUSY; + goto abort; + } + + if (cpu_isset(cpu, pfm_sessions.pfs_sys_cpumask)) { + PFM_DBG("conflicting session on CPU%u", cpu); + ret = -EBUSY; + goto abort; + } + + PFM_DBG("reserved session on CPU%u", cpu); + + cpu_set(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus++; + } else { + if (nsys_cpus) { + ret = -EBUSY; + goto abort; + } + pfm_sessions.pfs_task_sessions++; + } + + PFM_DBG("out sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions, + is_system, + cpu); + +abort: + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return ret; +} + +/* + * called from __pfm_unload_context() + */ +int pfm_release_session(int is_system, u32 cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + PFM_DBG("in sys_sessions=%u task_sessions=%u 
syswide=%d cpu=%u", + cpus_weight(pfm_sessions.pfs_sys_cpumask), + pfm_sessions.pfs_task_sessions, + is_system, cpu); + + if (is_system) + cpu_clear(cpu, pfm_sessions.pfs_sys_cpumask); + else + pfm_sessions.pfs_task_sessions--; + + PFM_DBG("out sys_sessions=%u task_sessions=%u syswide=%d cpu=%u", + cpus_weight(pfm_sessions.pfs_sys_cpumask), + pfm_sessions.pfs_task_sessions, + is_system, cpu); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + return 0; +} + +int pfm_reserve_allcpus(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + if (nsys_cpus) { + PFM_DBG("already some system-wide sessions"); + goto abort; + } + + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_sessions.pfs_task_sessions) { + PFM_DBG("%u conflicting task_sessions", + pfm_sessions.pfs_task_sessions); + goto abort; + } + + for_each_online_cpu(cpu) { + cpu_set(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus++; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return 0; + +abort: + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return -EBUSY; +} +EXPORT_SYMBOL(pfm_reserve_allcpus); + +int pfm_release_allcpus(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + /* + * XXX: could use __cpus_clear() with nbits + */ + for_each_online_cpu(cpu) { + cpu_clear(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus--; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return 0; +} +EXPORT_SYMBOL(pfm_release_allcpus); + +/* + * called from perfmon_sysfs.c: + * what=0 : pfs_task_sessions + * what=1 : cpus_weight(pfs_sys_cpumask) + * what=2 : smpl_buffer_mem_cur + * what=3 : pmu model name + * + * return number of bytes written into buf (up to sz) + */ +ssize_t pfm_sysfs_session_show(char *buf, size_t sz, int what) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + switch (what) { + case 0: snprintf(buf, sz, "%u\n", pfm_sessions.pfs_task_sessions); + break; + case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_sessions.pfs_sys_cpumask)); + break; + case 2: snprintf(buf, sz, "%zu\n", pfm_sessions.pfs_smpl_buffer_mem_cur); + break; + case 3: + snprintf(buf, sz, "%s\n", + pfm_pmu_conf ? pfm_pmu_conf->pmu_name + : "unknown\n"); + } + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + return strlen(buf); +} Index: linux-2.6/perfmon/perfmon_rw.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_rw.c @@ -0,0 +1,602 @@ +/* + * perfmon.c: perfmon2 PMC/PMD read/write system calls + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. 
+ * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +#define PFM_REGFL_PMC_ALL (PFM_REGFL_NO_EMUL64|PFM_REG_RETFL_MASK) +#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM | \ + PFM_REGFL_OVFL_NOTIFY| \ + PFM_REG_RETFL_MASK) +/* + * function called from sys_pfm_write_pmds() to write the + * requested PMD registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * compat: is used only on IA-64 to maintain backward compatibility with v2.0 + * + * The context is locked and interrupts are disabled. + */ +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat) +{ + struct pfm_event_set *set, *active_set; + u64 value, ovfl_mask; + u64 *smpl_pmds, *reset_pmds, *impl_pmds, *impl_rw_pmds; + u32 req_flags, flags; + u16 cnum, pmd_type, max_pmd, max_pmc; + u16 set_id, prev_set_id; + int i, can_access_pmu; + int is_counter; + int ret, error_code; + pfm_pmd_check_t wr_func; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + active_set = ctx->active_set; + max_pmd = pfm_pmu_conf->regs.max_pmd; + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmds = pfm_pmu_conf->regs.pmds; + impl_rw_pmds = pfm_pmu_conf->regs.rw_pmds; + wr_func = pfm_pmu_conf->pmd_write_check; + set = NULL; + + prev_set_id = 0; + can_access_pmu = 0; + + /* + * we cannot access the actual PMD registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + error_code = PFM_REG_RETFL_EINVAL; + ret = -EINVAL; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + req_flags = req->reg_flags; + smpl_pmds = req->reg_smpl_pmds; + reset_pmds = req->reg_reset_pmds; + flags = 0; + + if (unlikely(cnum >= max_pmd || !test_bit(cnum, cast_ulp(impl_pmds)))) { + PFM_DBG("pmd%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + is_counter = pmd_type & PFM_REG_C64; + + if (likely(!compat && is_counter)) { + /* + * ensure only valid flags are set + */ + if (req_flags & ~(PFM_REGFL_PMD_ALL)) { + PFM_DBG("pmd%u: invalid flags=0x%x", + cnum, req_flags); + goto error; + } + + if (req_flags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + if (req_flags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + /* + * verify validity of smpl_pmds + */ + if (unlikely(!bitmap_subset(cast_ulp(smpl_pmds), + cast_ulp(impl_pmds), + max_pmd))) { + PFM_DBG("invalid smpl_pmds=0x%llx " + "for pmd%u", + (unsigned long long)smpl_pmds[0], + cnum); + goto error; + } + /* 
+ * verify validity of reset_pmds + * check against impl_rw_pmds because it is not + * possible to reset read-only PMDs + */ + if (unlikely(!bitmap_subset(cast_ulp(reset_pmds), + cast_ulp(impl_rw_pmds), + max_pmd))) { + PFM_DBG("invalid reset_pmds=0x%llx " + "for pmd%u", + (unsigned long long)reset_pmds[0], + cnum); + goto error; + } + + } + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + + /* + * execute write checker, if any + */ + if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) { + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + + } + + value = req->reg_value; + + /* + * now commit changes to software state + */ + + if (likely(is_counter)) { + if (likely(!compat)) { + set->pmds[cnum].flags = flags; + + /* + * copy reset and sampling bitvectors + */ + bitmap_copy(cast_ulp(set->pmds[cnum].reset_pmds), + cast_ulp(reset_pmds), + max_pmd); + + bitmap_copy(cast_ulp(set->pmds[cnum].smpl_pmds), + cast_ulp(smpl_pmds), + max_pmd); + + set->pmds[cnum].eventid = req->reg_smpl_eventid; + + /* + * Mark reset/smpl PMDS as used. + * + * We do not keep track of PMC because we have to + * systematically restore ALL of them. + */ + bitmap_or(cast_ulp(set->used_pmds), + cast_ulp(set->used_pmds), + cast_ulp(reset_pmds), max_pmd); + + bitmap_or(cast_ulp(set->used_pmds), + cast_ulp(set->used_pmds), + cast_ulp(smpl_pmds), max_pmd); + + /* + * we reprogrammed the PMD hence, clear any pending + * ovfl, switch based on the old value + * for restart we have already established new values + */ + if (test_bit(cnum, cast_ulp(set->povfl_pmds))) { + set->npend_ovfls--; + __clear_bit(cnum, cast_ulp(set->povfl_pmds)); + } + __clear_bit(cnum, cast_ulp(set->ovfl_pmds)); + + /* + * update ovfl_notify + */ + if (flags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, cast_ulp(set->ovfl_notify)); + else + __clear_bit(cnum, cast_ulp(set->ovfl_notify)); + } + /* + * reset last value to new value + */ + set->pmds[cnum].lval = value; + + /* + * establish new switch count + */ + set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt; + set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt; + } + + /* + * update reset values (not just for counters) + */ + set->pmds[cnum].long_reset = req->reg_long_reset; + set->pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization mask + */ + set->pmds[cnum].mask = req->reg_random_mask; + + /* + * update set values + */ + set->pmds[cnum].value = value; + + __set_bit(cnum, cast_ulp(set->used_pmds)); + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS; + if (can_access_pmu) + pfm_write_pmd(ctx, cnum, value); + } + + /* + * update number of used PMD registers + */ + set->nused_pmds = bitmap_weight(cast_ulp(set->used_pmds), max_pmd); + + pfm_retflag_set(req->reg_flags, 0); + + prev_set_id = set_id; + + PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d " + "ctx_pmd=0x%llx s_reset=0x%llx " + "l_reset=0x%llx u_pmds=0x%llx nu_pmds=%u " + "s_pmds=0x%llx r_pmds=0x%llx o_pmds=0x%llx " + "o_thres=%llu compat=%d eventid=%llx", + set->id, + cnum, + (unsigned long long)value, + set->pmds[cnum].flags, + can_access_pmu, + (unsigned long long)set->pmds[cnum].value, + (unsigned long long)set->pmds[cnum].short_reset, + (unsigned long long)set->pmds[cnum].long_reset, + (unsigned long long)set->used_pmds[0], + set->nused_pmds, + (unsigned long 
long)set->pmds[cnum].smpl_pmds[0], + (unsigned long long)set->pmds[cnum].reset_pmds[0], + (unsigned long long)set->ovfl_pmds[0], + (unsigned long long)set->pmds[cnum].ovflsw_thres, + compat, + (unsigned long long)set->pmds[cnum].eventid); + } + + /* + * make changes visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return 0; + +error: + /* + * for now, we have only one possibility for error + */ + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmd%u error=%d", set_id, cnum, error_code); + return ret; +} + +/* + * function called from sys_pfm_write_pmcs() to write the + * requested PMC registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * The context is locked and interrupts are disabled. + */ +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count) +{ + struct pfm_event_set *set, *active_set; + u64 value, dfl_val, rsvd_msk; + u64 *impl_pmcs; + int i, can_access_pmu; + int ret, error_code; + u16 set_id, prev_set_id; + u16 cnum, pmc_type, max_pmc; + u32 flags; + pfm_pmc_check_t wr_func; + + active_set = ctx->active_set; + + wr_func = pfm_pmu_conf->pmc_write_check; + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + set = NULL; + prev_set_id = 0; + can_access_pmu = 0; + + /* + * we cannot access the actual PMC registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + error_code = PFM_REG_RETFL_EINVAL; + + for (i = 0; i < count; i++, req++) { + + ret = -EINVAL; + cnum = req->reg_num; + set_id = req->reg_set; + value = req->reg_value; + flags = req->reg_flags; + + /* + * no access to unimplemented PMC register + */ + if (unlikely(cnum >= max_pmc + || !test_bit(cnum, cast_ulp(impl_pmcs)))) { + PFM_DBG("pmc%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmc_type = pfm_pmu_conf->pmc_desc[cnum].type; + dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk; + + /* + * ensure only valid flags are set + */ + if (flags & ~PFM_REGFL_PMC_ALL) { + PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags); + goto error; + } + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + + /* + * set reserved bits to default values + * (reserved bits must be 1 in rsvd_msk) + */ + value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk); + + if (flags & PFM_REGFL_NO_EMUL64) { + if (!(pmc_type & PFM_REG_NO64)) { + PFM_DBG("pmc%u no support for " + "PFM_REGFL_NO_EMUL64", cnum); + goto error; + } + value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk; + } + + /* + * execute write checker, if any + */ + if (likely(wr_func && (pmc_type & PFM_REG_WC))) { + req->reg_value = value; + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + value = req->reg_value; + } + + /* + * Now we commit the changes + */ + + /* + * mark PMC register as used + * We do not track associated PMC register based on + * the fact that they will likely need to be written + * in order to become useful at which point the statement + * below will catch that. 
+ * + * The used_pmcs bitmask is only useful on architectures where + * the PMC needs to be modified for particular bits, especially + * on overflow or to stop/start. + */ + if (!test_bit(cnum, cast_ulp(set->used_pmcs))) { + __set_bit(cnum, cast_ulp(set->used_pmcs)); + set->nused_pmcs++; + } + + set->pmcs[cnum] = value; + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + if (can_access_pmu) + pfm_arch_write_pmc(ctx, cnum, value); + } + + pfm_retflag_set(req->reg_flags, 0); + + prev_set_id = set_id; + + PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d " + "u_pmcs=0x%llx nu_pmcs=%u", + set->id, + cnum, + (unsigned long long)value, + can_access_pmu, + (unsigned long long)set->used_pmcs[0], + set->nused_pmcs); + } + /* + * make sure the changes are visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return 0; +error: + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmc%u error=0x%08x", set_id, cnum, error_code); + return ret; +} + +/* + * function called from sys_pfm_read_pmds() to read the 64-bit value of + * requested PMD registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * The context is locked and interrupts are disabled. + */ +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count) +{ + u64 val = 0, lval, ovfl_mask, hw_val; + u64 sw_cnt; + u64 *impl_pmds; + struct pfm_event_set *set, *active_set; + int i, can_access_pmu = 0; + int error_code; + u16 cnum, pmd_type, set_id, prev_set_id, max_pmd; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = pfm_pmu_conf->regs.pmds; + max_pmd = pfm_pmu_conf->regs.max_pmd; + active_set = ctx->active_set; + set = NULL; + prev_set_id = 0; + + if (likely(ctx->state == PFM_CTX_LOADED)) { + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + if (can_access_pmu) + pfm_arch_serialize(); + } + error_code = PFM_REG_RETFL_EINVAL; + + /* + * on both UP and SMP, we can only read the PMD from the hardware + * register when the task is the owner of the local PMU. + */ + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + + if (unlikely(cnum >= max_pmd + || !test_bit(cnum, cast_ulp(impl_pmds)))) { + PFM_DBG("pmd%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + /* + * it is not possible to read a PMD which was not requested: + * - explicitly written via pfm_write_pmds() + * - provided as a reg_smpl_pmds[] to another PMD during + * pfm_write_pmds() + * + * This is motivated by security and for optimization purposes: + * - on context switch restore, we can restore only what we + * use (except when regs directly readable at user level, + * e.g., IA-64 self-monitoring, I386 RDPMC). 
+ * - do not need to maintain PMC -> PMD dependencies + */ + if (unlikely(!test_bit(cnum, cast_ulp(set->used_pmds)))) { + PFM_DBG("pmd%u cannot be read, because never " + "requested", cnum); + goto error; + } + + val = set->pmds[cnum].value; + lval = set->pmds[cnum].lval; + + /* + * extract remaining ovfl to switch + */ + sw_cnt = set->pmds[cnum].ovflsw_thres; + + /* + * If the task is not the current one, then we check if the + * PMU state is still in the local live register due to lazy + * ctxsw. If true, then we read directly from the registers. + */ + if (set == active_set && can_access_pmu) { + hw_val = pfm_read_pmd(ctx, cnum); + if (pmd_type & PFM_REG_C64) + val = (val & ~ovfl_mask) | (hw_val & ovfl_mask); + else + val = hw_val; + } + + PFM_DBG("set%u pmd%u=0x%llx sw_thr=%llu lval=0x%llx", + set->id, + cnum, + (unsigned long long)val, + (unsigned long long)sw_cnt, + (unsigned long long)lval); + + pfm_retflag_set(req->reg_flags, 0); + + req->reg_value = val; + req->reg_last_reset_val = lval; + req->reg_ovfl_switch_cnt = sw_cnt; + + prev_set_id = set_id; + } + return 0; +error: + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmd%u error=%d", set_id, cnum, error_code); + return -EINVAL; +} Index: linux-2.6/perfmon/perfmon_sets.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_sets.c @@ -0,0 +1,833 @@ +/* + * perfmon_sets.c: perfmon2 event sets and multiplexing functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +static struct kmem_cache *pfm_set_cachep; + +/* + * reload reference overflow switch thresholds + */ +static void pfm_reload_switch_thresholds(struct pfm_event_set *set) +{ + u64 *mask; + u16 i, max_cnt_pmd, first_cnt_pmd; + + mask = set->used_pmds; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + for (i = first_cnt_pmd; i< max_cnt_pmd; i++) { + if (test_bit(i, cast_ulp(mask))) { + set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres; + PFM_DBG("set%u pmd%u ovflsw_thres=%llu", + set->id, + i, + (unsigned long long)set->pmds[i].ovflsw_thres); + } + } +} + +/* + * ensures that all id_next sets exists such that the round-robin + * will work correctly, i.e., next dangling references. + */ +int pfm_prepare_sets(struct pfm_context *ctx, struct pfm_event_set *act_set) +{ + struct pfm_event_set *set1, *set2; + u16 max_cnt_pmd; +#define is_last_set(s, c) ((s)->list.next == &(c)->list) + + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + list_for_each_entry(set1, &ctx->list, list) { + + if (is_last_set(set1, ctx)) + set2 = list_entry(ctx->list.next, + struct pfm_event_set, list); + else + set2 = list_entry(set1->list.next, + struct pfm_event_set, list); + /* + * update field used during actual switching + */ + set1->sw_next = set2; + + PFM_DBG("set%u sw_next=%u", set1->id, set2->id); + + /* + * cleanup bitvectors + */ + bitmap_zero(cast_ulp(set1->ovfl_pmds), max_cnt_pmd); + bitmap_zero(cast_ulp(set1->povfl_pmds), max_cnt_pmd); + set1->npend_ovfls = 0; + /* + * we cannot just use plain clear because of arch-specific flags + */ + set1->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH); + + /* + * reset activation and elapsed ns + */ + set1->duration = 0; + + set1->runs = 0; + } + /* + * setting PFM_CPUINFO_TIME_SWITCH, triggers + * further checking if __pfm_handle_switch_timeout(). + * switch timeout is effectively decremented only when + * monitoring has been activated via pfm_start() or + * any user level equivalent. + */ + if (act_set->flags & PFM_SETFL_TIME_SWITCH) { + + act_set->timeout_sw_left = act_set->timeout_sw_ref; + PFM_DBG("arming timeout for set%u", act_set->id); + + if (ctx->flags.system) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + } else if (act_set->flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(act_set); + + return 0; +} + +/* + * called from *_timer_interrupt(). task == current + */ +void __pfm_handle_switch_timeout(void) +{ + struct pfm_event_set *set; + struct pfm_context *ctx; + unsigned long flags; + + /* + * The timer tick check is operating on each + * CPU. Not all CPUs have time switching enabled + * hence we need to check. + */ + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + set = ctx->active_set; + BUG_ON(set == NULL); + + /* + * we decrement only when attached and not masked or zombie + */ + if (ctx->state != PFM_CTX_LOADED) + goto done; + + /* + * do not decrement timeout unless monitoring is active. 
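+ *
+ * Example (HZ = 250 is only an assumed value for illustration): with
+ * timeout_sw_ref = 25, timeout_sw_left reaches zero after 25 timer ticks,
+ * i.e. the active set is switched via pfm_switch_sets() roughly every
+ * 100ms while monitoring is active.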
+ */ + if (!pfm_arch_is_active(ctx)) + goto done; + + set->timeout_sw_left--; + + __get_cpu_var(pfm_stats).handle_timeout_count++; + + if (!set->timeout_sw_left) + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0); +done: + spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * + * always operating on the current task + * interrupts are masked + * + * input: + * - new_set: new set to switch to, if NULL follow normal chain + */ +void pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart) +{ + struct pfm_event_set *set; + u64 switch_count; + u64 now, end; + unsigned long info = 0; + u32 new_flags; + int is_system, state, is_active, nn; + + now = sched_clock(); + set = ctx->active_set; + is_active = pfm_arch_is_active(ctx); + + /* + * if no set is explicitly requested, + * use the set_switch_next field + */ + if (new_set == NULL) { + /* + * we use round-robin unless the user specified + * a particular set to go to. + */ + new_set = set->sw_next; + BUG_ON(new_set == NULL); + } + + PFM_DBG("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx", + ctx->state, + is_active, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + reset_mode, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + state = ctx->state; + new_flags = new_set->flags; + switch_count = __get_cpu_var(pfm_stats).set_switch_count; + + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + + if (is_active) { + pfm_arch_stop(current, ctx, set); + + pfm_save_pmds(ctx, set); + + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + /* + * must be done after pfm_arch_stop() + */ + if (is_system) + info = __get_cpu_var(pfm_syst_info); + } + pfm_arch_restore_pmds(ctx, new_set); + + /* + * if masked, we must restore the pmcs such that they + * do not capture anything. + */ + pfm_arch_restore_pmcs(ctx, new_set); + + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + +skip_same_set: + switch_count++; + new_set->runs++; + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(new_set); + else if (new_flags & PFM_SETFL_TIME_SWITCH) + new_set->timeout_sw_left = new_set->timeout_sw_ref; + + /* + * reset overflowed PMD registers + */ + nn = bitmap_weight(cast_ulp(new_set->reset_pmds), + pfm_pmu_conf->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, new_set, nn, reset_mode); + /* + * this is needed when coming from pfm_start() + */ + if (no_restart) + goto skip_restart; + + /* + * reactivate monitoring + */ + if (is_system) { + info &= ~PFM_CPUINFO_TIME_SWITCH; + + if (new_flags & PFM_SETFL_TIME_SWITCH) + info |= PFM_CPUINFO_TIME_SWITCH; + + __get_cpu_var(pfm_syst_info) = info; + + PFM_DBG("new_set=%u info=0x%lx flags=0x%x", + new_set->id, + info, + new_flags); + } + + if (is_active) { + pfm_arch_start(current, ctx, new_set); + new_set->duration_start = now; + } + +skip_restart: + ctx->active_set = new_set; + ctx->set_all_runs++; + + end = sched_clock(); + + __get_cpu_var(pfm_stats).set_switch_count = switch_count; + __get_cpu_var(pfm_stats).set_switch_ns += end - now; +} + +/* + * called from __pfm_overflow_handler() to switch event sets. 
+ * monitoring is stopped, task is current, interrupts are masked. + * compared to pfm_switch_sets(), this version is simplified because + * it know about the call path. There is no need to stop monitoring + * because it is already frozen by PMU handler. + */ +void pfm_switch_sets_from_intr(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *new_set; + u64 switch_count; + u64 now, end; + u32 new_flags; + unsigned long info = 0; + int is_system, state, n; + + now = sched_clock(); + set = ctx->active_set; + new_set = set->sw_next; + + PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d new_r_pmds=%llx", + ctx->state, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + state = ctx->state; + new_flags = new_set->flags; + switch_count = __get_cpu_var(pfm_stats).set_switch_count; + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + /* + * when called from PMU intr handler, monitoring + * is already stopped + * + * save current PMD registers, we use a special + * form for performance reason. On some architectures, + * such as x86, the pmds are already saved when entering + * the PMU interrupt handler via pfm-arch_intr_freeze() + * so we don't need to save them again. On the contrary, + * on IA-64, they are not saved by freeze, thus we have to + * to it here. + */ + pfm_arch_save_pmds_from_intr(ctx, set); + + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + + if (is_system) + info = __get_cpu_var(pfm_syst_info); + + pfm_arch_restore_pmds(ctx, new_set); + __get_cpu_var(pfm_stats).ccnt6 += sched_clock() - now; + /* + * must not be restored active as we are still executing in the + * PMU interrupt handler. activation is deferred to unfreeze PMU + */ + pfm_arch_restore_pmcs(ctx, new_set); + + /* + * check for pending interrupt on incoming set. 
+ * interrupts are masked so handler call deferred + */ + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + /* + * no need to restore anything, that is already done + */ + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + /* + * reset duration counter + */ + new_set->duration_start = now; + +skip_same_set: + switch_count++; + new_set->runs++; + + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(new_set); + else if (new_flags & PFM_SETFL_TIME_SWITCH) + new_set->timeout_sw_left = new_set->timeout_sw_ref; + + /* + * reset overflowed PMD registers + */ + n = bitmap_weight(cast_ulp(new_set->reset_pmds), pfm_pmu_conf->regs.max_pmd); + if (n) + pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT); + + /* + * reactivate monitoring + */ + if (is_system) { + info &= ~PFM_CPUINFO_TIME_SWITCH; + + if (new_flags & PFM_SETFL_TIME_SWITCH) + info |= PFM_CPUINFO_TIME_SWITCH; + + __get_cpu_var(pfm_syst_info) = info; + + PFM_DBG("new_set=%u info=0x%lx flags=0x%x", + new_set->id, + info, + new_flags); + } + + ctx->active_set = new_set; + ctx->set_all_runs++; + + end = sched_clock(); + + __get_cpu_var(pfm_stats).set_switch_count = switch_count; + __get_cpu_var(pfm_stats).set_switch_ns += end - now; +} + + +static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) + int ret; + + ret = pfm_arch_setfl_sane(ctx, flags); + if (ret) + return ret; + + if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) { + PFM_DBG("both switch ovfl and switch time are set"); + return -EINVAL; + } + return 0; +} + +/* + * it is never possible to change the identification of an existing set + */ +static int __pfm_change_evtset(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_setdesc *req) +{ + u32 flags; + u16 set_id; + struct timespec tv; + unsigned long ji; + int ret; + + BUG_ON(ctx->state == PFM_CTX_LOADED); + + set_id = req->set_id; + flags = req->set_flags; + + ret = pfm_setfl_sane(ctx, flags); + if (ret) { + PFM_DBG("invalid flags 0x%x set %u", flags, set_id); + return -EINVAL; + } + + /* + * commit changes + * + * note that we defer checking the validity of set_id_next until the + * context is actually attached. 
This is the only moment where we can + * safely assess the sanity of the sets because sets cannot be changed + * or deleted once the context is attached + */ + set->id = set_id; + set->flags = flags; + set->priv_flags = 0; + + /* + * reset pointer to next set + */ + set->sw_next = NULL; + + tv.tv_sec = 0; + tv.tv_nsec = req->set_timeout; + ji = timespec_to_jiffies(&tv); + + /* + * verify that timeout is not 0 + */ + if (!ji && (flags & PFM_SETFL_TIME_SWITCH) != 0) { + PFM_DBG("invalid timeout=0"); + return -EINVAL; + } + + set->timeout_sw_ref = set->timeout_sw_left = ji; + + PFM_DBG("set%u flags=0x%x req_nsec=%llu" + " jiffies=%lu HZ=%u TICK_NSEC=%lu eff_nsec=%lu", + set_id, + flags, + (unsigned long long)req->set_timeout, + ji, + HZ, TICK_NSEC, + ji * TICK_NSEC); + + /* + * return actual timeout in nsecs + */ + jiffies_to_timespec(ji, &tv); + req->set_timeout = tv.tv_sec * 1000000000 + tv.tv_nsec; + + return 0; +} + +/* + * this function does not modify the next field + */ +void pfm_init_evtset(struct pfm_event_set *set) +{ + u64 *impl_pmcs; + u16 i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * install default values for all PMC registers + */ + for (i=0; i < max_pmc; i++) { + if (test_bit(i, cast_ulp(impl_pmcs))) { + set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val; + PFM_DBG("set%u pmc%u=0x%llx", + set->id, + i, + (unsigned long long)set->pmcs[i]); + } + } + + /* + * PMD registers are set to 0 when the event set is allocated, + * hence we do not need to explicitly initialize them. + * + * For virtual PMD registers (i.e., those tied to a SW resource) + * their value becomes meaningful once the context is attached. + */ +} + +/* + * look for an event set using its identification. If the set does not + * exist: + * - if alloc == 0 then return error + * - if alloc == 1 then allocate set + * + * alloc is one ONLY when coming from pfm_create_evtsets() which can only + * be called when the context is detached, i.e. monitoring is stopped. + */ +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, + int alloc) +{ + struct pfm_event_set *set, *new_set, *prev; + + PFM_DBG("looking for set=%u", set_id); + + /* + * shortcut for set0: always exist, cannot be removed + */ + if (!(set_id || alloc)) + return list_entry(ctx->list.next, struct pfm_event_set, list); + + prev = NULL; + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + return set; + if (set->id > set_id) + break; + prev = set; + } + + if (!alloc) + return NULL; + + /* + * when alloc == 1, context is detached, monitoring is stopped and + * interrupts are enabled but context is locked + */ + + /* + * we are holding the context spinlock and interrupts + * are unmasked. We must use GFP_ATOMIC as we cannot + * sleep while holding a spin lock. + */ + new_set = kmem_cache_zalloc(pfm_set_cachep, GFP_ATOMIC); + if (!new_set) + return NULL; + + new_set->id = set_id; + + INIT_LIST_HEAD(&new_set->list); + + if (prev == NULL) { + list_add(&(new_set->list), &ctx->list); + } else { + PFM_DBG("add after set=%u", prev->id); + list_add(&(new_set->list), &prev->list); + } + return new_set; +} + +/* + * context is unloaded for this command. 
Interrupts are enabled + */ +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count) +{ + struct pfm_event_set *set; + u16 set_id; + int i, ret; + + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + PFM_DBG("set_id=%u", set_id); + + set = pfm_find_set(ctx, set_id, 1); + if (set == NULL) + goto error_mem; + + ret = __pfm_change_evtset(ctx, set, req); + if (ret) + goto error_params; + + pfm_init_evtset(set); + } + return 0; +error_mem: + PFM_DBG("cannot allocate set %u", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -ENOMEM; +error_params: + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count) +{ + struct pfm_event_set *set; + int i, is_system, is_loaded; + u16 set_id; + int max_cnt_pmd; + u64 end; + + end = sched_clock(); + is_system = ctx->flags.system; + is_loaded = ctx->state == PFM_CTX_LOADED; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + for (i = 0; i < count; i++, req++) { + + set_id = req->set_id; + + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } +found: + /* + * compute leftover timeout + */ + req->set_flags = set->flags; + req->set_timeout = set->timeout_sw_left * TICK_NSEC; + req->set_runs = set->runs; + req->set_act_duration = set->duration; + + /* + * adjust for active set if needed + */ + if (is_system && is_loaded && ctx->flags.started + && set == ctx->active_set) + req->set_act_duration += end - set->duration_start; + + /* + * copy the list of pmds which last overflowed for + * the set + */ + bitmap_copy(cast_ulp(req->set_ovfl_pmds), + cast_ulp(set->ovfl_pmds), + max_cnt_pmd); + + /* + * copy bitmask of available PMU registers + */ + bitmap_copy(cast_ulp(req->set_avail_pmcs), + cast_ulp(pfm_pmu_conf->regs.pmcs), + pfm_pmu_conf->regs.max_pmc); + + bitmap_copy(cast_ulp(req->set_avail_pmds), + cast_ulp(pfm_pmu_conf->regs.pmds), + pfm_pmu_conf->regs.max_pmd); + + pfm_retflag_set(req->set_flags, 0); + + PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu " + "a_pmcs=0x%llx a_pmds=0x%llx", + set_id, + set->flags, + (unsigned long long)req->set_timeout, + (unsigned long long)set->runs, + (unsigned long long)pfm_pmu_conf->regs.pmcs[0], + (unsigned long long)pfm_pmu_conf->regs.pmds[0]); + } + return 0; +error: + PFM_DBG("set%u not found", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -EINVAL; +} + +/* + * context is unloaded for this command. Interrupts are enabled + */ +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_setdesc *req = arg; + struct pfm_event_set *set; + u16 set_id; + int i; + + /* delete operation only works when context is detached */ + BUG_ON(ctx->state != PFM_CTX_UNLOADED); + + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + /* + * cannot remove set 0 + */ + if (!set_id) + goto error; + + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } + goto error; +found: + /* + * clear active set if necessary. 
+ * will be updated when context is loaded + */ + if (set == ctx->active_set) + ctx->active_set = NULL; + + list_del(&set->list); + + kmem_cache_free(pfm_set_cachep, set); + + pfm_retflag_set(req->set_flags, 0); + + PFM_DBG("set%u deleted", set_id); + } + return 0; +error: + PFM_DBG("set%u not found or invalid", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -EINVAL; +} + +/* + * called from pfm_context_free() to free all sets + */ +void pfm_free_sets(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *tmp; + + list_for_each_entry_safe(set, tmp, &ctx->list, list) { + list_del(&set->list); + kmem_cache_free(pfm_set_cachep, set); + } +} + +int pfm_sets_init(void) +{ + + pfm_set_cachep = kmem_cache_create("pfm_event_set", + sizeof(struct pfm_event_set), + SLAB_HWCACHE_ALIGN, 0, NULL, NULL); + if (pfm_set_cachep == NULL) { + PFM_ERR("cannot initialize event set slab"); + return -ENOMEM; + } + return 0; +} Index: linux-2.6/perfmon/perfmon_syscalls.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_syscalls.c @@ -0,0 +1,1007 @@ +/* + * perfmon_syscalls.c: perfmon2 system call interface + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +/* + * Context locking rules: + * --------------------- + * - any thread with access to the file descriptor of a context can + * potentially issue perfmon calls + * + * - calls must be serialized to guarantee correctness + * + * - as soon as a context is attached to a thread or CPU, it may be + * actively monitoring. On some architectures, such as IA-64, this + * is true even though the pfm_start() call has not been made. This + * comes from the fact that on some architectures, it is possible to + * start/stop monitoring from userland. + * + * - If monitoring is active, then there can PMU interrupts. Because + * context accesses must be serialized, the perfmon system calls + * must mask interrupts as soon as the context is attached. + * + * - perfmon system calls that operate with the context unloaded cannot + * assume it is actually unloaded when they are called. They first need + * to check and for that they need interrupts masked. 
Then if the context + * is actually unloaded, they can unmask interrupts. + * + * - interrupt masking holds true for other internal perfmon functions as + * well. Except for PMU interrupt handler because those interrupts cannot + * be nested. + * + * - we mask ALL interrupts instead of just the PMU interrupt because we + * also need to protect against timer interrupts which could trigger + * a set switch. + */ + +/* + * cannot attach if : + * - kernel task + * - task not owned by caller (checked by ptrace_may_attach()) + * - task is dead or zombie + * - cannot use blocking notification when self-monitoring + */ +static int pfm_task_incompatible(struct pfm_context *ctx, struct task_struct *task) +{ + /* + * cannot attach to a kernel thread + */ + if (!task->mm) { + PFM_DBG("cannot attach to kernel thread [%d]", task->pid); + return -EPERM; + } + + /* + * cannot use block on notification when + * self-monitoring. + */ + if (ctx->flags.block && task == current) { + PFM_DBG("cannot use block on notification when self-monitoring" + "[%d]", task->pid); + return -EINVAL; + } + /* + * cannot attach to a zombie task + */ + if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) { + PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid); + return -EBUSY; + } + return 0; +} + +/* + * This function is used in per-thread mode only AND when not + * self-monitoring. It finds the task to monitor and checks + * that the caller has persmissions to attach. It also checks + * that the task is stopped via ptrace so that we can safely + * modify its state. + * + * task refcount is increment when succesful. + * This function is not declared static because it is used by the + * IA-64 compatiblity module arch/ia64/perfmon/perfmon_comapt.c + */ +int pfm_get_task(struct pfm_context *ctx, pid_t pid, struct task_struct **task) +{ + struct task_struct *p; + int ret = 0, ret1 = 0; + + /* + * When attaching to another thread we must ensure + * that the thread is actually stopped. Just like with + * perfmon system calls, we enforce that the thread + * be ptraced and STOPPED by using ptrace_check_attach(). + * + * As a consequence, only the ptracing parent can actually + * attach a context to a thread. Obviously, this constraint + * does not exist for self-monitoring threads. + * + * We use ptrace_may_attach() to check for permission. + * No permission checking is needed for self monitoring. 
+ */ + read_lock(&tasklist_lock); + + p = find_task_by_pid(pid); + if (p) + get_task_struct(p); + + read_unlock(&tasklist_lock); + + if (p == NULL) + return -ESRCH; + + ret = -EPERM; + + /* + * returns 0 if cannot attach + */ + ret1 = ptrace_may_attach(p); + if (ret1) + ret = ptrace_check_attach(p, 0); + + PFM_DBG("may_attach=%d check_attach=%d", ret1, ret); + + if (ret || !ret1) + goto error; + + ret = pfm_task_incompatible(ctx, p); + if (ret) + goto error; + + *task = p; + + return 0; +error: + if (!(ret1 || ret)) + ret = -EPERM; + + put_task_struct(p); + + return ret; +} + +/* + * context must be locked when calling this function + */ +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags) +{ + struct task_struct *task; + unsigned long local_flags, new_flags; + int state, ret; + +recheck: + /* + * task is NULL for system-wide context + */ + task = ctx->task; + state = ctx->state; + local_flags = *flags; + + PFM_DBG("state=%d check_mask=0x%x", state, check_mask); + /* + * if the context is detached, then we do not touch + * hardware, therefore there is not restriction on when we can + * access it. + */ + if (state == PFM_CTX_UNLOADED) + return 0; + /* + * no command can operate on a zombie context. + * A context becomes zombie when the file that identifies + * it is closed while the context is still attached to the + * thread it monitors. + */ + if (state == PFM_CTX_ZOMBIE) + return -EINVAL; + + /* + * at this point, state is PFM_CTX_LOADED or PFM_CTX_MASKED + */ + + /* + * some commands require the context to be unloaded to operate + */ + if (check_mask & PFM_CMD_UNLOADED) { + PFM_DBG("state=%d, cmd needs context unloaded", state); + return -EBUSY; + } + + /* + * self-monitoring always ok. + */ + if (task == current) + return 0; + + /* + * for syswide, the calling thread must be running on the cpu + * the context is bound to. There cannot be preemption as we + * check with interrupts disabled. + */ + if (ctx->flags.system) { + if (ctx->cpu != smp_processor_id()) + return -EBUSY; + return 0; + } + + /* + * at this point, monitoring another thread + */ + + /* + * the pfm_unload_context() command is allowed on masked context + */ + if (state == PFM_CTX_MASKED && !(check_mask & PFM_CMD_UNLOAD)) + return 0; + + /* + * When we operate on another thread, we must wait for it to be + * stopped and completely off any CPU as we need to access the + * PMU state (or machine state). + * + * A thread can be put in the STOPPED state in various ways + * including PTRACE_ATTACH, or when it receives a SIGSTOP signal. + * We enforce that the thread must be ptraced, so it is stopped + * AND it CANNOT wake up while we operate on it because this + * would require an action from the ptracing parent which is the + * thread that is calling this function. + * + * The dependency on ptrace, imposes that only the ptracing + * parent can issue command on a thread. This is unfortunate + * but we do not know of a better way of doing this. 
+ */ + if (check_mask & PFM_CMD_STOPPED) { + + spin_unlock_irqrestore(&ctx->lock, local_flags); + + /* + * check that the thread is ptraced AND STOPPED + */ + ret = ptrace_check_attach(task, 0); + + spin_lock_irqsave(&ctx->lock, new_flags); + + /* + * flags may be different than when we released the lock + */ + *flags = new_flags; + + if (ret) + return ret; + /* + * we must recheck to verify if state has changed + */ + if (ctx->state != state) { + PFM_DBG("old_state=%d new_state=%d", + state, + ctx->state); + goto recheck; + } + } + return 0; +} + +/* + * both req and ptr_free are kmalloc'ed, thus they need a kfree by caller + */ +int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, + void **req, void **ptr_free) +{ + void *addr; + + /* + * check if we can get by with stack buffer + */ + if (sz <= lsz) { + *req = laddr; + *ptr_free = NULL; + return copy_from_user(laddr, ureq, sz) ? -EFAULT : 0; + } + + if (unlikely(sz > pfm_controls.arg_mem_max)) { + PFM_DBG("argument too big %zu max=%zu", + sz, + pfm_controls.arg_mem_max); + return -E2BIG; + } + + addr = kmalloc(sz, GFP_KERNEL); + if (unlikely(addr == NULL)) + return -ENOMEM; + + if (copy_from_user(addr, ureq, sz)) { + kfree(addr); + return -EFAULT; + } + *req = *ptr_free = addr; + + return 0; +} + +/* + * arg is kmalloc'ed, thus it needs a kfree by caller + */ +int pfm_get_smpl_arg(char __user *fmt_uname, void __user *fmt_uarg, size_t usize, void **arg, + struct pfm_smpl_fmt **fmt) +{ + struct pfm_smpl_fmt *f; + char *fmt_name; + void *addr = NULL; + size_t sz; + int ret; + + fmt_name = getname(fmt_uname); + if (!fmt_name) { + PFM_DBG("getname failed"); + return -ENOMEM; + } + + /* + * find fmt and increase refcount + */ + f = pfm_smpl_fmt_get(fmt_name); + + putname(fmt_name); + + if (f == NULL) { + PFM_DBG("buffer format not found"); + return -EINVAL; + } + + /* + * expected format argument size + */ + sz = f->fmt_arg_size; + + /* + * check user size matches expected size + * usize = -1 is for IA-64 backward compatibility + */ + ret = -EINVAL; + if (sz != usize && usize != -1) { + PFM_DBG("invalid arg size %zu, format expects %zu", + usize, sz); + goto error; + } + + if (sz) { + ret = -ENOMEM; + addr = kmalloc(sz, GFP_KERNEL); + if (addr == NULL) + goto error; + + ret = -EFAULT; + if (copy_from_user(addr, fmt_uarg, sz)) + goto error; + } + *arg = addr; + *fmt = f; + return 0; + +error: + kfree(addr); + pfm_smpl_fmt_put(f); + return ret; +} + +/* + * unlike the other perfmon system calls, this one return a file descriptor + * or a value < 0 in case of error, very much like open() or socket() + */ +asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, + char __user *fmt_name, + void __user *fmt_uarg, size_t fmt_size) +{ + struct pfarg_ctx req; + struct pfm_context *new_ctx; + struct pfm_smpl_fmt *fmt = NULL; + void *fmt_arg = NULL; + int ret; + + if (perfmon_disabled) + return -ENOSYS; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + if (fmt_name) { + ret = pfm_get_smpl_arg(fmt_name, fmt_uarg, fmt_size, &fmt_arg, &fmt); + if (ret) + goto abort; + } + + ret = __pfm_create_context(&req, fmt, fmt_arg, PFM_NORMAL, &new_ctx); + + kfree(fmt_arg); +abort: + return ret; +} + +asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmc pmcs[PFM_PMC_STK_ARG]; + struct pfarg_pmc *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= 
PFM_MAX_ARG_COUNT(ureq)) { + PFM_DBG("invalid arg count %d", count); + return -EINVAL; + } + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmcs), pmcs, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_write_pmcs(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + /* + * This function may be on the critical path. + * We want to avoid the branch if unecessary. + */ + if (fptr) + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_write_pmds(ctx, req, count, 0); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (fptr) + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_read_pmds(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (fptr) + kfree(req); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_restart(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed, complete_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if 
(unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, 0, &flags); + if (!ret) + ret = __pfm_restart(ctx, &complete_needed); + + spin_unlock_irqrestore(&ctx->lock, flags); + /* + * In per-thread mode with blocking notification, i.e. + * ctx->flags.blocking=1, we need to defer issuing the + * complete to unblock the blocked monitored thread. + * Otherwise we have a potential deadlock due to a lock + * inversion between the context lock and the task_rq_lock() + * which can happen if one thread is in this call and the other + * (the monitored thread) is in the context switch code. + * + * It is safe to access the context outside the critical section + * because: + * - we are protected by the fget_light(), so the context cannot + * disappear. + * - we are protected against another thread issuing a extraneous + * pfm_restart() because the ctx->flags.can-restart flag has + * already been cleared + * - the restart_complete field is only touched by the context init + * code (happens only once) or by wait_for_completion_interruptible + * in __pfm_handle_work(), so this is already serialized + */ + if (complete_needed) + complete(&ctx->restart_complete); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_stop(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_stop(ctx); + + spin_unlock_irqrestore(&ctx->lock, flags); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_start req; + unsigned long flags; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * the one argument is actually optional + */ + if (ureq && copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_start(ctx, ureq ? 
&req : NULL); + + spin_unlock_irqrestore(&ctx->lock, flags); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + unsigned long flags; + struct pfarg_load req; + int ret, fput_needed; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + task = NULL; + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * in per-thread mode (not self-monitoring), get a reference + * on task to monitor. This must be done with interrupts enabled + * Upon succesful return, refcount on task is increased. + * + * fget_light() is protecting the context. + */ + if (!ctx->flags.system) { + if (req.load_pid != current->pid) { + ret = pfm_get_task(ctx, req.load_pid, &task); + if (ret) + goto error; + } else + task = current; + } + + /* + * irqsave is required to avoid race in case context is already + * loaded or with switch timeout in the case of self-monitoring + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_load_context(ctx, &req, task); + + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * in per-thread mode (not self-monitoring), we need + * to decrease refcount on task to monitor: + * - load successful: we have a reference to the task in ctx->task + * - load failed : undo the effect of pfm_get_task() + */ + if (task && task != current) + put_task_struct(task); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_unload_context(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed; + int is_system, can_release = 0; + u32 cpu; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + is_system = ctx->flags.system; + cpu = ctx->cpu; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, &flags); + if (!ret) + ret = __pfm_unload_context(ctx, &can_release); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(is_system, cpu); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_create_evtsets(int fd, struct pfarg_setdesc __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setdesc *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * must mask interrupts because we do not know the state of context, + * could be 
attached and we could be getting PMU interrupts. So + * we mask and lock context and we check and possibly relax masking + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_create_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_getinfo_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setinfo *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * this command operate even when context is loaded, so we need + * to keep interrupts masked to avoid a race with PMU interrupt + * which may switch active set + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, 0, &flags); + if (!ret) + ret = __pfm_getinfo_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + if (ret) + PFM_DBG("failed: errno=%d", -ret); + return ret; +} + +asmlinkage long sys_pfm_delete_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setinfo *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * must mask interrupts because we do not know the state of context, + * could be attached and we could be getting PMU interrupts + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_delete_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} Index: linux-2.6/perfmon/perfmon_sysfs.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_sysfs.c @@ -0,0 +1,937 @@ +/* + * perfmon_sysfs.c: perfmon2 sysfs interface + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. 
+ * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct pfm_attribute { + struct attribute attr; + ssize_t (*show)(void *, char *); + ssize_t (*store)(void *, const char *, size_t); +}; +#define to_attr(n) container_of(n, struct pfm_attribute, attr); + +#define PFM_RO_ATTR(_name) \ +struct pfm_attribute attr_##_name = __ATTR_RO(_name) + +#define PFM_RW_ATTR(_name,_mode,_show,_store) \ +struct pfm_attribute attr_##_name = __ATTR(_name,_mode,_show,_store); + +static int pfm_sysfs_init_done; /* true when pfm_sysfs_init() completed */ + +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +struct pfm_controls pfm_controls = { + .sys_group = PFM_GROUP_PERM_ANY, + .task_group = PFM_GROUP_PERM_ANY, + .arg_mem_max = PAGE_SIZE, + .smpl_buffer_mem_max = ~0, +}; +EXPORT_SYMBOL(pfm_controls); + +DECLARE_PER_CPU(struct pfm_stats, pfm_stats); + +static struct kobject pfm_kernel_kobj, pfm_kernel_fmt_kobj; + +static void pfm_reset_stats(int cpu) +{ + struct pfm_stats *st; + unsigned long flags; + + st = &per_cpu(pfm_stats, cpu); + + local_irq_save(flags); + + /* + * cannot use memset because of kobj member + */ + st->ovfl_intr_replay_count = 0; + st->ovfl_intr_regular_count = 0; + st->ovfl_intr_all_count = 0; + st->ovfl_intr_ns = 0; + st->ovfl_intr_phase1 = 0; + st->ovfl_intr_phase2 = 0; + st->ovfl_intr_phase3 = 0; + st->ovfl_intr_nmi_count = 0; + st->handle_work_count = 0; + st->ovfl_notify_count = 0; + st->reset_pmds_count = 0; + st->pfm_restart_count = 0; + st->ccnt0 = 0; + st->ccnt1 = 0; + st->ccnt2 = 0; + st->ccnt3 = 0; + st->ccnt4 = 0; + st->ccnt5 = 0; + st->ccnt6 = 0; + st->fmt_handler_calls = 0; + st->fmt_handler_ns = 0; + st->set_switch_count = 0; + st->set_switch_ns = 0; + st->ctxsw_count = 0; + st->ctxsw_ns = 0; + st->handle_timeout_count = 0; + + local_irq_restore(flags); +} + +static ssize_t pfm_fmt_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_smpl_fmt *fmt = to_smpl_fmt(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(fmt, buf) : -EIO; +} + +static ssize_t pfm_pmu_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_pmu_config *pmu= to_pmu(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? 
attribute->show(pmu, buf) : -EIO; +} + +static ssize_t pfm_stats_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_stats *st = to_stats(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(st, buf) : -EIO; +} + +static ssize_t pfm_regs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_regmap_desc *reg = to_reg(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(reg, buf) : -EIO; +} + +static ssize_t pfm_stats_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct pfm_stats *st = to_stats(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->store ? attribute->store(st, buf, count) : -EIO; +} + +static struct sysfs_ops pfm_fmt_sysfs_ops = { + .show = pfm_fmt_attr_show +}; + +static struct sysfs_ops pfm_pmu_sysfs_ops = { + .show = pfm_pmu_attr_show +}; + +static struct sysfs_ops pfm_stats_sysfs_ops = { + .show = pfm_stats_attr_show, + .store = pfm_stats_attr_store +}; + +static struct sysfs_ops pfm_regs_sysfs_ops = { + .show = pfm_regs_attr_show +}; + +static struct kobj_type pfm_fmt_ktype = { + .sysfs_ops = &pfm_fmt_sysfs_ops, +}; + +static struct kobj_type pfm_pmu_ktype = { + .sysfs_ops = &pfm_pmu_sysfs_ops, +}; + +static struct kobj_type pfm_stats_ktype = { + .sysfs_ops = &pfm_stats_sysfs_ops, +}; + +static struct kobj_type pfm_regs_ktype = { + .sysfs_ops = &pfm_regs_sysfs_ops, +}; + +decl_subsys_name(pfm_fmt, pfm_fmt, &pfm_fmt_ktype, NULL); +decl_subsys_name(pfm_pmu, pfm_pmu, &pfm_pmu_ktype, NULL); +decl_subsys_name(pfm_stats, pfm_stats, &pfm_stats_ktype, NULL); +decl_subsys_name(pfm_regs, pfm_regs, &pfm_regs_ktype, NULL); + +static ssize_t version_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); +} + +static ssize_t pmd_max_fast_arg_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", PFM_PMD_STK_ARG); +} + +static ssize_t pmc_max_fast_arg_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", PFM_PMC_STK_ARG); +} + +static ssize_t task_sessions_count_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 0); +} + +static ssize_t sys_sessions_count_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 1); +} + +static ssize_t smpl_buffer_mem_cur_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 2); +} + +static ssize_t debug_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug); +} + +static ssize_t debug_store(void *info, const char *buf, size_t sz) +{ + int d, i; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.debug = d; + + if (d == 0) { + for_each_online_cpu(i) { + pfm_reset_stats(i); + } + } + return sz; +} + +static ssize_t debug_ovfl_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug_ovfl); +} + +static ssize_t debug_ovfl_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.debug_ovfl = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t reset_stats_show(void *info, char *buf) +{ + buf[0]='0'; + buf[1]='\0'; + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t reset_stats_store(void *info, const char *buf, size_t count) +{ + int i; + + for_each_online_cpu(i) { + pfm_reset_stats(i); + } + 
return count; +} + +static ssize_t sys_group_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.sys_group); +} + +static ssize_t sys_group_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.sys_group = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t task_group_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group); +} + +static ssize_t task_group_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.task_group = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t buf_size_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.smpl_buffer_mem_max); +} + +static ssize_t buf_size_store(void *info, const char *buf, size_t sz) +{ + size_t d; + + if (sscanf(buf,"%zu", &d) != 1) + return -EINVAL; + /* + * we impose a page as the minimum + */ + if (d < PAGE_SIZE) + return -EINVAL; + + pfm_controls.smpl_buffer_mem_max = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t arg_size_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max); +} + +static ssize_t arg_size_store(void *info, const char *buf, size_t sz) +{ + size_t d; + + if (sscanf(buf,"%zu", &d) != 1) + return -EINVAL; + + /* + * we impose a page as the minimum. + * + * This limit may be smaller than the stack buffer + * available and that is fine. + */ + if (d < PAGE_SIZE) + return -EINVAL; + + pfm_controls.arg_mem_max = d; + + return strnlen(buf, PAGE_SIZE); +} + +/* + * /sys/kernel/perfmon attributes + */ +static PFM_RO_ATTR(version); +static PFM_RO_ATTR(task_sessions_count); +static PFM_RO_ATTR(sys_sessions_count); +static PFM_RO_ATTR(smpl_buffer_mem_cur); +static PFM_RO_ATTR(pmd_max_fast_arg); +static PFM_RO_ATTR(pmc_max_fast_arg); + +static PFM_RW_ATTR(debug, 0644, debug_show, debug_store); +static PFM_RW_ATTR(debug_ovfl, 0644, debug_ovfl_show, debug_ovfl_store); +static PFM_RW_ATTR(reset_stats, 0644, reset_stats_show, reset_stats_store); +static PFM_RW_ATTR(sys_group, 0644, sys_group_show, sys_group_store); +static PFM_RW_ATTR(task_group, 0644, task_group_show, task_group_store); +static PFM_RW_ATTR(smpl_buffer_mem_max, 0644, buf_size_show, buf_size_store); +static PFM_RW_ATTR(arg_mem_max, 0644, arg_size_show, arg_size_store); + +static struct attribute *pfm_kernel_attrs[] = { + &attr_version.attr, + &attr_task_sessions_count.attr, + &attr_sys_sessions_count.attr, + &attr_smpl_buffer_mem_cur.attr, + &attr_pmd_max_fast_arg.attr, + &attr_pmc_max_fast_arg.attr, + &attr_debug.attr, + &attr_debug_ovfl.attr, + &attr_reset_stats.attr, + &attr_sys_group.attr, + &attr_task_group.attr, + &attr_smpl_buffer_mem_max.attr, + &attr_arg_mem_max.attr, + NULL +}; + +static struct attribute_group pfm_kernel_attr_group = { + .attrs = pfm_kernel_attrs, +}; + +int __init pfm_init_sysfs(void) +{ + int ret; + int done_fmt = 0, done_pmu = 0, done_stats = 0, done_regs = 0; + int done_kobj_fmt = 0, done_kobj_kernel = 0; + int i, cpu = -1; + + ret = subsystem_register(&pfm_fmt_subsys); + if (ret) { + PFM_INFO("cannot register pfm_fmt_subsys: %d", ret); + goto error; + } + done_fmt = 1; + + ret = subsystem_register(&pfm_pmu_subsys); + if (ret) { + PFM_INFO("cannot register pfm_pmu_subsys: %d", ret); + goto error; + } + done_pmu = 1; + + ret = subsystem_register(&pfm_stats_subsys); + if (ret) { + PFM_INFO("cannot register 
pfm_stats_subsys: %d", ret); + goto error; + } + done_stats = 1; + + ret = subsystem_register(&pfm_regs_subsys); + if (ret) { + PFM_INFO("cannot register pfm_regs_subsys: %d", ret); + goto error; + } + done_regs = 1; + + kobject_init(&pfm_kernel_kobj); + kobject_init(&pfm_kernel_fmt_kobj); + + pfm_kernel_kobj.parent = &kernel_subsys.kobj; + kobject_set_name(&pfm_kernel_kobj, "perfmon"); + + pfm_kernel_fmt_kobj.parent = &pfm_kernel_kobj; + kobject_set_name(&pfm_kernel_fmt_kobj, "formats"); + + ret = kobject_add(&pfm_kernel_kobj); + if (ret) { + PFM_INFO("cannot add kernel object: %d", ret); + goto error; + } + done_kobj_kernel = 1; + + ret = kobject_add(&pfm_kernel_fmt_kobj); + if (ret) { + PFM_INFO("cannot add fmt object: %d", ret); + goto error; + } + done_kobj_fmt = 1; + + ret = sysfs_create_group(&pfm_kernel_kobj, &pfm_kernel_attr_group); + if (ret) { + PFM_INFO("cannot create kernel group"); + goto error; + } + + /* + * must be set before builtin_fmt and + * add_pmu() calls + */ + pfm_sysfs_init_done = 1; + + pfm_sysfs_builtin_fmt_add(); + + if (pfm_pmu_conf) + pfm_sysfs_add_pmu(pfm_pmu_conf); + + for_each_online_cpu(cpu) { + ret = pfm_sysfs_add_cpu(cpu); + if (ret) + goto error; + } + return 0; +error: + if (done_fmt) + subsystem_unregister(&pfm_fmt_subsys); + if (done_pmu) + subsystem_unregister(&pfm_pmu_subsys); + if (done_stats) + subsystem_unregister(&pfm_stats_subsys); + if (done_regs) + subsystem_unregister(&pfm_regs_subsys); + if (done_kobj_kernel) + kobject_del(&pfm_kernel_kobj); + if (done_kobj_fmt) + kobject_del(&pfm_kernel_fmt_kobj); + + for (i=0; i < cpu; i++) + pfm_sysfs_del_cpu(i); + + return ret; +} + +/* + * per-cpu perfmon stats attributes + */ +#define PFM_DECL_STATS_ATTR(name) \ +static ssize_t name##_show(void *info, char *buf) \ +{ \ + struct pfm_stats *st = info;\ + return snprintf(buf, PAGE_SIZE, "%llu\n", \ + (unsigned long long)st->name); \ +} \ +static PFM_RO_ATTR(name) + +PFM_DECL_STATS_ATTR(ovfl_intr_replay_count); +PFM_DECL_STATS_ATTR(ovfl_intr_all_count); +PFM_DECL_STATS_ATTR(ovfl_intr_ns); +PFM_DECL_STATS_ATTR(fmt_handler_calls); +PFM_DECL_STATS_ATTR(fmt_handler_ns); +PFM_DECL_STATS_ATTR(set_switch_count); +PFM_DECL_STATS_ATTR(set_switch_ns); +PFM_DECL_STATS_ATTR(ctxsw_count); +PFM_DECL_STATS_ATTR(ctxsw_ns); +PFM_DECL_STATS_ATTR(handle_timeout_count); +PFM_DECL_STATS_ATTR(ovfl_intr_nmi_count); +PFM_DECL_STATS_ATTR(handle_work_count); +PFM_DECL_STATS_ATTR(ovfl_notify_count); +PFM_DECL_STATS_ATTR(reset_pmds_count); +PFM_DECL_STATS_ATTR(pfm_restart_count); +PFM_DECL_STATS_ATTR(ccnt0); +PFM_DECL_STATS_ATTR(ccnt1); +PFM_DECL_STATS_ATTR(ccnt2); +PFM_DECL_STATS_ATTR(ccnt3); +PFM_DECL_STATS_ATTR(ccnt4); +PFM_DECL_STATS_ATTR(ccnt5); +PFM_DECL_STATS_ATTR(ccnt6); + +/* + * per-reg attributes + */ +static ssize_t name_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc); +} +static PFM_RO_ATTR(name); + +static ssize_t dfl_val_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->dfl_val); +} +static PFM_RO_ATTR(dfl_val); + +static ssize_t rsvd_msk_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->rsvd_msk); +} +static PFM_RO_ATTR(rsvd_msk); + +static ssize_t width_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + int w; + + w = (reg->type & PFM_REG_C64) ? 
pfm_pmu_conf->counter_width : 64; + + return snprintf(buf, PAGE_SIZE, "%d\n", w); +} +static PFM_RO_ATTR(width); + + +static ssize_t addr_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr); +} +static PFM_RO_ATTR(addr); + +static ssize_t ovfl_intr_spurious_count_show(void *info, char *buf) +{ + struct pfm_stats *st = info; + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(st->ovfl_intr_all_count + - st->ovfl_intr_regular_count)); +} + +static ssize_t ovfl_intr_regular_count_show(void *info, char *buf) +{ + struct pfm_stats *st = info; + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(st->ovfl_intr_regular_count + - st->ovfl_intr_replay_count)); +} + +static PFM_RO_ATTR(ovfl_intr_spurious_count); +static PFM_RO_ATTR(ovfl_intr_regular_count); + +static struct attribute *pfm_cpu_attrs[] = { + &attr_ovfl_intr_spurious_count.attr, + &attr_ovfl_intr_replay_count.attr, + &attr_ovfl_intr_regular_count.attr, + &attr_ovfl_intr_all_count.attr, + &attr_ovfl_intr_ns.attr, + &attr_handle_work_count.attr, + &attr_ovfl_notify_count.attr, + &attr_reset_pmds_count.attr, + &attr_pfm_restart_count.attr, + &attr_ccnt0.attr, + &attr_ccnt1.attr, + &attr_ccnt2.attr, + &attr_ccnt3.attr, + &attr_ccnt4.attr, + &attr_ccnt5.attr, + &attr_ccnt6.attr, + &attr_fmt_handler_calls.attr, + &attr_fmt_handler_ns.attr, + &attr_set_switch_count.attr, + &attr_set_switch_ns.attr, + &attr_ctxsw_count.attr, + &attr_ctxsw_ns.attr, + &attr_handle_timeout_count.attr, + &attr_ovfl_intr_nmi_count.attr, + NULL +}; + +static struct attribute_group pfm_cpu_attr_group = { + .attrs = pfm_cpu_attrs, +}; + +int pfm_sysfs_add_cpu(int mycpu) +{ + struct sys_device *cpudev; + struct pfm_stats *st; + int ret; + + cpudev = get_cpu_sysdev(mycpu); + if (!cpudev) + return -EINVAL; + + st = &per_cpu(pfm_stats, mycpu); + + kobject_init(&st->kobj); + + st->kobj.parent = &cpudev->kobj; + kobject_set_name(&st->kobj, "perfmon"); + kobj_set_kset_s(st, pfm_stats_subsys); + + ret = kobject_add(&st->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&st->kobj, &pfm_cpu_attr_group); + if (ret) + kobject_del(&st->kobj); + + pfm_reset_stats(mycpu); + + return ret; +} + +void pfm_sysfs_del_cpu(int mycpu) +{ + struct sys_device *cpudev; + struct pfm_stats *st; + + cpudev = get_cpu_sysdev(mycpu); + if (!cpudev) + return; + + st = &per_cpu(pfm_stats, mycpu); + kobject_del(&st->kobj); + + sysfs_remove_group(&st->kobj, &pfm_cpu_attr_group); +} + +static ssize_t smpl_version_show(void *data, char *buf) +{ + struct pfm_smpl_fmt *fmt = data; + + return snprintf(buf, PAGE_SIZE, "%u.%u", + fmt->fmt_version >>16 & 0xffff, + fmt->fmt_version & 0xffff); +} +PFM_RO_ATTR(smpl_version); + +static ssize_t smpl_argsize_show(void *data, char *buf) +{ + struct pfm_smpl_fmt *fmt = data; + + return snprintf(buf, PAGE_SIZE, "%zu", fmt->fmt_arg_size); +} +PFM_RO_ATTR(smpl_argsize); + +static struct attribute *pfm_fmt_attrs[] = { + &attr_smpl_version.attr, + &attr_smpl_argsize.attr, + NULL +}; + +static struct attribute_group pfm_fmt_attr_group = { + .attrs = pfm_fmt_attrs, +}; + + +/* + * when a sampling format module is inserted, we populate + * sysfs with some information + */ +int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt) +{ + int ret; + + if (pfm_sysfs_init_done == 0) + return 0; + + kobject_init(&fmt->kobj); + kobject_set_name(&fmt->kobj, fmt->fmt_name); + kobj_set_kset_s(fmt, pfm_fmt_subsys); + fmt->kobj.parent = &pfm_kernel_fmt_kobj; + + ret = 
kobject_add(&fmt->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&fmt->kobj, &pfm_fmt_attr_group); + if (ret) + kobject_del(&fmt->kobj); + + return ret; +} + +/* + * when a sampling format module is removed, its information + * must also be removed from sysfs + */ +int pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt) +{ + if (pfm_sysfs_init_done == 0) + return 0; + + sysfs_remove_group(&fmt->kobj, &pfm_fmt_attr_group); + kobject_del(&fmt->kobj); + + return 0; +} + +static struct attribute *pfm_reg_attrs[] = { + &attr_name.attr, + &attr_dfl_val.attr, + &attr_rsvd_msk.attr, + &attr_width.attr, + &attr_addr.attr, + NULL +}; + +static struct attribute_group pfm_reg_attr_group = { + .attrs = pfm_reg_attrs, +}; + +static ssize_t model_show(void *info, char *buf) +{ + struct pfm_pmu_config *p = info; + return snprintf(buf, PAGE_SIZE, "%s\n", p->pmu_name); +} +static PFM_RO_ATTR(model); + +static struct attribute *pfm_pmu_desc_attrs[] = { + &attr_model.attr, + NULL +}; + +static struct attribute_group pfm_pmu_desc_attr_group = { + .attrs = pfm_pmu_desc_attrs, +}; + +static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i, k; + int ret; + char reg_name[8]; + + reg = pmu->pmc_desc; + for(i=0; i < pmu->num_pmc_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + kobject_init(®->kobj); + + reg->kobj.parent = &pmu->kobj; + snprintf(reg_name, sizeof(reg_name), "pmc%u", i); + kobject_set_name(®->kobj, reg_name); + kobj_set_kset_s(reg, pfm_regs_subsys); + + ret = kobject_add(®->kobj); + if (ret) + goto undo_pmcs; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmcs; + } + } + + reg = pmu->pmd_desc; + for(i=0; i < pmu->num_pmd_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + kobject_init(®->kobj); + + reg->kobj.parent = &pmu->kobj; + snprintf(reg_name, sizeof(reg_name), "pmd%u", i); + kobject_set_name(®->kobj, reg_name); + kobj_set_kset_s(reg, pfm_regs_subsys); + + ret = kobject_add(®->kobj); + if (ret) + goto undo_pmds; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmds; + } + } + return 0; +undo_pmds: + reg = pmu->pmd_desc; + for(k = 0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + i = pmu->num_pmc_entries; + /* fall through */ +undo_pmcs: + reg = pmu->pmc_desc; + for(k=0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return ret; +} + +static int pfm_sysfs_del_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i; + + reg = pmu->pmc_desc; + for(i=0; i < pmu->regs.max_pmc; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + + reg = pmu->pmd_desc; + for(i=0; i < pmu->regs.max_pmd; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return 0; +} + +/* + * when a PMU description module is inserted, we create + * a pmu_desc subdir in sysfs and we populate it with + * PMU specific information, such as register mappings + */ +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu) +{ + int ret; + + if (pfm_sysfs_init_done == 0) + return 0; + + kobject_init(&pmu->kobj); + 
kobject_set_name(&pmu->kobj, "pmu_desc"); + kobj_set_kset_s(pmu, pfm_pmu_subsys); + pmu->kobj.parent = &pfm_kernel_kobj; + + ret = kobject_add(&pmu->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + if (ret) + kobject_del(&pmu->kobj); + + ret = pfm_sysfs_add_pmu_regs(pmu); + if (ret) { + sysfs_remove_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + kobject_del(&pmu->kobj); + } + return ret; +} + +/* + * when a PMU description module is removed, we also remove + * all its information from sysfs, i.e., the pmu_desc subdir + * disappears + */ +int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu) +{ + if (pfm_sysfs_init_done == 0) + return 0; + + pfm_sysfs_del_pmu_regs(pmu); + sysfs_remove_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + kobject_del(&pmu->kobj); + + return 0; +}
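
For reference, below is a minimal user-level sketch of the call sequence that the handlers in perfmon_syscalls.c above implement: create a context (which returns a file descriptor), program PMC/PMD registers, attach the context, start and stop monitoring, then read the counters back. The pfm_*() wrappers, the <perfmon/perfmon.h> header, and the pfarg_* field names (reg_num, reg_value, load_pid) are assumptions made for illustration; real applications obtain them from libpfm and the architecture's syscall stubs, they are not part of this patch.

/*
 * Hypothetical self-monitoring example for the perfmon2 syscall flow.
 * Wrapper names, header path and pfarg_* field names are assumptions;
 * only the syscall ordering mirrors the kernel code in this patch.
 */
#include <stdio.h>
#include <unistd.h>
#include <perfmon/perfmon.h>	/* assumed header exporting pfarg_* and pfm_*() */

int main(void)
{
	struct pfarg_ctx  ctx  = { 0 };
	struct pfarg_pmc  pmc  = { .reg_num = 0, .reg_value = 0 };	/* event encoding */
	struct pfarg_pmd  pmd  = { .reg_num = 0, .reg_value = 0 };	/* counter reset value */
	struct pfarg_load load = { .load_pid = getpid() };		/* self-monitoring */
	int fd;

	/* like open()/socket(), returns a file descriptor or a negative error */
	fd = pfm_create_context(&ctx, NULL, NULL, 0);
	if (fd < 0)
		return 1;

	pfm_write_pmcs(fd, &pmc, 1);	/* program the config register while detached */
	pfm_write_pmds(fd, &pmd, 1);	/* initialize the matching counter */
	pfm_load_context(fd, &load);	/* attach the context to this thread */
	pfm_start(fd, NULL);		/* the pfarg_start argument is optional */

	/* ... workload to measure ... */

	pfm_stop(fd);
	pfm_read_pmds(fd, &pmd, 1);
	printf("count=%llu\n", (unsigned long long)pmd.reg_value);

	pfm_unload_context(fd);
	close(fd);
	return 0;
}

Because the context is self-monitored here, pfm_check_task_state() in the kernel accepts every command (task == current), so no ptrace attachment is needed; monitoring another thread would additionally require the caller to be its ptracing parent with the target stopped, as described in the locking-rules comment above.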