commit d2b2ea81b693098865444f199ec1e61e75747654
Merge: 29e3e17... 3f46faa...
Author: Arnd Bergmann
Date:   Fri Jul 6 19:25:32 2007 +0200

    Merge branches 'netdev-merge' and 'perfmon2' into perfmon2-merge

commit 29e3e17760cc6add6c95aaeb76d11893c9f7229a
Merge: 2f18e1c... fb8e39e...
Author: Arnd Bergmann
Date:   Fri Jul 6 19:18:10 2007 +0200

    Merge branch 'netdev-merge' into perfmon2-merge

    Conflicts:
        arch/powerpc/Kconfig
        include/asm-powerpc/systbl.h
        include/asm-powerpc/unistd.h

commit 2f18e1caf74987f749eba3af99dc39e3e07f0e85
Author: Kevin Corry
Date:   Fri Jun 29 08:56:00 2007 -0500

    Perfmon2: Add support for Cell PMU's hardware-sampling.

commit fa0876715a1a02a5170e023ebb39e6cb68abacf7
Author: Stephane Eranian
Date:   Sun Jun 24 16:22:04 2007 -0500

    Update to Perfmon2 version 070621: powerpc code.

commit e332e9217b88f537cbf8b882398592f54c557b97
Author: Stephane Eranian
Date:   Sun Jun 24 15:26:17 2007 -0500

    Update to Perfmon2 version 070621: x86-64 code.

commit 9e5443f91fc5dd66469ab09331b5d2bee06bb1db
Author: Stephane Eranian
Date:   Sun Jun 24 15:10:58 2007 -0500

    Update to Perfmon2 version 070621: mips code.

commit f06e44db987b70306896ff5cdfd84023b224949b
Author: Stephane Eranian
Date:   Sat Jun 23 17:22:04 2007 -0500

    Update to Perfmon2 version 070621: ia64 code.

commit fb9f0a0e62583e51f242b19a817629e75e5961c0
Author: Kevin Corry
Date:   Sat Jun 23 17:19:55 2007 -0500

    Update to Perfmon2 version 070621: i386 code.

commit 8c649c0b5e0a210015b1eb45ec82d87b76e7c9d9
Author: Stephane Eranian
Date:   Sat Jun 23 17:06:31 2007 -0500

    Update to Perfmon2 version 070621: core Perfmon2 code.

commit 68971f94498ea868c6a3e6d6351ff7120edf14ad
Merge: 90ec31a... 0864a4e...
Author: Kevin Corry
Date:   Fri Jun 22 09:09:16 2007 -0500

    Merge with git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git

commit 90ec31acbcfa76df7ab5c7c98a0b9d6b3e7ec1ce
Author: Kevin Corry
Date:   Thu Jun 21 11:57:54 2007 -0500

    Add TIF_PERFMON_WORK and TIF_PERFMON_CTXSW flags.

    This patch has already been picked up in the powerpc tree, but not yet in
    mainline. This patch is required for the mainline-based Perfmon2 tree to
    compile on powerpc.

commit 483a8ce2532afb62378abfe6758ba833be719a63
Author: Stephane Eranian
Date:   Thu Jun 21 11:28:50 2007 -0500

    Perfmon2: powerpc modifications.

    Modifications to powerpc architecture code needed for Perfmon2.

commit b1a5a6fc3dcb6bff4f19377785a21692fc47620f
Author: Stephane Eranian
Date:   Wed Jun 20 16:12:37 2007 -0500

    Perfmon2: new powerpc architecture code

commit c4db69f1389171c8977534c8944d7128dee794d3
Author: Stephane Eranian
Date:   Wed Jun 20 14:45:03 2007 -0500

    Perfmon2: x86-64 modifications.

    Modifications to x86-64 architecture code needed for Perfmon2.

commit 77323002a9ffd5367f7d0288619733a15dd403ef
Author: Stephane Eranian
Date:   Wed Jun 20 15:17:55 2007 -0500

    Perfmon2: new x86-64 architecture code

commit ba8ce1c95c79fafe561bafeff3c5e480a9104d1a
Author: Stephane Eranian
Date:   Wed Jun 20 14:28:23 2007 -0500

    Perfmon2: mips modifications.

    Modifications to mips architecture code needed for Perfmon2.

commit 0af167473835e52268d7f5c004e2ec9463a357f4
Author: Stephane Eranian
Date:   Wed Jun 20 15:15:41 2007 -0500

    Perfmon2: new mips architecture code

commit 884b8c725e227901f0a548b55803a01cc6f156f6
Author: Stephane Eranian
Date:   Wed Jun 20 14:11:17 2007 -0500

    Perfmon2: ia64 modifications.

    Modifications to ia64 architecture code needed for Perfmon2.
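The user-visible surface of this series is a small set of pfm_* system calls, wired up for i386 in the syscall_table.S hunk further down (table entries 324 through 335). The fragment below sketches how a self-monitoring thread would drive that interface; it is illustrative only. The syscall numbers are read off the comments in that hunk, while the pfarg_* argument structures, the perfmon/perfmon.h header location, and the trailing sampling-format arguments of pfm_create_context() come from the perfmon2 user ABI and are assumptions, not part of this excerpt.

/*
 * Illustrative sketch only, not part of the patch.  Syscall numbers are
 * derived from the i386 syscall_table.S hunk (sys_pfm_write_pmcs carries
 * the "325" comment, so sys_pfm_create_context is 324).  The pfarg_*
 * types and the header below are assumed from the perfmon2 user ABI.
 */
#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <perfmon/perfmon.h>	/* assumed: perfmon2 user headers providing pfarg_* types */

#define __NR_pfm_create_context	324
#define __NR_pfm_write_pmcs	325
#define __NR_pfm_write_pmds	326
#define __NR_pfm_load_context	328
#define __NR_pfm_start		329
#define __NR_pfm_stop		330

int count_self(void)
{
	pfarg_ctx_t ctx;
	pfarg_pmc_t pc[1];
	pfarg_pmd_t pd[1];
	pfarg_load_t load;
	int fd;

	memset(&ctx, 0, sizeof(ctx));
	memset(pc, 0, sizeof(pc));
	memset(pd, 0, sizeof(pd));
	memset(&load, 0, sizeof(load));

	/* create a monitoring context; returns a file descriptor
	 * (sampling-format arguments omitted/assumed here) */
	fd = syscall(__NR_pfm_create_context, &ctx, NULL, NULL, 0);
	if (fd < 0)
		return -1;

	/* program one config register and its counter; the actual
	 * register numbers and event encodings are PMU-specific */
	pc[0].reg_num = 0;
	pd[0].reg_num = 0;
	syscall(__NR_pfm_write_pmcs, fd, pc, 1);
	syscall(__NR_pfm_write_pmds, fd, pd, 1);

	/* attach the context to this thread, start, run the workload, stop */
	load.load_pid = getpid();
	syscall(__NR_pfm_load_context, fd, &load);
	syscall(__NR_pfm_start, fd, NULL);
	/* ... monitored work ... */
	syscall(__NR_pfm_stop, fd);
	return fd;
}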
commit af04b61bf1b16df0a1f2c96d2b02b12bf1c39cdd
Author: Stephane Eranian
Date:   Wed Jun 20 15:14:10 2007 -0500

    Perfmon2: new ia64 architecture code

commit ff7f2d460449a9167c0389ab19d12be2c927aebd
Author: Stephane Eranian
Date:   Thu Jun 21 11:25:24 2007 -0500

    Perfmon2: i386 modifications.

    Modifications to i386 architecture code needed for Perfmon2.

commit dedfc8b61584483c2511c7afba39bed6159ecee2
Author: Stephane Eranian
Date:   Wed Jun 20 15:06:45 2007 -0500

    Perfmon2: new i386 architecture code

commit 9e4067e8ce29fcec3e32a1d89339ada5904eaa7b
Author: Stephane Eranian
Date:   Wed Jun 20 16:04:03 2007 -0500

    Perfmon2 arch-independent modifications.

    Modifications to architecture-independent code needed for Perfmon2.

commit 94b60799f0c2e4a1f320ab9025ccfadc2e57d6be
Author: Stephane Eranian
Date:   Thu Jun 21 11:23:22 2007 -0500

    Perfmon2: new core code

Index: linux-2.6/Makefile
===================================================================
--- linux-2.6.orig/Makefile
+++ linux-2.6/Makefile
@@ -553,7 +553,7 @@ export mod_strip_cmd
 ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ perfmon/
 vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
Index: linux-2.6/arch/i386/Kconfig
===================================================================
--- linux-2.6.orig/arch/i386/Kconfig
+++ linux-2.6/arch/i386/Kconfig
@@ -910,6 +910,8 @@ config COMPAT_VDSO
 	  If unsure, say Y.
+source "arch/i386/perfmon/Kconfig"
+
 endmenu
 config ARCH_ENABLE_MEMORY_HOTPLUG
Index: linux-2.6/arch/i386/Makefile
===================================================================
--- linux-2.6.orig/arch/i386/Makefile
+++ linux-2.6/arch/i386/Makefile
@@ -99,6 +99,7 @@ mflags-y += -Iinclude/asm-i386/mach-defa
 head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o
 libs-y += arch/i386/lib/
+core-$(CONFIG_PERFMON) += arch/i386/perfmon/
 core-y += arch/i386/kernel/ \
 	   arch/i386/mm/ \
 	   arch/i386/$(mcore-y)/ \
Index: linux-2.6/arch/i386/kernel/apic.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/apic.c
+++ linux-2.6/arch/i386/kernel/apic.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -562,6 +563,8 @@ static void local_apic_timer_interrupt(v
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
 	evt->event_handler(evt);
+
+	pfm_handle_switch_timeout();
 }
 /*
@@ -1325,6 +1328,9 @@ void __init apic_intr_init(void)
 #ifdef CONFIG_X86_MCE_P4THERMAL
 	set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+#ifdef CONFIG_PERFMON
+	set_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt);
+#endif
 }
 /**
Index: linux-2.6/arch/i386/kernel/cpu/common.c
===================================================================
--- linux-2.6.orig/arch/i386/kernel/cpu/common.c
+++ linux-2.6/arch/i386/kernel/cpu/common.c
@@ -5,6 +5,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -718,6 +719,8 @@ void __cpuinit cpu_init(void)
 	current_thread_info()->status = 0;
 	clear_used_math();
 	mxcsr_feature_mask_init();
+
+	pfm_init_percpu();
 }
 #ifdef CONFIG_HOTPLUG_CPU
Index: linux-2.6/arch/i386/kernel/entry.S
===================================================================
--- linux-2.6.orig/arch/i386/kernel/entry.S
+++ linux-2.6/arch/i386/kernel/entry.S
@@ -465,7 +465,7 @@ ENDPROC(system_call)
 	ALIGN
 	RING0_PTREGS_FRAME		# can't unwind into user space anyway
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
+	testw
$(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx jz work_notifysig work_resched: call schedule Index: linux-2.6/arch/i386/kernel/process.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/process.c +++ linux-2.6/arch/i386/kernel/process.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -384,6 +385,7 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } + pfm_exit_thread(current); } void flush_thread(void) @@ -435,6 +437,8 @@ int copy_thread(int nr, unsigned long cl savesegment(gs,p->thread.gs); + pfm_copy_thread(p); + tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -538,8 +542,9 @@ int dump_task_regs(struct task_struct *t return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +static noinline void __switch_to_xtra(struct task_struct *prev_p, + struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -555,6 +560,10 @@ static noinline void __switch_to_xtra(st set_debugreg(next->debugreg[7], 7); } + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev_p, next_p); + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { /* * Disable the bitmap via an invalid offset. We still cache @@ -690,8 +699,8 @@ struct task_struct fastcall * __switch_t * Now maybe handle debug registers and/or IO bitmaps */ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); + || (task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW))) + __switch_to_xtra(prev_p, next_p, tss); disable_tsc(prev_p, next_p); Index: linux-2.6/arch/i386/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/signal.c +++ linux-2.6/arch/i386/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -652,6 +653,9 @@ void do_notify_resume(struct pt_regs *re clear_thread_flag(TIF_SINGLESTEP); } + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/i386/kernel/smpboot.c =================================================================== --- linux-2.6.orig/arch/i386/kernel/smpboot.c +++ linux-2.6/arch/i386/kernel/smpboot.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -1207,6 +1208,7 @@ int __cpu_disable(void) cpu_clear(cpu, map); fixup_irqs(map); + pfm_cpu_disable(); /* It's now safe to remove this processor from the online map */ cpu_clear(cpu, cpu_online_map); return 0; Index: linux-2.6/arch/i386/kernel/syscall_table.S =================================================================== --- linux-2.6.orig/arch/i386/kernel/syscall_table.S +++ linux-2.6/arch/i386/kernel/syscall_table.S @@ -323,3 +323,15 @@ ENTRY(sys_call_table) .long sys_signalfd .long sys_timerfd .long sys_eventfd + .long sys_pfm_create_context + .long sys_pfm_write_pmcs /* 325 */ + .long sys_pfm_write_pmds + .long sys_pfm_read_pmds + .long sys_pfm_load_context + .long sys_pfm_start + .long sys_pfm_stop /* 330 */ + .long sys_pfm_restart + .long sys_pfm_create_evtsets + .long sys_pfm_getinfo_evtsets + .long sys_pfm_delete_evtsets + .long 
sys_pfm_unload_context /* 335 */ Index: linux-2.6/arch/i386/oprofile/Makefile =================================================================== --- linux-2.6.orig/arch/i386/oprofile/Makefile +++ linux-2.6/arch/i386/oprofile/Makefile @@ -10,3 +10,4 @@ oprofile-y := $(DRIVER_OBJS) init.o b oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o +oprofile-$(CONFIG_PERFMON) += perfmon.o Index: linux-2.6/arch/i386/oprofile/init.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/init.c +++ linux-2.6/arch/i386/oprofile/init.c @@ -15,9 +15,11 @@ * with the NMI mode driver. */ +extern int op_perfmon_init(struct oprofile_operations * ops); extern int op_nmi_init(struct oprofile_operations * ops); extern int op_nmi_timer_init(struct oprofile_operations * ops); extern void op_nmi_exit(void); +extern void op_perfmon_exit(void); extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); @@ -27,8 +29,12 @@ int __init oprofile_arch_init(struct opr ret = -ENODEV; +#ifdef CONFIG_PERFMON + ret = op_perfmon_init(ops); +#endif #ifdef CONFIG_X86_LOCAL_APIC - ret = op_nmi_init(ops); + if (ret < 0) + ret = op_nmi_init(ops); #endif #ifdef CONFIG_X86_IO_APIC if (ret < 0) @@ -42,6 +48,9 @@ int __init oprofile_arch_init(struct opr void oprofile_arch_exit(void) { +#ifdef CONFIG_PERFMON + op_perfmon_exit(); +#endif #ifdef CONFIG_X86_LOCAL_APIC op_nmi_exit(); #endif Index: linux-2.6/arch/i386/oprofile/nmi_int.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/nmi_int.c +++ linux-2.6/arch/i386/oprofile/nmi_int.c @@ -465,6 +465,7 @@ int __init op_nmi_init(struct oprofile_o ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + ops->implementation = "oprofile"; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } Index: linux-2.6/arch/i386/oprofile/nmi_timer_int.c =================================================================== --- linux-2.6.orig/arch/i386/oprofile/nmi_timer_int.c +++ linux-2.6/arch/i386/oprofile/nmi_timer_int.c @@ -64,6 +64,7 @@ int __init op_nmi_timer_init(struct opro ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; + ops->implementation = "nmi_timer"; printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); return 0; } Index: linux-2.6/arch/i386/oprofile/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/oprofile/perfmon.c @@ -0,0 +1,161 @@ +/** + * @file perfmon.c + * + * @remark Copyright 2003 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + */ + +#include +#include +#include +#include + +static int allow_ints; + +static int +perfmon_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data) +{ + struct pt_regs * const regs = data; + int event = arg->pmd_eventid; + + PFM_DBG_ovfl("oprofile overflow ip=%lx, event=%d", + instruction_pointer(regs), event); + + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + /* the owner of the oprofile event buffer may have exited + * without perfmon being shutdown (e.g. 
SIGSEGV) + */ + if (allow_ints) + oprofile_add_sample(regs, event); + return 0; +} + + +static int perfmon_start(void) +{ + allow_ints = 1; + return 0; +} + + +static void perfmon_stop(void) +{ + allow_ints = 0; +} + +static struct pfm_smpl_fmt oprofile_fmt = { + .fmt_name = "OProfile", + .fmt_handler = perfmon_handler, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +/* all the ops are handled via userspace for i386 oprofile using perfmon */ + +static int using_perfmon; + +static int __init ppro_init(char ** cpu_type) +{ + __u8 cpu_model = boot_cpu_data.x86_model; + + if (cpu_model == 14) + *cpu_type = "i386/core"; + else if (cpu_model == 15) + *cpu_type = "i386/core_2"; + else if (cpu_model > 0xd) + return 0; + else if (cpu_model == 9) { + *cpu_type = "i386/p6_mobile"; + } else if (cpu_model > 5) { + *cpu_type = "i386/piii"; + } else if (cpu_model > 2) { + *cpu_type = "i386/pii"; + } else { + *cpu_type = "i386/ppro"; + } + return 1; +} + +static int __init p4_init(char ** cpu_type) +{ +#ifndef CONFIG_SMP + *cpu_type = "i386/p4"; + return 1; +#else + switch (smp_num_siblings) { + case 1: + *cpu_type = "i386/p4"; + return 1; + + case 2: + *cpu_type = "i386/p4-ht"; + return 1; + } +#endif + return 0; +} + +static char *get_cpu_type(void) +{ + char *cpu_type = "??/??"; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* Needs to be at least an Athlon (or hammer in 32bit mode) */ + switch (boot_cpu_data.x86) { + case 6: + cpu_type = "i386/athlon"; + break; + case 0xf: + /* Actually it could be i386/hammer too, but give + user space a consistent name. */ + cpu_type = "x86-64/hammer"; + break; + } + break; + case X86_VENDOR_INTEL: + switch (boot_cpu_data.x86) { + /* Pentium IV */ + case 0xf: + p4_init(&cpu_type); + break; + + /* A P6-class processor */ + case 6: + ppro_init(&cpu_type); + break; + } + break; + } + return cpu_type; +} + + +int __init op_perfmon_init(struct oprofile_operations * ops) +{ + int ret = pfm_fmt_register(&oprofile_fmt); + if (ret) + return -ENODEV; + + ops->cpu_type = get_cpu_type(); + ops->start = perfmon_start; + ops->stop = perfmon_stop; + ops->implementation = "perfmon2"; + using_perfmon = 1; + printk(KERN_INFO "oprofile: using perfmon.\n"); + return 0; +} + + +void __exit op_perfmon_exit(void) +{ + if (!using_perfmon) + return; + + pfm_fmt_unregister(&oprofile_fmt); +} + Index: linux-2.6/arch/i386/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/Kconfig @@ -0,0 +1,65 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + select X86_LOCAL_APIC + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_P6 + tristate "Support for Intel P6/Pentium M processor hardware performance counters" + depends on PERFMON + default n + help + Enables support for Intel P6-style hardware performance counters. + To be used for with Intel Pentium III, PentiumPro, Pentium M processors. + +config I386_PERFMON_P4 + tristate "Support for 32-bit Intel Pentium 4/Xeon hardware performance counters" + depends on PERFMON + default n + help + Enables support for 32-bit Intel Pentium 4/Xeon hardware performance + counters. 
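A note on the sampling-format hook used by the oprofile glue above: pfm_fmt_register() and pfm_fmt_unregister() take a struct pfm_smpl_fmt whose fmt_handler runs on every counter overflow. Below is a minimal sketch of another built-in format using only the fields and handler signature that appear in arch/i386/oprofile/perfmon.c; the "toy" name, the counting logic, and the linux/perfmon.h include path are illustrative assumptions, not part of the patch.

/* Minimal sketch of a sampling format modeled on the oprofile glue above.
 * Field names and the handler signature mirror arch/i386/oprofile/perfmon.c;
 * the format name and the counting logic are illustrative only. */
#include <linux/module.h>
#include <linux/types.h>
#include <asm/atomic.h>
#include <linux/perfmon.h>	/* assumed location of struct pfm_smpl_fmt */

static atomic_t toy_overflows = ATOMIC_INIT(0);

static int toy_handler(void *buf, struct pfm_ovfl_arg *arg,
		       unsigned long ip, u64 stamp, void *data)
{
	atomic_inc(&toy_overflows);		/* just count overflows */
	arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;	/* re-arm the counter, as the oprofile handler does */
	return 0;
}

static struct pfm_smpl_fmt toy_fmt = {
	.fmt_name    = "toy",
	.fmt_handler = toy_handler,
	.fmt_flags   = PFM_FMT_BUILTIN_FLAG,
	.owner       = THIS_MODULE,
};

static int __init toy_fmt_init(void)
{
	return pfm_fmt_register(&toy_fmt);
}

static void __exit toy_fmt_exit(void)
{
	pfm_fmt_unregister(&toy_fmt);
}

module_init(toy_fmt_init);
module_exit(toy_fmt_exit);
MODULE_LICENSE("GPL");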
+
+config I386_PERFMON_PEBS
+	tristate "Support for Intel Precise Event-Based Sampling (PEBS)"
+	depends on PERFMON
+	default n
+	help
+	Enables support for 32-bit Precise Event-Based Sampling (PEBS) on the Intel
+	Pentium 4, Xeon, and Core-based processors which support it.
+
+config I386_PERFMON_CORE
+	tristate "Support for Intel Core-based performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables 32-bit support for Intel Core-based performance counters. Enable
+	this option to support Intel Core 2 Duo processors.
+
+config I386_PERFMON_INTEL_ARCH
+	tristate "Support for Intel architectural performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables 32-bit support for Intel architectural performance counters. This
+	architecture was introduced by Intel Core Solo/Core Duo processors.
+
+config I386_PERFMON_K8
+	tristate "Support 32-bit mode AMD Athlon64/Opteron64 hardware performance counters"
+	depends on PERFMON
+	default n
+	help
+	Enables support for 32-bit mode AMD Athlon64/Opteron64 hardware performance counters.
+endmenu
+
Index: linux-2.6/arch/i386/perfmon/Makefile
===================================================================
--- /dev/null
+++ linux-2.6/arch/i386/perfmon/Makefile
@@ -0,0 +1,14 @@
+#
+# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+# Contributed by Stephane Eranian
+#
+obj-$(CONFIG_PERFMON) += perfmon.o
+obj-$(CONFIG_PERFMON_P6) += perfmon_p6.o
+obj-$(CONFIG_I386_PERFMON_P4) += perfmon_p4.o
+obj-$(CONFIG_I386_PERFMON_CORE) += perfmon_core.o
+obj-$(CONFIG_I386_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o
+obj-$(CONFIG_I386_PERFMON_PEBS) += perfmon_pebs_smpl.o
+obj-$(CONFIG_I386_PERFMON_K8) += perfmon_k8.o
+
+perfmon_k8-$(subst m,y,$(CONFIG_I386_PERFMON_K8)) += ../../x86_64/perfmon/perfmon_k8.o
+perfmon_core-$(subst m,y,$(CONFIG_I386_PERFMON_CORE)) += ../../x86_64/perfmon/perfmon_core.o
Index: linux-2.6/arch/i386/perfmon/perfmon.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/i386/perfmon/perfmon.c
@@ -0,0 +1,1302 @@
+/*
+ * This file implements the X86 specific support for the perfmon2 interface
+ *
+ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +#include +#include +#include + +DEFINE_PER_CPU(unsigned long, real_iip); + +static int pfm_using_nmi; + +struct pfm_ds_area { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +static int (*pfm_has_ovfl)(struct pfm_context *); +static int (*pfm_stop_save)(struct pfm_context *ctx, + struct pfm_event_set *set); + +static inline int get_smt_id(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return (cpu != first_cpu(cpu_sibling_map[cpu])); +#else + return 0; +#endif +} + +void __pfm_write_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 val) +{ + u64 pmi; + int smt_id; + + smt_id = get_smt_id(); + /* + * HT is only supported by P4-style PMU + * + * Adjust for T1 if necessary: + * + * - move the T0_OS/T0_USR bits into T1 slots + * - move the OVF_PMI_T0 bits into T1 slot + * + * The P4/EM64T T1 is cleared by description table. + * User only works with T0. + */ + if (smt_id) { + if (xreg->reg_type & PFM_REGT_ESCR) { + + /* copy T0_USR & T0_OS to T1 */ + val |= ((val & 0xc) >> 2); + + /* clear bits T0_USR & T0_OS */ + val &= ~0xc; + + } else if (xreg->reg_type & PFM_REGT_CCCR) { + pmi = (val >> 26) & 0x1; + if (pmi) { + val &=~(1UL<<26); + val |= 1UL<<27; + } + } + } + if (xreg->addrs[smt_id]) + wrmsrl(xreg->addrs[smt_id], val); +} + +void __pfm_read_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 *val) +{ + int smt_id; + + smt_id = get_smt_id(); + + if (likely(xreg->addrs[smt_id])) { + rdmsrl(xreg->addrs[smt_id], *val); + /* + * HT is only supported by P4-style PMU + * + * move the Tx_OS and Tx_USR bits into + * T0 slots setting the T1 slots to zero + */ + if (xreg->reg_type & PFM_REGT_ESCR) { + if (smt_id) + *val |= (((*val) & 0x3) << 2); + + /* + * zero out bits that are reserved + * (including T1_OS and T1_USR) + */ + *val &= PFM_ESCR_RSVD; + } + } else + *val = 0; +} + +/* + * called from NMI interrupt handler + */ +static void __kprobes __pfm_arch_quiesce_pmu_percpu(void) +{ + struct pfm_arch_pmu_info *arch_info; + unsigned int i; + + arch_info = pfm_pmu_conf->arch_info; + + /* + * quiesce PMU by clearing registers that have enable bits + * (start/stop capabilities). + */ + for (i = 0; i < arch_info->max_ena; i++) + if (test_bit(i, cast_ulp(arch_info->enable_mask))) + pfm_arch_write_pmc(NULL, i, 0); +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler(). + * ctx may be NULL for spurious interrupts. + * interrupts are masked. + */ +void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + if (ctx == NULL) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + pfm_arch_restore_pmcs(ctx, ctx->active_set); + + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * Caller has already restored all PMD and PMC registers. 
+ * + * must reactivate monitoring + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * reload DS management area pointer. Pointer + * not managed as a PMC thus it is not restored + * with the rest of the registers. + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); + + if (set->npend_ovfls) + __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; +} + +static int pfm_stop_save_p6(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_mask; + u64 val, wmask, ovfl_mask; + u32 i, count; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), pfm_pmu_conf->regs.max_pmc); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) { + __get_cpu_var(pfm_stats).ccnt6++; + return 1; + } + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * Must check for counting PMDs because of virtual PMDs + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +static int pfm_stop_save_amd64(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + return pfm_stop_save_p6(ctx, set); +} + +static int pfm_stop_save_intel_core(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + struct pfm_ds_area *ds; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_mask; + u64 val, wmask, ovfl_mask; + u32 i, count; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * if PEBS used, clear DS area pointer + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, 0); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * used enable pmc bitmask + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), arch_info->max_ena); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. 
+ */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * Must check for counting PMDs because of virtual PMDs + * + * XXX: should use the ovf_status register instead, yet + * we would have to check if NMI is used and fallback + * to individual pmd inspection. + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + + /* + * check for PEBS buffer full and set the corresponding PMD overflow + */ + if (ctx_arch->flags & PFM_X86_USE_PEBS) { + + ds = (struct pfm_ds_area *)ctx_arch->ds_area; + + PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", + ds, + ds->pebs_index, + ds->pebs_intr_thres); + + if (ds->pebs_index >= ds->pebs_intr_thres + && test_bit(arch_info->pebs_ctr_idx, + cast_ulp(set->used_pmds))) { + __set_bit(arch_info->pebs_ctr_idx, + cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +static int pfm_stop_save_p4(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + struct pfm_arch_ext_reg *xrc, *xrd; + u64 used_mask[PFM_PMC_BV]; + u32 i, j, count; + u16 max_pmc; + u64 cccr, ctr1, ctr2, ovfl_mask; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + ctx_arch = pfm_ctx_arch(ctx); + max_pmc = pfm_pmu_conf->regs.max_pmc; + xrc = arch_info->pmc_addrs; + xrd = arch_info->pmd_addrs; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + /* + * build used enable PMC bitmask + * if user did not set any CCCR, then mask is + * empty and there is nothing to do because nothing + * was started + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + count = bitmap_weight(cast_ulp(used_mask), arch_info->max_ena); + + PFM_DBG_ovfl("npend=%u ena_mask=0x%llx u_pmcs=0x%llx count=%u num=%u", + set->npend_ovfls, + (unsigned long long)arch_info->enable_mask[0], + (unsigned long long)set->used_pmcs[0], + count, arch_info->max_ena); + /* + * stop clear DS area pointer + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, 0); + + /* + * ensures we do not destroy pending overflow + * information. If pended interrupts are already + * known, then we just stop monitoring. + */ + if (set->npend_ovfls) { + /* + * clear enable bit + * unfortunately, this is very expensive! + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + __pfm_write_reg_p4(xrc+i, 0); + count--; + } + } + /* need save PMDs at upper level */ + return 1; + } + + /* + * stop monitoring AND collect pending overflow information AND + * save pmds. + * + * We need to access the CCCR twice, once to get overflow info + * and a second to stop monitoring (which destroys the OVF flag) + * Similarly, we need to read the counter twice to check whether + * it did overflow between the CCR read and the CCCR write. 
+ */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + /* + * controlled counter + */ + j = xrc[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(xrc+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(xrd+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(xrc+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(xrd+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. + */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + ctr2 = (set->pmds[j].value & ~ovfl_mask) | (ctr2 & ovfl_mask); + set->pmds[j].value = ctr2; + count--; + } + } + /* + * check for PEBS buffer full and set the corresponding PMD overflow + */ + if (ctx_arch->flags & PFM_X86_USE_PEBS) { + struct pfm_ds_area *ds; + ds = (struct pfm_ds_area *)ctx_arch->ds_area; + PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", ds, ds->pebs_index, ds->pebs_intr_thres); + if (ds->pebs_index >= ds->pebs_intr_thres + && test_bit(arch_info->pebs_ctr_idx, cast_ulp(set->used_pmds))) { + __set_bit(arch_info->pebs_ctr_idx, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + + /* 0 means: no need to save the PMD at higher level */ + return 0; +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + /* + * on x86, masked is equivalent to stopped, thus we have + * nothing to do here + */ + if (task == current) + pfm_stop_save(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * Must stop monitoring, save pending overflow information + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * disable lazy restore of PMCS on ctxswin because + * we modify some of them. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + if (set->npend_ovfls) { + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + ctx_arch->saved_real_iip = __get_cpu_var(real_iip); + } + return pfm_stop_save(ctx, set); +} + +/* + * called from pfm_start() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 *mask; + u16 i, num; + + /* + * pfm_start issue while context is masked as no effect. + * This comes from the fact that on x86, masking and stopping + * use the same mechanism, i.e., clearing the enable bits + * of the PMC registers. + */ + if (ctx->state == PFM_CTX_MASKED) + return; + + /* + * cannot restore PMC if no access to PMU. Will be done + * when the thread is switched back in + */ + if (task != current) + return; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * we must actually install all implemented pmcs registers because + * until started, we do not write any PMC registers. + * Note that registers used by other subsystems (e.g. NMI) are + * removed from pmcs. + * + * The available registers that are actually not used get their default + * value such that counters do not count anything. As such, we can + * afford to write all of them but then stop only the one we use. + * + * XXX: we may be able to optimize this for non-P4 PMU has pmcs are + * independent from each others. + */ + num = pfm_pmu_conf->regs.num_pmcs; + mask = pfm_pmu_conf->regs.pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } + + /* + * reload DS area pointer. + */ + if (ctx_arch->flags & PFM_X86_USE_DS) + wrmsrl(MSR_IA32_DS_AREA, ctx_arch->ds_area); + +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw() + * + * context is locked. Interrupts are masked. Set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore PMD registers + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *used_pmds; + u16 i, num; + + used_pmds = set->used_pmds; + num = set->nused_pmds; + + /* + * we can restore only the PMD we use because: + * - you can only read with pfm_read_pmds() the registers + * declared used via pfm_write_pmds(), smpl_pmds, reset_pmds + * + * - if cr4.pce=1, only counters are exposed to user. No + * address is ever exposed by counters. + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, cast_ulp(used_pmds)))) { + pfm_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 *mask; + u16 i, num; + + /* + * - by default, no PMC measures anything + * - on ctxswout, all used PMCs are disabled (cccr cleared) + * + * we need to restore the PMC (incl enable bits) only if + * not masked and user issued pfm_start() + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * In general, writing MSRs is very expensive, so try to be smart. + * + * P6-style, Core-style: + * - pmc are totally independent of each other, there is + * possible side-effect from stale pmcs. 
Therefore we only + * restore the registers we use + * P4-style: + * - must restore everything because there are some dependencies + * (e.g., ESCR and CCCR) + */ + if (arch_info->pmu_style == PFM_X86_PMU_P4) { + num = pfm_pmu_conf->regs.num_pmcs; + mask = pfm_pmu_conf->regs.pmcs; + } else { + num = set->nused_pmcs; + mask = set->used_pmcs; + } + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } +} + +/* + * invoked only when NMI is used. Called from the LOCAL_PERFMON_VECTOR + * handler to copy P4 overflow state captured when the NMI triggered. + * Given that on P4, stopping monitoring destroy the overflow information + * we save it in pfm_has_ovfl_p4() where monitoring is also stopped. + * + * Here we propagate the overflow state to current active set. The + * freeze_pmu() call we not overwrite this state because npend_ovfls + * is non-zero. + */ +static void pfm_p4_copy_nmi_state(void) +{ + struct pfm_context *ctx; + struct pfm_arch_context *ctx_arch; + struct pfm_event_set *set; + + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) + return; + + ctx_arch = pfm_ctx_arch(ctx); + set = ctx->active_set; + + if (ctx_arch->p4->npend_ovfls) { + set->npend_ovfls = ctx_arch->p4->npend_ovfls; + + bitmap_copy(cast_ulp(set->povfl_pmds), + cast_ulp(ctx_arch->p4->povfl_pmds), + pfm_pmu_conf->regs.max_pmd); + + ctx_arch->p4->npend_ovfls = 0; + } +} + +/* + * The PMU interrupt is handled through an interrupt gate, therefore + * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts. + * + * The perfmon interrupt handler MUST run with interrupts disabled due + * to possible race with other, higher priority interrupts, such as timer + * or IPI function calls. + * + * See description in IA-32 architecture manual, Vol 3 section 5.8.1 + */ +fastcall void smp_pmu_interrupt(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *arch_info; + unsigned long iip; + + ack_APIC_irq(); + + irq_enter(); + + /* + * when using NMI, pfm_handle_nmi() gets called + * first. It stops monitoring and record the + * iip into real_iip, then it repost the interrupt + * using the lower priority vector LOCAL_PERFMON_VECTOR + * + * On P4, due to the difficulty of detecting overflows + * and stoppping the PMU, pfm_handle_nmi() needs to + * record npend_ovfl and ovfl_pmds in ctx_arch. So + * here we simply copy them back to the set. + */ + if (pfm_using_nmi) { + arch_info = pfm_pmu_conf->arch_info; + iip = __get_cpu_var(real_iip); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + pfm_p4_copy_nmi_state(); + } else + iip = instruction_pointer(regs); + + pfm_interrupt_handler(iip, regs); + + /* + * On Intel P6, Pentium M, P4, Intel Core: + * - it is necessary to clear the MASK field for the LVTPC + * vector. Otherwise interrupts remain masked. See + * section 8.5.1 + * AMD X86-64: + * - the documentation does not stipulate the behavior. + * To be safe, we also rewrite the vector to clear the + * mask field + */ + if (cpu_data->x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR); + + irq_exit(); +} + +/* + * detect is counters have overflowed. 
+ * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by AMD K8 and Intel architectural PMU + */ +static int __kprobes pfm_has_ovfl_p6(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_ext_reg *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + num = pfm_pmu_conf->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = arch_info->pmd_addrs; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(xrd[i].addrs[0], val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int __kprobes pfm_has_ovfl_amd64(struct pfm_context *ctx) +{ + return pfm_has_ovfl_p6(ctx); +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel P4 + */ +static int __kprobes pfm_has_ovfl_p4(struct pfm_context *ctx) +{ + struct pfm_arch_ext_reg *xrc, *xrd; + struct pfm_arch_context *ctx_arch; + struct pfm_arch_p4_context *p4; + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 ena_mask[PFM_PMC_BV]; + u64 cccr, ctr1, ctr2; + int n, i, j; + + ctx_arch = pfm_ctx_arch(ctx); + xrc = arch_info->pmc_addrs; + xrd = arch_info->pmd_addrs; + p4 = ctx_arch->p4; + + bitmap_and(cast_ulp(ena_mask), + cast_ulp(pfm_pmu_conf->regs.pmcs), + cast_ulp(arch_info->enable_mask), + arch_info->max_ena); + + n = bitmap_weight(cast_ulp(ena_mask), arch_info->max_ena); + + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + /* + * controlled counter + */ + j = xrc[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(xrc+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(xrd+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(xrc+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(xrd+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. + */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(ctx_arch->p4->povfl_pmds)); + ctx_arch->p4->npend_ovfls++; + } + p4->saved_cccrs[i] = cccr; + n--; + } + /* + * if there was no overflow, then it means the NMI was not really + * for us, so we have to resume monitoring + */ + if (unlikely(!ctx_arch->p4->npend_ovfls)) { + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + __pfm_write_reg_p4(xrc+i, ctx_arch->p4->saved_cccrs[i]); + } + } + return 0; +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel Core-based processors + */ +static int __kprobes pfm_has_ovfl_intel_core(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_ext_reg *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + num = pfm_pmu_conf->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = arch_info->pmd_addrs; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(xrd[i].addrs[0], val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/* + * called from notify_die() notifier from an trap handler path. We only + * care about NMI related callbacks, and ignore everything else. 
+ * + * Cannot grab any locks, include the perfmon context lock + * + * Must detect if NMI interrupt comes from perfmon, and if so it must + * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt + * handler needs to grab the context lock, thus is cannot be run directly + * from the NMI interrupt call path. + */ +static int __kprobes pfm_handle_nmi(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pfm_context *ctx; + + /* + * only NMI related calls + */ + if (val != DIE_NMI_IPI) + return NOTIFY_DONE; + + /* + * perfmon not active on this processor + */ + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) { + PFM_DBG_ovfl("ctx NULL"); + return NOTIFY_DONE; + } + + /* + * detect if we have overflows, i.e., NMI interrupt + * caused by PMU + */ + if (!pfm_has_ovfl(ctx)) { + PFM_DBG_ovfl("no ovfl"); + return NOTIFY_DONE; + } + + /* + * we stop the PMU to avoid further overflow before this + * one is treated by lower priority interrupt handler + */ + __pfm_arch_quiesce_pmu_percpu(); + + /* + * record actual instruction pointer + */ + __get_cpu_var(real_iip) = instruction_pointer(args->regs); + + /* + * post lower priority interrupt (LOCAL_PERFMON_VECTOR) + */ + pfm_arch_resend_irq(); + + __get_cpu_var(pfm_stats).ovfl_intr_nmi_count++; + + /* + * we need to rewrite the APIC vector on Intel + */ + if (cpu_data->x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * the notification was for us + */ + return NOTIFY_STOP; +} + +static struct notifier_block pfm_nmi_nb={ + .notifier_call = pfm_handle_nmi +}; + +/* + * called from pfm_register_pmu_config() after the new + * config has been validated. The pfm_session_lock + * is held. + * + * return: + * < 0 : if error + * 0 : if success + */ +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + struct pfm_arch_pmu_info *arch_info = cfg->arch_info; + + /* + * ensure that PMU description is able to deal with NMI watchdog using + * the performance counters + */ + if ( nmi_watchdog == NMI_LOCAL_APIC + && !(arch_info->flags & PFM_X86_FL_USE_NMI)) { + PFM_INFO("NMI watchdog uses counters, PMU module cannot handle"); + return -EINVAL; + } + + /* + * adust stop routine based on PMU model + * + * P6 : P6, Pentium M, AMD K8, Intel architectural perfmon + * P4 : Xeon, EM64T, P4 + * Core: Core 2, + */ + switch(arch_info->pmu_style) { + case PFM_X86_PMU_P4: + pfm_stop_save = pfm_stop_save_p4; + pfm_has_ovfl = pfm_has_ovfl_p4; + break; + case PFM_X86_PMU_P6: + pfm_stop_save = pfm_stop_save_p6; + pfm_has_ovfl = pfm_has_ovfl_p6; + break; + case PFM_X86_PMU_CORE: + pfm_stop_save = pfm_stop_save_intel_core; + pfm_has_ovfl = pfm_has_ovfl_intel_core; + break; + case PFM_X86_PMU_AMD64: + pfm_stop_save = pfm_stop_save_amd64; + pfm_has_ovfl = pfm_has_ovfl_amd64; + break; + default: + PFM_INFO("unknown pmu_style=%d", arch_info->pmu_style); + return -EINVAL; + } + + /* + * determine interrupt type to use + */ + if (arch_info->flags & PFM_X86_FL_USE_NMI) { + register_die_notifier(&pfm_nmi_nb); + PFM_INFO("intr_type=NMI"); + pfm_using_nmi = 1; + } else { + PFM_INFO("intr_type=regular"); + } + return 0; +} + +void pfm_arch_pmu_config_remove(void) +{ + if (pfm_using_nmi) + unregister_die_notifier(&pfm_nmi_nb); + + pfm_using_nmi = 0; +} + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(cpu_data->x86) { + case 6: + switch(cpu_data->x86_model) { + case 3: /* Pentium II */ + case 7 ... 
11: + case 13: + return "perfmon_p6"; + case 15: + return "perfmon_core"; + default: + goto try_arch; + } + case 15: + case 16: + /* All Opteron processors */ + if (cpu_data->x86_vendor == X86_VENDOR_AMD) + return "perfmon_k8"; + + switch(cpu_data->x86_model) { + case 0 ... 6: + return "perfmon_p4"; + } + /* FALL THROUGH */ + default: +try_arch: + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) + return "perfmon_intel_arch"; + return NULL; + } + return NULL; +} + +void pfm_arch_resend_irq(void) +{ + unsigned long val, dest; + /* + * we cannot use hw_resend_irq() because it goes to + * the I/O APIC. We need to go to the Local APIC. + * + * The "int vec" is not the right solution either + * because it triggers a software intr. We need + * to regenerate the interrupt and have it pended + * until we unmask interrupts. + * + * Instead we send ourself an IPI on the perfmon + * vector. + */ + val = APIC_DEST_SELF|APIC_INT_ASSERT| + APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; + + dest = apic_read(APIC_ID); + apic_write(APIC_ICR2, dest); + apic_write(APIC_ICR, val); +} + +DEFINE_PER_CPU(unsigned long, saved_lvtpc); + +static void pfm_arch_pmu_acquire_percpu(void *data) +{ + int vec; + + __get_cpu_var(saved_lvtpc) = apic_read(APIC_LVTPC); + + vec = pfm_using_nmi ? APIC_DM_NMI : LOCAL_PERFMON_VECTOR; + apic_write(APIC_LVTPC, vec); + + PFM_DBG("LTVPC=0x%lx saved=0x%lx", + (unsigned long)apic_read(APIC_LVTPC), + (unsigned long)__get_cpu_var(saved_lvtpc)); +} + +static void pfm_arch_pmu_release_percpu(void *data) +{ + PFM_DBG("restoring LVTPC=0x%lx", __get_cpu_var(saved_lvtpc)); + apic_write(APIC_LVTPC, __get_cpu_var(saved_lvtpc)); +} + + +/* + * called from pfm_acquire_pmu() with + * pfm_pmu_conf.regs copied from pfm_pmu_conf.full_regs + * needs to adjust regs to match current PMU availabilityy + * + * Caller does recalculate all max/num/first limits on the + * pfm_pmu_conf.regs structure. + * + * interrupts are not masked + * + * + * XXX: until reserve_*_nmi() get fixed by Bjorn to work + * correctly whenever the NMI watchdog is not used. We skip + * the allocation. Yet we do the percpu initialization. 
+ */ +int pfm_arch_pmu_acquire(void) +{ + struct pfm_arch_pmu_info *arch_info; + struct pfm_regmap_desc *d; + struct pfm_arch_ext_reg *pc; + u16 i, n, ena = 0; + + arch_info = pfm_pmu_conf->arch_info; + pc = arch_info->pmc_addrs; + + bitmap_zero(cast_ulp(arch_info->enable_mask), PFM_MAX_PMCS); + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs.num_pmcs; + for(i=0; n; i++, d++) { + /* + * skip not implemented registers (including those + * already removed by the module) + */ + if (!(d->type & PFM_REG_I)) + continue; + + n--; + + if (d->type & PFM_REG_V) + continue; + + /* + * reserve register with lower-level allocator + */ + if (!reserve_evntsel(d->hw_addr)) { + PFM_DBG("pmc%d (%s) in use elsewhere, disabling", i, d->desc); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.pmcs)); + } else { + if (pc[i].reg_type & PFM_REGT_EN) { + __set_bit(i, cast_ulp(arch_info->enable_mask)); + ena++; + arch_info->max_ena = i + 1; + } + } + } + + PFM_DBG("%u PMCs with enable capability", ena); + if (!ena) { + PFM_INFO("no registers with start/stop capability," + "try rebooting with nmi_watchdog=0"); + goto undo; + } + + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs.num_pmds; + for(i=0; n; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + n--; + + if (d->type & PFM_REG_V) + continue; + + if (!reserve_perfctr(d->hw_addr)) { + PFM_DBG("pmd%d (%s) in use elsewhere, disabling", i, d->desc); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.pmds)); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.cnt_pmds)); + __clear_bit(i, cast_ulp(pfm_pmu_conf->regs.rw_pmds)); + } + } + + /* + * program APIC + */ + on_each_cpu(pfm_arch_pmu_acquire_percpu, NULL, 0, 1); + + return 0; +undo: + pfm_pmu_conf->regs = pfm_pmu_conf->full_regs; + return -EBUSY; +} + +/* + * called from pfm_pmu_release() + * interrupts are not masked + */ +void pfm_arch_pmu_release(void) +{ + struct pfm_regmap_desc *d; + u16 i, n; + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs.num_pmcs; + for(i=0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmcs))) + continue; + release_evntsel(d->hw_addr); + n--; + PFM_DBG("pmc%u released", i); + } + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs.num_pmds; + for(i=0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs.pmds))) + continue; + release_perfctr(d->hw_addr); + n--; + PFM_DBG("pmd%u released", i); + } + on_each_cpu(pfm_arch_pmu_release_percpu, NULL, 0, 1); +} Index: linux-2.6/arch/i386/perfmon/perfmon_gen_ia32.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_gen_ia32.c @@ -0,0 +1,290 @@ +/* + * This file contains the IA-32 architectural perfmon register description tables. + * + * The IA-32 architectural perfmon (PMU) was introduced with Intel Core Solo + * and Core Duo processors. + * + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Generic IA-32 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +module_param(force, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_GEN_IA32_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_GEN_IA32_PMC_VAL (1ULL<<20) +#define PFM_GEN_IA32_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR starts at 0xc1 & occupy a contiguous block of MSR addr + * IA32_PERFEVTSELx MSR starts at 0x186 & occupy a contiguous block of MSR addr + */ +#define MSR_GEN_PERFEVTSEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 + +#define PFM_GEN_IA32_SEL(n) { \ + .addrs[0] = MSR_GEN_PERFEVTSEL_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_EN} + +#define PFM_GEN_IA32_CTR(n) { \ + .addrs[0] = MSR_GEN_PMC_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_CTR} + +struct pmu_eax { + unsigned int version:8; + unsigned int num_cnt:8; + unsigned int cnt_width:8; + unsigned int ebx_length:8; +}; + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_gen_ia32_pmu_info={ + .pmc_addrs = { + PFM_GEN_IA32_SEL(0) , PFM_GEN_IA32_SEL(1), PFM_GEN_IA32_SEL(2), PFM_GEN_IA32_SEL(3), + PFM_GEN_IA32_SEL(4) , PFM_GEN_IA32_SEL(5), PFM_GEN_IA32_SEL(6), PFM_GEN_IA32_SEL(7), + PFM_GEN_IA32_SEL(8) , PFM_GEN_IA32_SEL(9), PFM_GEN_IA32_SEL(10), PFM_GEN_IA32_SEL(11), + PFM_GEN_IA32_SEL(12), PFM_GEN_IA32_SEL(13), PFM_GEN_IA32_SEL(14), PFM_GEN_IA32_SEL(15), + PFM_GEN_IA32_SEL(16), PFM_GEN_IA32_SEL(17), PFM_GEN_IA32_SEL(18), PFM_GEN_IA32_SEL(19), + PFM_GEN_IA32_SEL(20), PFM_GEN_IA32_SEL(21), PFM_GEN_IA32_SEL(22), PFM_GEN_IA32_SEL(23), + PFM_GEN_IA32_SEL(24), PFM_GEN_IA32_SEL(25), PFM_GEN_IA32_SEL(26), PFM_GEN_IA32_SEL(27), + PFM_GEN_IA32_SEL(28), PFM_GEN_IA32_SEL(29), PFM_GEN_IA32_SEL(30), PFM_GEN_IA32_SEL(31) + }, + .pmd_addrs = { + PFM_GEN_IA32_CTR(0) , PFM_GEN_IA32_CTR(1), PFM_GEN_IA32_CTR(2), PFM_GEN_IA32_CTR(3), + PFM_GEN_IA32_CTR(4) , PFM_GEN_IA32_CTR(5), PFM_GEN_IA32_CTR(6), PFM_GEN_IA32_CTR(7), + PFM_GEN_IA32_CTR(8) , PFM_GEN_IA32_CTR(9), PFM_GEN_IA32_CTR(10), PFM_GEN_IA32_CTR(11), + PFM_GEN_IA32_CTR(12), PFM_GEN_IA32_CTR(13), PFM_GEN_IA32_CTR(14), PFM_GEN_IA32_CTR(15), + PFM_GEN_IA32_CTR(16), PFM_GEN_IA32_CTR(17), PFM_GEN_IA32_CTR(18), PFM_GEN_IA32_CTR(19), + PFM_GEN_IA32_CTR(20), PFM_GEN_IA32_CTR(21), PFM_GEN_IA32_CTR(22), PFM_GEN_IA32_CTR(23), + PFM_GEN_IA32_CTR(24), PFM_GEN_IA32_CTR(25), PFM_GEN_IA32_CTR(26), PFM_GEN_IA32_CTR(27), + PFM_GEN_IA32_CTR(28), PFM_GEN_IA32_CTR(29), PFM_GEN_IA32_CTR(30), PFM_GEN_IA32_CTR(31) + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +#define PFM_GEN_IA32_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_GEN_IA32_PMC_VAL, \ + .rsvd_msk = PFM_GEN_IA32_PMC_RSVD, \ + .no_emul64_msk = PFM_GEN_IA32_NO64, \ + .hw_addr = MSR_GEN_PERFEVTSEL_BASE+(n) \ + } + +#define PFM_GEN_IA32_D(n) { \ + .type = 
PFM_REG_C, \ + .desc = "PMC"#n, \ + .dfl_val = 0, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = MSR_GEN_PMC_BASE+(n) \ + } + +static struct pfm_reg_desc pfm_gen_ia32_pmc_desc[]={ +/* pmc0 */ PFM_GEN_IA32_C(0), PFM_GEN_IA32_C(1), PFM_GEN_IA32_C(2), PFM_GEN_IA32_C(3), +/* pmc4 */ PFM_GEN_IA32_C(4), PFM_GEN_IA32_C(5), PFM_GEN_IA32_C(6), PFM_GEN_IA32_C(7), +/* pmc8 */ PFM_GEN_IA32_C(8), PFM_GEN_IA32_C(9), PFM_GEN_IA32_C(10), PFM_GEN_IA32_C(11), +/* pmc12 */ PFM_GEN_IA32_C(12), PFM_GEN_IA32_C(13), PFM_GEN_IA32_C(14), PFM_GEN_IA32_C(15), +/* pmc16 */ PFM_GEN_IA32_C(16), PFM_GEN_IA32_C(17), PFM_GEN_IA32_C(18), PFM_GEN_IA32_C(19), +/* pmc20 */ PFM_GEN_IA32_C(20), PFM_GEN_IA32_C(21), PFM_GEN_IA32_C(22), PFM_GEN_IA32_C(23), +/* pmc24 */ PFM_GEN_IA32_C(24), PFM_GEN_IA32_C(25), PFM_GEN_IA32_C(26), PFM_GEN_IA32_C(27), +/* pmc28 */ PFM_GEN_IA32_C(28), PFM_GEN_IA32_C(29), PFM_GEN_IA32_C(30), PFM_GEN_IA32_C(31) +}; + +static struct pfm_reg_desc pfm_gen_ia32_pmd_desc[]={ +/* pmd0 */ PFM_GEN_IA32_D(0), PFM_GEN_IA32_D(1), PFM_GEN_IA32_D(2), PFM_GEN_IA32_D(3), +/* pmd4 */ PFM_GEN_IA32_D(4), PFM_GEN_IA32_D(5), PFM_GEN_IA32_D(6), PFM_GEN_IA32_D(7), +/* pmd8 */ PFM_GEN_IA32_D(8), PFM_GEN_IA32_D(9), PFM_GEN_IA32_D(10), PFM_GEN_IA32_D(11), +/* pmd12 */ PFM_GEN_IA32_D(12), PFM_GEN_IA32_D(13), PFM_GEN_IA32_D(14), PFM_GEN_IA32_D(15), +/* pmd16 */ PFM_GEN_IA32_D(16), PFM_GEN_IA32_D(17), PFM_GEN_IA32_D(18), PFM_GEN_IA32_D(19), +/* pmd20 */ PFM_GEN_IA32_D(20), PFM_GEN_IA32_D(21), PFM_GEN_IA32_D(22), PFM_GEN_IA32_D(23), +/* pmd24 */ PFM_GEN_IA32_D(24), PFM_GEN_IA32_D(25), PFM_GEN_IA32_D(26), PFM_GEN_IA32_D(27), +/* pmd28 */ PFM_GEN_IA32_D(28), PFM_GEN_IA32_D(29), PFM_GEN_IA32_D(30), PFM_GEN_IA32_D(31) +}; +#define PFM_GEN_IA32_MAX_PMCS ARRAY_SIZE(pfm_gen_ia32_pmc_desc) + +#define MSR_IA32_MISC_ENABLE_PERF_AVAIL (1<<7) /* read-only status bit */ + +static struct pfm_pmu_config pfm_gen_ia32_pmu_conf; + +static int pfm_gen_ia32_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + } eax; + unsigned int ebx, ecx, edx; + unsigned int num_cnt; + + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + /* + * ensure CPUID instruction exists + */ + if (cpu_data->x86 < 5) { + PFM_INFO("processor family too old"); + return -1; + } + + if (force == 0) { + /* + * check if CPU supports 0xa function of CPUID + * 0xa started with Core Duo/Solo. 
Needed to detect if + * architectural PMU is present + */ + cpuid(0x0, &eax.val, &ebx, &ecx, &edx); + if (eax.val < 0xa) { + PFM_INFO("CPUID 0xa function not supported\n"); + return -1; + } + + cpuid(0xa, &eax.val, &ebx, &ecx, &edx); + if (eax.eax.version < 1) { + PFM_INFO("architectural perfmon not supported\n"); + return -1; + } + + /* + * ensure that when all modules are linked in, we pick the right + * one for Intel Core-based processors, as they accept architectural + * perfmon, but implement extensions which are only visible with + * perfmon_core module + */ + if (cpu_data->x86 == 6 && cpu_data->x86_model == 15) { + PFM_INFO("use perfmon_core for Core-based processors"); + return -1; + } + } else { + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 31; + } + + num_cnt = eax.eax.num_cnt; + + /* + * sanity check number of counters + */ + if (num_cnt == 0 || num_cnt >= PFM_MAX_HW_PMCS) { + PFM_INFO("invalid number of counters %u\n", eax.eax.num_cnt); + return -1; + } + /* + * instead of dynamically generating the description table + * and MSR addresses, we have a default description with a reasonably + * large number of counters (32). We believe this is plenty for quite + * some time. This allows us to have a much simpler probing and + * initialization routine, especially because we have no dynamic + * allocation, in particular for the counter names. + * + * When HW supports more than what we have prepared for, then we limit + * the number of counters we support and print a message. + */ + if (num_cnt >= PFM_GEN_IA32_MAX_PMCS) { + printk(KERN_INFO "perfmon: Limiting number of counters to %zu, " + "HW supports %u", PFM_GEN_IA32_MAX_PMCS, num_cnt); + num_cnt = PFM_GEN_IA32_MAX_PMCS; + } + + if (eax.eax.cnt_width > 63) { + PFM_INFO("invalid counter width %u\n", eax.eax.cnt_width); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, unsupported"); + return -1; + } + + if (nmi_watchdog == NMI_LOCAL_APIC) { + PFM_INFO("NMI watchdog using PERFEVTSEL0/PERFCTR0, disabling them for perfmon"); + pfm_gen_ia32_pmc_desc[0].type = PFM_REG_NA; + pfm_gen_ia32_pmd_desc[0].type = PFM_REG_NA; + pfm_gen_ia32_pmu_info.pmc_addrs[0].reg_type = PFM_REGT_NA; + pfm_gen_ia32_pmu_info.pmd_addrs[0].reg_type = PFM_REGT_NA; + } + pfm_gen_ia32_pmu_conf.num_pmc_entries = num_cnt; + pfm_gen_ia32_pmu_conf.num_pmd_entries = num_cnt; + + return 0; +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to. Bits [32-(w-1)] + * are sign extensions of bit 31. As such the effective width of + * a counter is 31 bits only. + * See IA-32 Intel Architecture Software developer manual Vol 3B: + * system programming and section 18.17.2 in particular.
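[Editorial aside, not part of the patch: a toy model of the sign-extension behaviour described above, showing why counter_width is set to 31 in the configuration that follows. The 40-bit implemented width and the helper name are assumptions of this illustration.]

    #include <stdio.h>
    #include <stdint.h>

    /* toy model: a write to a 40-bit counter only takes the low 32 bits
     * and replicates bit 31 into bits 32-39 */
    static uint64_t ctr_write(uint64_t v)
    {
            uint64_t low = v & 0xffffffffULL;
            if (low & (1ULL << 31))
                    low |= 0xffULL << 32;
            return low;
    }

    int main(void)
    {
            /* 0x7fffffff is stored as-is; 0x80000000 becomes 0xff80000000,
             * i.e. within 2^31 of the 40-bit overflow point. Values with
             * bit 31 set are therefore unusable as counts, hence the
             * effective width of 31 bits. */
            printf("%#llx -> %#llx\n", 0x7fffffffULL,
                   (unsigned long long)ctr_write(0x7fffffffULL));
            printf("%#llx -> %#llx\n", 0x80000000ULL,
                   (unsigned long long)ctr_write(0x80000000ULL));
            return 0;
    }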
+ */ +static struct pfm_pmu_config pfm_gen_ia32_pmu_conf={ + .pmu_name = "Intel architectural", + .pmd_desc = pfm_gen_ia32_pmd_desc, + .counter_width = 31, + .pmc_desc = pfm_gen_ia32_pmc_desc, + .probe_pmu = pfm_gen_ia32_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_gen_ia32_pmu_info +}; + +static int __init pfm_gen_ia32_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_gen_ia32_pmu_conf); +} + +static void __exit pfm_gen_ia32_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_gen_ia32_pmu_conf); +} + +module_init(pfm_gen_ia32_pmu_init_module); +module_exit(pfm_gen_ia32_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_intel_arch.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_intel_arch.c @@ -0,0 +1,261 @@ +/* + * This file contains the Intel architectural perfmon register v1 + * description tables. + * + * Architectural perfmon was introduced with Intel Core Solo and + * Core Duo processors. + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel architectural perfmon v1"); +MODULE_LICENSE("GPL"); + +static int force, force_nmi; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force, bool, 0600); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_INTEL_ARCH_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_INTEL_ARCH_PMC_VAL (1ULL<<20) +#define PFM_INTEL_ARCH_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR : starts at 0xc1 & occupy a contiguous block of MSR + * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR + */ +#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 + +#define PFM_INTEL_ARCH_SEL(n) { \ + .addrs[0] = MSR_GEN_SEL_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_EN} + +#define PFM_INTEL_ARCH_CTR(n) { \ + .addrs[0] = MSR_GEN_PMC_BASE+(n), \ + .addrs[1] = 0, \ + .ctr = n, \ + .reg_type = PFM_REGT_CTR} + +struct pmu_eax { + unsigned int version:8; + unsigned int num_cnt:8; /* up to 256 counters? 
*/ + unsigned int cnt_width:8; + unsigned int ebx_length:8; +}; + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info={ + .pmc_addrs = { + PFM_INTEL_ARCH_SEL(0) , PFM_INTEL_ARCH_SEL(1), PFM_INTEL_ARCH_SEL(2), PFM_INTEL_ARCH_SEL(3), + PFM_INTEL_ARCH_SEL(4) , PFM_INTEL_ARCH_SEL(5), PFM_INTEL_ARCH_SEL(6), PFM_INTEL_ARCH_SEL(7), + PFM_INTEL_ARCH_SEL(8) , PFM_INTEL_ARCH_SEL(9), PFM_INTEL_ARCH_SEL(10), PFM_INTEL_ARCH_SEL(11), + PFM_INTEL_ARCH_SEL(12), PFM_INTEL_ARCH_SEL(13), PFM_INTEL_ARCH_SEL(14), PFM_INTEL_ARCH_SEL(15), + PFM_INTEL_ARCH_SEL(16), PFM_INTEL_ARCH_SEL(17), PFM_INTEL_ARCH_SEL(18), PFM_INTEL_ARCH_SEL(19), + PFM_INTEL_ARCH_SEL(20), PFM_INTEL_ARCH_SEL(21), PFM_INTEL_ARCH_SEL(22), PFM_INTEL_ARCH_SEL(23), + PFM_INTEL_ARCH_SEL(24), PFM_INTEL_ARCH_SEL(25), PFM_INTEL_ARCH_SEL(26), PFM_INTEL_ARCH_SEL(27), + PFM_INTEL_ARCH_SEL(28), PFM_INTEL_ARCH_SEL(29), PFM_INTEL_ARCH_SEL(30), PFM_INTEL_ARCH_SEL(31) + }, + .pmd_addrs = { + PFM_INTEL_ARCH_CTR(0) , PFM_INTEL_ARCH_CTR(1), PFM_INTEL_ARCH_CTR(2), PFM_INTEL_ARCH_CTR(3), + PFM_INTEL_ARCH_CTR(4) , PFM_INTEL_ARCH_CTR(5), PFM_INTEL_ARCH_CTR(6), PFM_INTEL_ARCH_CTR(7), + PFM_INTEL_ARCH_CTR(8) , PFM_INTEL_ARCH_CTR(9), PFM_INTEL_ARCH_CTR(10), PFM_INTEL_ARCH_CTR(11), + PFM_INTEL_ARCH_CTR(12), PFM_INTEL_ARCH_CTR(13), PFM_INTEL_ARCH_CTR(14), PFM_INTEL_ARCH_CTR(15), + PFM_INTEL_ARCH_CTR(16), PFM_INTEL_ARCH_CTR(17), PFM_INTEL_ARCH_CTR(18), PFM_INTEL_ARCH_CTR(19), + PFM_INTEL_ARCH_CTR(20), PFM_INTEL_ARCH_CTR(21), PFM_INTEL_ARCH_CTR(22), PFM_INTEL_ARCH_CTR(23), + PFM_INTEL_ARCH_CTR(24), PFM_INTEL_ARCH_CTR(25), PFM_INTEL_ARCH_CTR(26), PFM_INTEL_ARCH_CTR(27), + PFM_INTEL_ARCH_CTR(28), PFM_INTEL_ARCH_CTR(29), PFM_INTEL_ARCH_CTR(30), PFM_INTEL_ARCH_CTR(31) + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +#define PFM_INTEL_ARCH_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_INTEL_ARCH_PMC_VAL, \ + .rsvd_msk = PFM_INTEL_ARCH_PMC_RSVD, \ + .no_emul64_msk = PFM_INTEL_ARCH_NO64, \ + .hw_addr = MSR_GEN_SEL_BASE+(n) \ + } + +#define PFM_INTEL_ARCH_D(n) { \ + .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .dfl_val = 0, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = MSR_GEN_PMC_BASE+(n) \ + } + +static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[]={ +/* pmc0 */ PFM_INTEL_ARCH_C(0), PFM_INTEL_ARCH_C(1), PFM_INTEL_ARCH_C(2), PFM_INTEL_ARCH_C(3), +/* pmc4 */ PFM_INTEL_ARCH_C(4), PFM_INTEL_ARCH_C(5), PFM_INTEL_ARCH_C(6), PFM_INTEL_ARCH_C(7), +/* pmc8 */ PFM_INTEL_ARCH_C(8), PFM_INTEL_ARCH_C(9), PFM_INTEL_ARCH_C(10), PFM_INTEL_ARCH_C(11), +/* pmc12 */ PFM_INTEL_ARCH_C(12), PFM_INTEL_ARCH_C(13), PFM_INTEL_ARCH_C(14), PFM_INTEL_ARCH_C(15), +/* pmc16 */ PFM_INTEL_ARCH_C(16), PFM_INTEL_ARCH_C(17), PFM_INTEL_ARCH_C(18), PFM_INTEL_ARCH_C(19), +/* pmc20 */ PFM_INTEL_ARCH_C(20), PFM_INTEL_ARCH_C(21), PFM_INTEL_ARCH_C(22), PFM_INTEL_ARCH_C(23), +/* pmc24 */ PFM_INTEL_ARCH_C(24), PFM_INTEL_ARCH_C(25), PFM_INTEL_ARCH_C(26), PFM_INTEL_ARCH_C(27), +/* pmc28 */ PFM_INTEL_ARCH_C(28), PFM_INTEL_ARCH_C(29), PFM_INTEL_ARCH_C(30), PFM_INTEL_ARCH_C(31) +}; + +static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[]={ +/* pmd0 */ PFM_INTEL_ARCH_D(0), PFM_INTEL_ARCH_D(1), PFM_INTEL_ARCH_D(2), PFM_INTEL_ARCH_D(3), +/* pmd4 */ PFM_INTEL_ARCH_D(4), PFM_INTEL_ARCH_D(5), PFM_INTEL_ARCH_D(6), PFM_INTEL_ARCH_D(7), +/* pmd8 */ PFM_INTEL_ARCH_D(8), PFM_INTEL_ARCH_D(9), PFM_INTEL_ARCH_D(10), PFM_INTEL_ARCH_D(11), +/* pmd12 */ PFM_INTEL_ARCH_D(12), PFM_INTEL_ARCH_D(13), PFM_INTEL_ARCH_D(14), 
PFM_INTEL_ARCH_D(15), +/* pmd16 */ PFM_INTEL_ARCH_D(16), PFM_INTEL_ARCH_D(17), PFM_INTEL_ARCH_D(18), PFM_INTEL_ARCH_D(19), +/* pmd20 */ PFM_INTEL_ARCH_D(20), PFM_INTEL_ARCH_D(21), PFM_INTEL_ARCH_D(22), PFM_INTEL_ARCH_D(23), +/* pmd24 */ PFM_INTEL_ARCH_D(24), PFM_INTEL_ARCH_D(25), PFM_INTEL_ARCH_D(26), PFM_INTEL_ARCH_D(27), +/* pmd28 */ PFM_INTEL_ARCH_D(28), PFM_INTEL_ARCH_D(29), PFM_INTEL_ARCH_D(30), PFM_INTEL_ARCH_D(31) +}; +#define PFM_INTEL_ARCH_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) + +static struct pfm_pmu_config pfm_intel_arch_pmu_conf; + + +static int pfm_intel_arch_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + } eax; + unsigned int ebx, ecx, edx; + unsigned int num_cnt; + + if (!cpu_has_arch_perfmon) { + PFM_INFO("no support for Intel architectural PMU"); + return -1; + } + + if (force == 0) { + cpuid(0xa, &eax.val, &ebx, &ecx, &edx); + } else { + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 31; + } + + /* number of counters */ + num_cnt = eax.eax.num_cnt; + + /* + * sanity check number of counters + */ + if (num_cnt == 0 || num_cnt >= PFM_MAX_PMCS) { + PFM_INFO("invalid number of counters %u\n", eax.eax.num_cnt); + return -1; + } + /* + * instead of dynamically generating the description table + * and MSR addresses, we have a default description with a reasonably + * large number of counters (32). We believe this is plenty for quite + * some time. This allows us to have a much simpler probing and + * initialization routine, especially because we have no dynamic + * allocation. + * + * When HW supports more that what we prepared for, then we limit + * the number of counters we support and print a message. + */ + if (num_cnt >= PFM_INTEL_ARCH_MAX_PMCS) { + printk(KERN_INFO "perfmon: Limiting number of counters to %zu," + "HW supports %u", PFM_INTEL_ARCH_MAX_PMCS, num_cnt); + num_cnt = PFM_INTEL_ARCH_MAX_PMCS; + } + + if (eax.eax.cnt_width > 63) { + PFM_INFO("invalid counter width %u\n", eax.eax.cnt_width); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic"); + return -1; + } + + pfm_intel_arch_pmu_conf.num_pmc_entries = num_cnt; + pfm_intel_arch_pmu_conf.num_pmd_entries = num_cnt; + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * NMI using PMU? + * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to. bits [w-32] + * are sign extensions of bit 31. As such the effective width of + * a counter is 31 bits only. 
+ * See IA-32 Intel Architecture Software developer manual Vol 3B + */ +static struct pfm_pmu_config pfm_intel_arch_pmu_conf={ + .pmu_name = "Intel architectural v1", + .pmd_desc = pfm_intel_arch_pmd_desc, + .counter_width = 31, + .pmc_desc = pfm_intel_arch_pmc_desc, + .probe_pmu = pfm_intel_arch_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_intel_arch_pmu_info +}; + +static int __init pfm_intel_arch_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_intel_arch_pmu_conf); +} + +static void __exit pfm_intel_arch_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_intel_arch_pmu_conf); +} + +module_init(pfm_intel_arch_pmu_init_module); +module_exit(pfm_intel_arch_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_p4.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_p4.c @@ -0,0 +1,414 @@ +/* + * This file contains the P4/Xeon PMU register description tables + * for both 32 and 64 bit modes. + * + * Copyright (c) 2005 Intel Corporation + * Contributed by Bryan Wilkerson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Bryan Wilkerson "); +MODULE_DESCRIPTION("P4/Xeon/EM64T PMU description table"); +MODULE_LICENSE("GPL"); + +static int force; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +module_param(force, bool, 0600); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * CCCR default value: + * - OVF_PMI_T0=1 (bit 26) + * - OVF_PMI_T1=0 (bit 27) (set if necessary in pfm_write_reg()) + * - all other bits are zero + * + * OVF_PMI is forced to zero if PFM_REGFL_NO_EMUL64 is set on CCCR + */ +#define PFM_CCCR_DFL (1ULL<<26) | (3ULL<<16) + +/* + * CCCR reserved fields: + * - bits 0-11, 25-29, 31-63 + * - OVF_PMI (26-27), override with REGFL_NO_EMUL64 + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CCCR_RSVD ~((0xfull<<12) \ + | (0x7full<<18) \ + | (0x1ull<<30)) + +#define PFM_P4_NO64 (3ULL<<26) /* use 3 even in non HT mode */ + +/* + * With HyperThreading enabled: + * + * The ESCRs and CCCRs are divided in half with the top half + * belonging to logical processor 0 and the bottom half going to + * logical processor 1. Thus only half of the PMU resources are + * accessible to applications. + * + * PEBS is not available due to the fact that: + * - MSR_PEBS_MATRIX_VERT is shared between the threads + * - IA32_PEBS_ENABLE is shared between the threads + * + * With HyperThreading disabled: + * + * The full set of PMU resources is exposed to applications. + * + * The mapping is chosen such that PMCxx -> MSR is the same + * in HT and non HT mode, if register is present in HT mode. 
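[Editorial aside, not part of the patch: whether this halving applies is decided at probe time by comparing the number of online siblings in the package with the number of physical cores, i.e. cpus_weight(cpu_core_map[cpu]) / cpu_data->x86_max_cores > 1 in pfm_p4_probe_pmu() further down. A worked example with made-up topologies; the helper function is hypothetical.]

    #include <stdio.h>

    /* siblings = logical CPUs in the package, max_cores = physical cores */
    static int ht_enabled(int siblings, int max_cores)
    {
            return (siblings / max_cores) > 1;
    }

    int main(void)
    {
            printf("P4 with HT      : %d\n", ht_enabled(2, 1)); /* 2/1=2 -> on  */
            printf("dual core, no HT: %d\n", ht_enabled(2, 2)); /* 2/2=1 -> off */
            printf("uniprocessor    : %d\n", ht_enabled(1, 1)); /* 1/1=1 -> off */
            return 0;
    }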
+ * + */ +#define PFM_REGT_NHTESCR (PFM_REGT_ESCR|PFM_REGT_NOHT) +#define PFM_REGT_NHTCCCR (PFM_REGT_CCCR|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTPEBS (PFM_REGT_PEBS|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTCTR (PFM_REGT_CTR|PFM_REGT_NOHT) +#define PFM_REGT_ENAC (PFM_REGT_CCCR|PFM_REGT_EN) + +static struct pfm_arch_pmu_info pfm_p4_pmu_info={ + .pmc_addrs = { + /*pmc 0 */ {{MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1}, 0, PFM_REGT_ESCR}, /* BPU_ESCR0,1 */ + /*pmc 1 */ {{MSR_P4_IS_ESCR0, MSR_P4_IS_ESCR1}, 0, PFM_REGT_ESCR}, /* IS_ESCR0,1 */ + /*pmc 2 */ {{MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1}, 0, PFM_REGT_ESCR}, /* MOB_ESCR0,1 */ + /*pmc 3 */ {{MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1}, 0, PFM_REGT_ESCR}, /* ITLB_ESCR0,1 */ + /*pmc 4 */ {{MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1}, 0, PFM_REGT_ESCR}, /* PMH_ESCR0,1 */ + /*pmc 5 */ {{MSR_P4_IX_ESCR0, MSR_P4_IX_ESCR1}, 0, PFM_REGT_ESCR}, /* IX_ESCR0,1 */ + /*pmc 6 */ {{MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1}, 0, PFM_REGT_ESCR}, /* FSB_ESCR0,1 */ + /*pmc 7 */ {{MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1}, 0, PFM_REGT_ESCR}, /* BSU_ESCR0,1 */ + /*pmc 8 */ {{MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1}, 0, PFM_REGT_ESCR}, /* MS_ESCR0,1 */ + /*pmc 9 */ {{MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1}, 0, PFM_REGT_ESCR}, /* TC_ESCR0,1 */ + /*pmc 10*/ {{MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1}, 0, PFM_REGT_ESCR}, /* TBPU_ESCR0,1 */ + /*pmc 11*/ {{MSR_P4_FLAME_ESCR0, MSR_P4_FLAME_ESCR1}, 0, PFM_REGT_ESCR}, /* FLAME_ESCR0,1 */ + /*pmc 12*/ {{MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1}, 0, PFM_REGT_ESCR}, /* FIRM_ESCR0,1 */ + /*pmc 13*/ {{MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1}, 0, PFM_REGT_ESCR}, /* SAAT_ESCR0,1 */ + /*pmc 14*/ {{MSR_P4_U2L_ESCR0, MSR_P4_U2L_ESCR1}, 0, PFM_REGT_ESCR}, /* U2L_ESCR0,1 */ + /*pmc 15*/ {{MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1}, 0, PFM_REGT_ESCR}, /* DAC_ESCR0,1 */ + /*pmc 16*/ {{MSR_P4_IQ_ESCR0, MSR_P4_IQ_ESCR1}, 0, PFM_REGT_ESCR}, /* IQ_ESCR0,1 (only model 1 and 2) */ + /*pmc 17*/ {{MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1}, 0, PFM_REGT_ESCR}, /* ALF_ESCR0,1 */ + /*pmc 18*/ {{MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1}, 0, PFM_REGT_ESCR}, /* RAT_ESCR0,1 */ + /*pmc 19*/ {{MSR_P4_SSU_ESCR0, 0}, 0, PFM_REGT_ESCR}, /* SSU_ESCR0 */ + /*pmc 20*/ {{MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1}, 0, PFM_REGT_ESCR}, /* CRU_ESCR0,1 */ + /*pmc 21*/ {{MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3}, 0, PFM_REGT_ESCR}, /* CRU_ESCR2,3 */ + /*pmc 22*/ {{MSR_P4_CRU_ESCR4, MSR_P4_CRU_ESCR5}, 0, PFM_REGT_ESCR}, /* CRU_ESCR4,5 */ + + /*pmc 23*/ {{MSR_P4_BPU_CCCR0, MSR_P4_BPU_CCCR2}, 0, PFM_REGT_ENAC}, /* BPU_CCCR0,2 */ + /*pmc 24*/ {{MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3}, 1, PFM_REGT_ENAC}, /* BPU_CCCR1,3 */ + /*pmc 25*/ {{MSR_P4_MS_CCCR0, MSR_P4_MS_CCCR2}, 2, PFM_REGT_ENAC}, /* MS_CCCR0,2 */ + /*pmc 26*/ {{MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3}, 3, PFM_REGT_ENAC}, /* MS_CCCR1,3 */ + /*pmc 27*/ {{MSR_P4_FLAME_CCCR0, MSR_P4_FLAME_CCCR2}, 4, PFM_REGT_ENAC}, /* FLAME_CCCR0,2 */ + /*pmc 28*/ {{MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3}, 5, PFM_REGT_ENAC}, /* FLAME_CCCR1,3 */ + /*pmc 29*/ {{MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR2}, 6, PFM_REGT_ENAC}, /* IQ_CCCR0,2 */ + /*pmc 30*/ {{MSR_P4_IQ_CCCR1, MSR_P4_IQ_CCCR3}, 7, PFM_REGT_ENAC}, /* IQ_CCCR1,3 */ + /*pmc 31*/ {{MSR_P4_IQ_CCCR4, MSR_P4_IQ_CCCR5}, 8, PFM_REGT_ENAC}, /* IQ_CCCR4,5 */ + /* non HT extensions */ + /*pmc 32*/ {{MSR_P4_BPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BPU_ESCR1 */ + /*pmc 33*/ {{MSR_P4_IS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IS_ESCR1 */ + /*pmc 34*/ {{MSR_P4_MOB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MOB_ESCR1 */ + /*pmc 35*/ {{MSR_P4_ITLB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ITLB_ESCR1 
*/ + /*pmc 36*/ {{MSR_P4_PMH_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* PMH_ESCR1 */ + /*pmc 37*/ {{MSR_P4_IX_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IX_ESCR1 */ + /*pmc 38*/ {{MSR_P4_FSB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FSB_ESCR1 */ + /*pmc 39*/ {{MSR_P4_BSU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BSU_ESCR1 */ + /*pmc 40*/ {{MSR_P4_MS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MS_ESCR1 */ + /*pmc 41*/ {{MSR_P4_TC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TC_ESCR1 */ + /*pmc 42*/ {{MSR_P4_TBPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TBPU_ESCR1 */ + /*pmc 43*/ {{MSR_P4_FLAME_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FLAME_ESCR1 */ + /*pmc 44*/ {{MSR_P4_FIRM_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FIRM_ESCR1 */ + /*pmc 45*/ {{MSR_P4_SAAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* SAAT_ESCR1 */ + /*pmc 46*/ {{MSR_P4_U2L_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* U2L_ESCR1 */ + /*pmc 47*/ {{MSR_P4_DAC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* DAC_ESCR1 */ + /*pmc 48*/ {{MSR_P4_IQ_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IQ_ESCR1 (only model 1 and 2) */ + /*pmc 49*/ {{MSR_P4_ALF_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ALF_ESCR1 */ + /*pmc 50*/ {{MSR_P4_RAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* RAT_ESCR1 */ + /*pmc 51*/ {{MSR_P4_CRU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR1 */ + /*pmc 52*/ {{MSR_P4_CRU_ESCR3, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR3 */ + /*pmc 53*/ {{MSR_P4_CRU_ESCR5, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR5 */ + /*pmc 54*/ {{MSR_P4_BPU_CCCR1, 0}, 9, PFM_REGT_NHTCCCR}, /* BPU_CCCR1 */ + /*pmc 55*/ {{MSR_P4_BPU_CCCR3, 0},10, PFM_REGT_NHTCCCR}, /* BPU_CCCR3 */ + /*pmc 56*/ {{MSR_P4_MS_CCCR1, 0},11, PFM_REGT_NHTCCCR}, /* MS_CCCR1 */ + /*pmc 57*/ {{MSR_P4_MS_CCCR3, 0},12, PFM_REGT_NHTCCCR}, /* MS_CCCR3 */ + /*pmc 58*/ {{MSR_P4_FLAME_CCCR1, 0},13, PFM_REGT_NHTCCCR}, /* FLAME_CCCR1 */ + /*pmc 59*/ {{MSR_P4_FLAME_CCCR3, 0},14, PFM_REGT_NHTCCCR}, /* FLAME_CCCR3 */ + /*pmc 60*/ {{MSR_P4_IQ_CCCR2, 0},15, PFM_REGT_NHTCCCR}, /* IQ_CCCR2 */ + /*pmc 61*/ {{MSR_P4_IQ_CCCR3, 0},16, PFM_REGT_NHTCCCR}, /* IQ_CCCR3 */ + /*pmc 62*/ {{MSR_P4_IQ_CCCR5, 0},17, PFM_REGT_NHTCCCR}, /* IQ_CCCR5 */ + /*pmc 63*/ {{0x3f2, 0}, 0, PFM_REGT_NHTPEBS},/* PEBS_MATRIX_VERT */ + /*pmc 64*/ {{0x3f1, 0}, 0, PFM_REGT_NHTPEBS} /* PEBS_ENABLE */ + }, + + .pmd_addrs = { + /*pmd 0 */ {{MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_PERFCTR2}, 0, PFM_REGT_CTR}, /* BPU_CTR0,2 */ + /*pmd 1 */ {{MSR_P4_BPU_PERFCTR1, MSR_P4_BPU_PERFCTR3}, 0, PFM_REGT_CTR}, /* BPU_CTR1,3 */ + /*pmd 2 */ {{MSR_P4_MS_PERFCTR0, MSR_P4_MS_PERFCTR2}, 0, PFM_REGT_CTR}, /* MS_CTR0,2 */ + /*pmd 3 */ {{MSR_P4_MS_PERFCTR1, MSR_P4_MS_PERFCTR3}, 0, PFM_REGT_CTR}, /* MS_CTR1,3 */ + /*pmd 4 */ {{MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_PERFCTR2}, 0, PFM_REGT_CTR}, /* FLAME_CTR0,2 */ + /*pmd 5 */ {{MSR_P4_FLAME_PERFCTR1, MSR_P4_FLAME_PERFCTR3}, 0, PFM_REGT_CTR}, /* FLAME_CTR1,3 */ + /*pmd 6 */ {{MSR_P4_IQ_PERFCTR0, MSR_P4_IQ_PERFCTR2}, 0, PFM_REGT_CTR}, /* IQ_CTR0,2 */ + /*pmd 7 */ {{MSR_P4_IQ_PERFCTR1, MSR_P4_IQ_PERFCTR3}, 0, PFM_REGT_CTR}, /* IQ_CTR1,3 */ + /*pmd 8 */ {{MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_PERFCTR5}, 0, PFM_REGT_CTR}, /* IQ_CTR4,5 */ + /* + * non HT extensions + */ + /*pmd 9 */ {{MSR_P4_BPU_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR2 */ + /*pmd 10*/ {{MSR_P4_BPU_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR3 */ + /*pmd 11*/ {{MSR_P4_MS_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR2 */ + /*pmd 12*/ {{MSR_P4_MS_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR3 */ + /*pmd 13*/ {{MSR_P4_FLAME_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR2 */ + /*pmd 14*/ {{MSR_P4_FLAME_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR3 */ + /*pmd 15*/ 
{{MSR_P4_IQ_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR2 */ + /*pmd 16*/ {{MSR_P4_IQ_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR3 */ + /*pmd 17*/ {{MSR_P4_IQ_PERFCTR5, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR5 */ + }, + .pebs_ctr_idx = 8, /* thread0: IQ_CTR4, thread1: IQ_CTR5 */ + .pmu_style = PFM_X86_PMU_P4 +}; + +static struct pfm_regmap_desc pfm_p4_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I, "BPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR0), +/* pmc1 */ PMC_D(PFM_REG_I, "IS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), +/* pmc2 */ PMC_D(PFM_REG_I, "MOB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR0), +/* pmc3 */ PMC_D(PFM_REG_I, "ITLB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR0), +/* pmc4 */ PMC_D(PFM_REG_I, "PMH_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR0), +/* pmc5 */ PMC_D(PFM_REG_I, "IX_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR0), +/* pmc6 */ PMC_D(PFM_REG_I, "FSB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR0), +/* pmc7 */ PMC_D(PFM_REG_I, "BSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR0), +/* pmc8 */ PMC_D(PFM_REG_I, "MS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR0), +/* pmc9 */ PMC_D(PFM_REG_I, "TC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR0), +/* pmc10 */ PMC_D(PFM_REG_I, "TBPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR0), +/* pmc11 */ PMC_D(PFM_REG_I, "FLAME_ESCR0", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR0), +/* pmc12 */ PMC_D(PFM_REG_I, "FIRM_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR0), +/* pmc13 */ PMC_D(PFM_REG_I, "SAAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR0), +/* pmc14 */ PMC_D(PFM_REG_I, "U2L_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR0), +/* pmc15 */ PMC_D(PFM_REG_I, "DAC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR0), +/* pmc16 */ PMC_D(PFM_REG_I, "IQ_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), /* only model 1 and 2*/ +/* pmc17 */ PMC_D(PFM_REG_I, "ALF_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR0), +/* pmc18 */ PMC_D(PFM_REG_I, "RAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR0), +/* pmc19 */ PMC_D(PFM_REG_I, "SSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SSU_ESCR0), +/* pmc20 */ PMC_D(PFM_REG_I, "CRU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR0), +/* pmc21 */ PMC_D(PFM_REG_I, "CRU_ESCR2" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR2), +/* pmc22 */ PMC_D(PFM_REG_I, "CRU_ESCR4" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR4), +/* pmc23 */ PMC_D(PFM_REG_I64, "BPU_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR0), +/* pmc24 */ PMC_D(PFM_REG_I64, "BPU_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR1), +/* pmc25 */ PMC_D(PFM_REG_I64, "MS_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR0), +/* pmc26 */ PMC_D(PFM_REG_I64, "MS_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR1), +/* pmc27 */ PMC_D(PFM_REG_I64, "FLAME_CCCR0", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR0), +/* pmc28 */ PMC_D(PFM_REG_I64, "FLAME_CCCR1", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR1), +/* pmc29 */ PMC_D(PFM_REG_I64, "IQ_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR0), +/* pmc30 */ PMC_D(PFM_REG_I64, "IQ_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR1), +/* pmc31 */ PMC_D(PFM_REG_I64, "IQ_CCCR4" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR4), + /* No HT extension */ +/* pmc32 */ PMC_D(PFM_REG_I, "BPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR1), +/* pmc33 */ PMC_D(PFM_REG_I, "IS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR1), +/* pmc34 */ PMC_D(PFM_REG_I, "MOB_ESCR1" , 
0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR1), +/* pmc35 */ PMC_D(PFM_REG_I, "ITLB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR1), +/* pmc36 */ PMC_D(PFM_REG_I, "PMH_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR1), +/* pmc37 */ PMC_D(PFM_REG_I, "IX_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR1), +/* pmc38 */ PMC_D(PFM_REG_I, "FSB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR1), +/* pmc39 */ PMC_D(PFM_REG_I, "BSU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR1), +/* pmc40 */ PMC_D(PFM_REG_I, "MS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR1), +/* pmc41 */ PMC_D(PFM_REG_I, "TC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR1), +/* pmc42 */ PMC_D(PFM_REG_I, "TBPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR1), +/* pmc43 */ PMC_D(PFM_REG_I, "FLAME_ESCR1", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR1), +/* pmc44 */ PMC_D(PFM_REG_I, "FIRM_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR1), +/* pmc45 */ PMC_D(PFM_REG_I, "SAAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR1), +/* pmc46 */ PMC_D(PFM_REG_I, "U2L_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR1), +/* pmc47 */ PMC_D(PFM_REG_I, "DAC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR1), +/* pmc48 */ PMC_D(PFM_REG_I, "IQ_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR1), /* only model 1 and 2 */ +/* pmc49 */ PMC_D(PFM_REG_I, "ALF_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR1), +/* pmc50 */ PMC_D(PFM_REG_I, "RAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR1), +/* pmc51 */ PMC_D(PFM_REG_I, "CRU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR1), +/* pmc52 */ PMC_D(PFM_REG_I, "CRU_ESCR3" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR3), +/* pmc53 */ PMC_D(PFM_REG_I, "CRU_ESCR5" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR5), +/* pmc54 */ PMC_D(PFM_REG_I64, "BPU_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR2), +/* pmc55 */ PMC_D(PFM_REG_I64, "BPU_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR3), +/* pmc56 */ PMC_D(PFM_REG_I64, "MS_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR2), +/* pmc57 */ PMC_D(PFM_REG_I64, "MS_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR3), +/* pmc58 */ PMC_D(PFM_REG_I64, "FLAME_CCCR2", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR2), +/* pmc59 */ PMC_D(PFM_REG_I64, "FLAME_CCCR3", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR3), +/* pmc60 */ PMC_D(PFM_REG_I64, "IQ_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR2), +/* pmc61 */ PMC_D(PFM_REG_I64, "IQ_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR3), +/* pmc62 */ PMC_D(PFM_REG_I64, "IQ_CCCR5" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR5), +/* pmc63 */ PMC_D(PFM_REG_I, "PEBS_MATRIX_VERT", 0, 0xffffffffffffffecULL, 0, 0x3f2), +/* pmc64 */ PMC_D(PFM_REG_I, "PEBS_ENABLE", 0, 0xfffffffff8ffe000ULL, 0, 0x3f1) +}; +#define PFM_P4_NUM_PMCS ARRAY_SIZE(pfm_p4_pmc_desc) + +/* + * See section 15.10.6.6 for details about the IQ block + */ +static struct pfm_regmap_desc pfm_p4_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C, "BPU_CTR0", MSR_P4_BPU_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C, "BPU_CTR1", MSR_P4_BPU_PERFCTR1), +/* pmd2 */ PMD_D(PFM_REG_C, "MS_CTR0", MSR_P4_MS_PERFCTR0), +/* pmd3 */ PMD_D(PFM_REG_C, "MS_CTR1", MSR_P4_MS_PERFCTR1), +/* pmd4 */ PMD_D(PFM_REG_C, "FLAME_CTR0", MSR_P4_FLAME_PERFCTR0), +/* pmd5 */ PMD_D(PFM_REG_C, "FLAME_CTR1", MSR_P4_FLAME_PERFCTR1), +/* pmd6 */ PMD_D(PFM_REG_C, "IQ_CTR0", MSR_P4_IQ_PERFCTR0), +/* pmd7 */ PMD_D(PFM_REG_C, "IQ_CTR1", MSR_P4_IQ_PERFCTR1), +/* pmd8 */ PMD_D(PFM_REG_C, "IQ_CTR4", 
MSR_P4_IQ_PERFCTR4), + /* no HT extension */ +/* pmd9 */ PMD_D(PFM_REG_C, "BPU_CTR2", MSR_P4_BPU_PERFCTR2), +/* pmd10 */ PMD_D(PFM_REG_C, "BPU_CTR3", MSR_P4_BPU_PERFCTR3), +/* pmd11 */ PMD_D(PFM_REG_C, "MS_CTR2", MSR_P4_MS_PERFCTR2), +/* pmd12 */ PMD_D(PFM_REG_C, "MS_CTR3", MSR_P4_MS_PERFCTR3), +/* pmd13 */ PMD_D(PFM_REG_C, "FLAME_CTR2", MSR_P4_FLAME_PERFCTR2), +/* pmd14 */ PMD_D(PFM_REG_C, "FLAME_CTR3", MSR_P4_FLAME_PERFCTR3), +/* pmd15 */ PMD_D(PFM_REG_C, "IQ_CTR2", MSR_P4_IQ_PERFCTR1), +/* pmd16 */ PMD_D(PFM_REG_C, "IQ_CTR3", MSR_P4_IQ_PERFCTR3), +/* pmd17 */ PMD_D(PFM_REG_C, "IQ_CTR5", MSR_P4_IQ_PERFCTR5) +}; +#define PFM_P4_NUM_PMDS ARRAY_SIZE(pfm_p4_pmd_desc) + +/* + * Due to hotplug CPU support, threads may not necessarily + * be activated at the time the module is inserted. We need + * to check whether they could be activated by looking at + * the present CPU (present != online). + */ +static int pfm_p4_probe_pmu(void) +{ + unsigned int i; + int ht_enabled; + + /* + * only works on Intel processors + */ + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not running on Intel processor"); + return -1; + } + + if (cpu_data->x86 != 15) { + PFM_INFO("unsupported family=%d", cpu_data->x86); + return -1; + } + + switch(cpu_data->x86_model) { + case 0 ... 2: + break; + case 3 ... 6: + /* + * IQ_ESCR0, IQ_ESCR1 only present on model 1, 2 + */ + pfm_p4_pmc_desc[16].type = PFM_REG_NA; + pfm_p4_pmc_desc[48].type = PFM_REG_NA; + break; + default: + /* + * do not know if they all work the same, so reject + * for now + */ + if (!force) { + PFM_INFO("unsupported model %d", cpu_data->x86_model); + return -1; + } + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = (cpus_weight(cpu_core_map[smp_processor_id()]) + / cpu_data->x86_max_cores) > 1; +#else + ht_enabled = 0; +#endif + if (cpu_has_ht) { + + PFM_INFO("HyperThreading supported, status %s", + ht_enabled ? "on": "off"); + /* + * disable registers not supporting HT + */ + if (ht_enabled) { + PFM_INFO("disabling half the registers for HT"); + for (i = 0; i < PFM_P4_NUM_PMCS; i++) { + if (pfm_p4_pmu_info.pmc_addrs[(i)].reg_type & + PFM_REGT_NOHT) + pfm_p4_pmc_desc[i].type = PFM_REG_NA; + } + for (i = 0; i < PFM_P4_NUM_PMDS; i++) { + if (pfm_p4_pmu_info.pmd_addrs[(i)].reg_type & + PFM_REGT_NOHT) + pfm_p4_pmd_desc[i].type = PFM_REG_NA; + } + } + } + + if (cpu_has_ds) { + PFM_INFO("Data Save Area (DS) supported"); + + pfm_p4_pmu_info.flags = PFM_X86_FL_PMU_DS; + + if (cpu_has_pebs) { + /* + * PEBS does not work with HyperThreading enabled + */ + if (ht_enabled) { + PFM_INFO("PEBS supported, status off (because of HT)"); + } else { + pfm_p4_pmu_info.flags |= PFM_X86_FL_PMU_PEBS; + PFM_INFO("PEBS supported, status on"); + } + } + } + /* + * NMI using PMU? 
+ * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_p4_pmu_info.flags |= PFM_X86_FL_USE_NMI; + return 0; +} + +static struct pfm_pmu_config pfm_p4_pmu_conf={ + .pmu_name = "Intel P4", + .counter_width = 40, + .pmd_desc = pfm_p4_pmd_desc, + .pmc_desc = pfm_p4_pmc_desc, + .num_pmc_entries = PFM_P4_NUM_PMCS, + .num_pmd_entries = PFM_P4_NUM_PMDS, + .probe_pmu = pfm_p4_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_p4_pmu_info +}; + +static int __init pfm_p4_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p4_pmu_conf); +} + +static void __exit pfm_p4_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p4_pmu_conf); +} + +module_init(pfm_p4_pmu_init_module); +module_exit(pfm_p4_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_p6.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_p6.c @@ -0,0 +1,172 @@ +/* + * This file contains the P6 family processor PMU register description tables + * + * This module supports original P6 processors + * (Pentium II, Pentium Pro, Pentium III) and Pentium M. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("P6 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_P6_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_P6_PMC_VAL (1ULL<<20) +#define PFM_P6_NO64 (1ULL<<20) + +struct pfm_arch_pmu_info pfm_p6_pmu_info={ + .pmc_addrs = { + {{MSR_P6_EVNTSEL0, 0}, 0, PFM_REGT_EN}, /* has enable bit */ + {{MSR_P6_EVNTSEL1, 0}, 1, PFM_REGT_OTH} /* no enable bit */ + }, + .pmd_addrs = { + {{MSR_P6_PERFCTR0, 0}, 0, PFM_REGT_CTR}, + {{MSR_P6_PERFCTR1, 0}, 0, PFM_REGT_CTR} + }, + .pmu_style = PFM_X86_PMU_P6 +}; + +static struct pfm_regmap_desc pfm_p6_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFEVTSEL0", PFM_P6_PMC_VAL, PFM_P6_PMC_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFEVTSEL1", PFM_P6_PMC_VAL, PFM_P6_PMC_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL1) +}; +#define PFM_P6_NUM_PMCS ARRAY_SIZE(pfm_p6_pmc_desc) + +static struct pfm_regmap_desc pfm_p6_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C , "PERFCTR0", MSR_P6_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C , "PERFCTR1", MSR_P6_PERFCTR1) +}; +#define PFM_P6_NUM_PMDS ARRAY_SIZE(pfm_p6_pmd_desc) + +static int pfm_p6_probe_pmu(void) +{ + int high, low; + + if (cpu_data->x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + /* + * check for P6 processor family + */ + if (cpu_data->x86 != 6) { + PFM_INFO("unsupported family=%d", cpu_data->x86); + return -1; + } + + switch(cpu_data->x86_model) { + case 3: + case 5: /* Pentium II Deschutes */ + case 7 ... 11: + break; + case 13: + /* for Pentium M, we need to check if PMU exist */ + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (low & (1U << 7)) + break; + default: + PFM_INFO("unsupported CPU model %d", + cpu_data->x86_model); + return -1; + + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic"); + return -1; + } + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * we cannot have perfmon/nmi_watchdog running together as there + * is only one enable bit for both counters. + */ + if (nmi_watchdog == NMI_LOCAL_APIC) { + PFM_INFO("NMI watchdog using performance counters." + "perfmon cannot work correctly, reboot with nmi_watchdog=0"); + return -1; + } + + /* + * force NMI interrupt? + */ + if (force_nmi) + pfm_p6_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 31 bits only. 
+ * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ +static struct pfm_pmu_config pfm_p6_pmu_conf={ + .pmu_name = "Intel P6 processor Family", + .counter_width = 31, + .pmd_desc = pfm_p6_pmd_desc, + .pmc_desc = pfm_p6_pmc_desc, + .num_pmc_entries = PFM_P6_NUM_PMCS, + .num_pmd_entries = PFM_P6_NUM_PMDS, + .probe_pmu = pfm_p6_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_p6_pmu_info +}; + +static int __init pfm_p6_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p6_pmu_conf); +} + +static void __exit pfm_p6_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p6_pmu_conf); +} + +module_init(pfm_p6_pmu_init_module); +module_exit(pfm_p6_pmu_cleanup_module); Index: linux-2.6/arch/i386/perfmon/perfmon_pebs_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/i386/perfmon/perfmon_pebs_smpl.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the Precise Event Based Sampling (PEBS) + * sampling format. It supports the following processors: + * - 32-bit Pentium 4, Xeon, Core-based processors. + * - 64-bit Pentium 4, Xeon, Core-based processors. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Precise Event-Based Sampling (PEBS)"); +MODULE_LICENSE("GPL"); + +#define ALIGN_PEBS(a, order) \ + ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) + +#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ + +static int pfm_pebs_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_pebs_smpl_arg *arg = data; + size_t min_buf_size; + + /* + * need to define at least the size of the buffer + */ + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. 
+ */ + min_buf_size = sizeof(struct pfm_pebs_smpl_hdr) + + sizeof(struct pfm_pebs_smpl_entry) + + (1UL<buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_pebs_fmt_get_size(unsigned int flags, void *data, size_t *size) +{ + struct pfm_pebs_smpl_arg *arg = data; + + /* + * size has been validated in pfm_pebs_fmt_validate() + */ + *size = arg->buf_size + (1UL<ds; + + /* + * align PEBS buffer base + */ + pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); + pebs_end = pebs_start + arg->buf_size + 1; + + hdr->version = PFM_PEBS_SMPL_VERSION; + hdr->buf_size = arg->buf_size; + hdr->overflows = 0; + + /* + * express PEBS buffer base as offset from the end of the header + */ + hdr->start_offs = pebs_start - (unsigned long)(hdr+1); + + /* + * PEBS buffer boundaries + */ + ds->pebs_buf_base = pebs_start; + ds->pebs_abs_max = pebs_end; + + /* + * PEBS starting position + */ + ds->pebs_index = pebs_start; + + /* + * PEBS interrupt threshold + */ + ds->pebs_intr_thres = pebs_start + + arg->intr_thres + * sizeof(struct pfm_pebs_smpl_entry); + + /* + * save counter reset value for PEBS counter + */ + ds->pebs_cnt_reset = arg->cnt_reset; + + /* + * keep track of DS AREA + */ + ctx_arch->ds_area = (unsigned long)ds; + ctx_arch->flags |= PFM_X86_USE_PEBS; + + PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%lx " + "pebs_end=0x%lx ds=%p pebs_thres=0x%lx cnt_reset=0x%llx", + buf, + (unsigned long long)hdr->buf_size, + (unsigned long long)hdr->start_offs, + pebs_start, + pebs_end, + ds, + ds->pebs_intr_thres, + (unsigned long long)ds->pebs_cnt_reset); + + return 0; +} + +static int pfm_pebs_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_pebs_smpl_hdr *hdr; + + hdr = buf; + + PFM_DBG_ovfl("buffer full"); + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
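[Editorial aside, not part of the patch: to make the DS-area programming in pfm_pebs_fmt_init() above concrete, here is the buffer geometry with made-up numbers. The PMU appends one record per sampled event starting at pebs_buf_base and raises the overflow interrupt once pebs_index crosses pebs_intr_thres; the record size, buffer size and threshold below are assumptions of the example.]

    #include <stdio.h>

    int main(void)
    {
            unsigned long entry_size = 40;          /* assumed PEBS record size    */
            unsigned long buf_size   = 4096;        /* arg->buf_size               */
            unsigned long intr_thres = 90;          /* arg->intr_thres, in records */

            unsigned long pebs_start = 0x100000;    /* already 256-byte aligned    */
            unsigned long pebs_end   = pebs_start + buf_size;
            unsigned long thres      = pebs_start + intr_thres * entry_size;

            printf("PEBS area      : 0x%lx-0x%lx (%lu records)\n",
                   pebs_start, pebs_end, buf_size / entry_size);
            printf("interrupt when : pebs_index >= 0x%lx (record %lu)\n",
                   thres, intr_thres);
            printf("headroom       : %lu records past the threshold\n",
                   buf_size / entry_size - intr_thres);
            return 0;
    }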
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY| PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_pebs_fmt_restart(int is_active, u32 *ovfl_ctrl, + void *buf) +{ + struct pfm_pebs_smpl_hdr *hdr = buf; + + /* + * reset index to base of buffer + */ + hdr->ds.pebs_index = hdr->ds.pebs_buf_base; + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_pebs_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt pebs_fmt={ + .fmt_name = PFM_PEBS_SMPL_NAME, + .fmt_version = 0x1, + .fmt_arg_size = sizeof(struct pfm_pebs_smpl_arg), + .fmt_validate = pfm_pebs_fmt_validate, + .fmt_getsize = pfm_pebs_fmt_get_size, + .fmt_init = pfm_pebs_fmt_init, + .fmt_handler = pfm_pebs_fmt_handler, + .fmt_restart = pfm_pebs_fmt_restart, + .fmt_exit = pfm_pebs_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_pebs_fmt_init_module(void) +{ + int ht_enabled; + + if (!cpu_has_pebs) { + PFM_INFO("processor does not have PEBS support"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = cpus_weight(cpu_core_map[smp_processor_id()]) + / cpu_data->x86_max_cores > 1; +#else + ht_enabled = 0; +#endif + if (ht_enabled) { + PFM_INFO("PEBS not available because HyperThreading is on"); + return -1; + } + return pfm_fmt_register(&pebs_fmt); +} + +static void __exit pfm_pebs_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&pebs_fmt); +} + +module_init(pfm_pebs_fmt_init_module); +module_exit(pfm_pebs_fmt_cleanup_module); Index: linux-2.6/arch/ia64/Kconfig =================================================================== --- linux-2.6.orig/arch/ia64/Kconfig +++ linux-2.6/arch/ia64/Kconfig @@ -424,14 +424,6 @@ config COMPAT config IA64_MCA_RECOVERY tristate "MCA recovery from errors other than TLB." -config PERFMON - bool "Performance monitor support" - help - Selects whether support for the IA-64 performance monitor hardware - is included in the kernel. This makes some kernel data-structures a - little bigger and slows down execution a bit, but it is generally - a good idea to turn this on. If you're unsure, say Y. 
- config IA64_PALINFO tristate "/proc/pal support" help @@ -493,6 +485,8 @@ source "drivers/firmware/Kconfig" source "fs/Kconfig.binfmt" +source "arch/ia64/perfmon/Kconfig" + endmenu menu "Power management and ACPI" Index: linux-2.6/arch/ia64/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/Makefile +++ linux-2.6/arch/ia64/Makefile @@ -55,6 +55,7 @@ core-$(CONFIG_IA64_GENERIC) += arch/ia6 core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/ core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ +core-$(CONFIG_PERFMON) += arch/ia64/perfmon/ drivers-$(CONFIG_PCI) += arch/ia64/pci/ drivers-$(CONFIG_IA64_HP_SIM) += arch/ia64/hp/sim/ Index: linux-2.6/arch/ia64/defconfig =================================================================== --- linux-2.6.orig/arch/ia64/defconfig +++ linux-2.6/arch/ia64/defconfig @@ -162,7 +162,6 @@ CONFIG_HAVE_ARCH_NODEDATA_EXTENSION=y CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y # CONFIG_MC_ERR_INJECT is not set CONFIG_SGI_SN=y @@ -184,6 +183,16 @@ CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m # +# Hardware Performance Monitoring support +# +CONFIG_PERFMON=y +CONFIG_IA64_PERFMON_COMPAT=y +CONFIG_IA64_PERFMON_GENERIC=m +CONFIG_IA64_PERFMON_ITANIUM=y +CONFIG_IA64_PERFMON_MCKINLEY=y +CONFIG_IA64_PERFMON_MONTECITO=y + +# # Power management and ACPI # CONFIG_PM=y Index: linux-2.6/arch/ia64/kernel/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/kernel/Makefile +++ linux-2.6/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o ptrace.o sal.o \ salinfo.o semaphore.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o @@ -23,7 +23,6 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o Index: linux-2.6/arch/ia64/kernel/entry.S =================================================================== --- linux-2.6.orig/arch/ia64/kernel/entry.S +++ linux-2.6/arch/ia64/kernel/entry.S @@ -1588,5 +1588,17 @@ sys_call_table: data8 sys_signalfd data8 sys_timerfd data8 sys_eventfd + data8 sys_pfm_create_context // 1310 + data8 sys_pfm_write_pmcs + data8 sys_pfm_write_pmds + data8 sys_pfm_read_pmds + data8 sys_pfm_load_context + data8 sys_pfm_start // 1305 + data8 sys_pfm_stop + data8 sys_pfm_restart + data8 sys_pfm_create_evtsets + data8 sys_pfm_getinfo_evtsets + data8 sys_pfm_delete_evtsets // 1310 + data8 sys_pfm_unload_context .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls Index: linux-2.6/arch/ia64/kernel/irq_ia64.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/irq_ia64.c +++ linux-2.6/arch/ia64/kernel/irq_ia64.c @@ -40,10 +40,6 @@ #include #include -#ifdef CONFIG_PERFMON -# include -#endif - #define IRQ_DEBUG 0 /* These can be overridden in platform_irq_init */ @@ -320,9 +316,6 @@ init_IRQ (void) register_percpu_irq(IA64_IPI_RESCHEDULE, 
&resched_irqaction); register_percpu_irq(IA64_IPI_LOCAL_TLB_FLUSH, &tlb_irqaction); #endif -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif platform_irq_init(); } Index: linux-2.6/arch/ia64/kernel/perfmon.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon.c +++ /dev/null @@ -1,6879 +0,0 @@ -/* - * This file implements the perfmon-2 subsystem which is used - * to program the IA-64 Performance Monitoring Unit (PMU). - * - * The initial version of perfmon.c was written by - * Ganesh Venkitachalam, IBM Corp. - * - * Then it was modified for perfmon-1.x by Stephane Eranian and - * David Mosberger, Hewlett Packard Co. - * - * Version Perfmon-2.x is a rewrite of perfmon-1.x - * by Stephane Eranian, Hewlett Packard Co. - * - * Copyright (C) 1999-2005 Hewlett Packard Co - * Stephane Eranian - * David Mosberger-Tang - * - * More information about perfmon available at: - * http://www.hpl.hp.com/research/linux/perfmon - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_PERFMON -/* - * perfmon context state - */ -#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ -#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ -#define PFM_CTX_MASKED 3 /* context is loaded but monitoring is masked due to overflow */ -#define PFM_CTX_ZOMBIE 4 /* owner of the context is closing it */ - -#define PFM_INVALID_ACTIVATION (~0UL) - -#define PFM_NUM_PMC_REGS 64 /* PMC save area for ctxsw */ -#define PFM_NUM_PMD_REGS 64 /* PMD save area for ctxsw */ - -/* - * depth of message queue - */ -#define PFM_MAX_MSGS 32 -#define PFM_CTXQ_EMPTY(g) ((g)->ctx_msgq_head == (g)->ctx_msgq_tail) - -/* - * type of a PMU register (bitmask). 
- * bitmask structure: - * bit0 : register implemented - * bit1 : end marker - * bit2-3 : reserved - * bit4 : pmc has pmc.pm - * bit5 : pmc controls a counter (has pmc.oi), pmd is used as counter - * bit6-7 : register type - * bit8-31: reserved - */ -#define PFM_REG_NOTIMPL 0x0 /* not implemented at all */ -#define PFM_REG_IMPL 0x1 /* register implemented */ -#define PFM_REG_END 0x2 /* end marker */ -#define PFM_REG_MONITOR (0x1<<4|PFM_REG_IMPL) /* a PMC with a pmc.pm field only */ -#define PFM_REG_COUNTING (0x2<<4|PFM_REG_MONITOR) /* a monitor + pmc.oi+ PMD used as a counter */ -#define PFM_REG_CONTROL (0x4<<4|PFM_REG_IMPL) /* PMU control register */ -#define PFM_REG_CONFIG (0x8<<4|PFM_REG_IMPL) /* configuration register */ -#define PFM_REG_BUFFER (0xc<<4|PFM_REG_IMPL) /* PMD used as buffer */ - -#define PMC_IS_LAST(i) (pmu_conf->pmc_desc[i].type & PFM_REG_END) -#define PMD_IS_LAST(i) (pmu_conf->pmd_desc[i].type & PFM_REG_END) - -#define PMC_OVFL_NOTIFY(ctx, i) ((ctx)->ctx_pmds[i].flags & PFM_REGFL_OVFL_NOTIFY) - -/* i assumed unsigned */ -#define PMC_IS_IMPL(i) (i< PMU_MAX_PMCS && (pmu_conf->pmc_desc[i].type & PFM_REG_IMPL)) -#define PMD_IS_IMPL(i) (i< PMU_MAX_PMDS && (pmu_conf->pmd_desc[i].type & PFM_REG_IMPL)) - -/* XXX: these assume that register i is implemented */ -#define PMD_IS_COUNTING(i) ((pmu_conf->pmd_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_COUNTING(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_COUNTING) == PFM_REG_COUNTING) -#define PMC_IS_MONITOR(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_MONITOR) == PFM_REG_MONITOR) -#define PMC_IS_CONTROL(i) ((pmu_conf->pmc_desc[i].type & PFM_REG_CONTROL) == PFM_REG_CONTROL) - -#define PMC_DFL_VAL(i) pmu_conf->pmc_desc[i].default_value -#define PMC_RSVD_MASK(i) pmu_conf->pmc_desc[i].reserved_mask -#define PMD_PMD_DEP(i) pmu_conf->pmd_desc[i].dep_pmd[0] -#define PMC_PMD_DEP(i) pmu_conf->pmc_desc[i].dep_pmd[0] - -#define PFM_NUM_IBRS IA64_NUM_DBG_REGS -#define PFM_NUM_DBRS IA64_NUM_DBG_REGS - -#define CTX_OVFL_NOBLOCK(c) ((c)->ctx_fl_block == 0) -#define CTX_HAS_SMPL(c) ((c)->ctx_fl_is_sampling) -#define PFM_CTX_TASK(h) (h)->ctx_task - -#define PMU_PMC_OI 5 /* position of pmc.oi bit */ - -/* XXX: does not support more than 64 PMDs */ -#define CTX_USED_PMD(ctx, mask) (ctx)->ctx_used_pmds[0] |= (mask) -#define CTX_IS_USED_PMD(ctx, c) (((ctx)->ctx_used_pmds[0] & (1UL << (c))) != 0UL) - -#define CTX_USED_MONITOR(ctx, mask) (ctx)->ctx_used_monitors[0] |= (mask) - -#define CTX_USED_IBR(ctx,n) (ctx)->ctx_used_ibrs[(n)>>6] |= 1UL<< ((n) % 64) -#define CTX_USED_DBR(ctx,n) (ctx)->ctx_used_dbrs[(n)>>6] |= 1UL<< ((n) % 64) -#define CTX_USES_DBREGS(ctx) (((pfm_context_t *)(ctx))->ctx_fl_using_dbreg==1) -#define PFM_CODE_RR 0 /* requesting code range restriction */ -#define PFM_DATA_RR 1 /* requestion data range restriction */ - -#define PFM_CPUINFO_CLEAR(v) pfm_get_cpu_var(pfm_syst_info) &= ~(v) -#define PFM_CPUINFO_SET(v) pfm_get_cpu_var(pfm_syst_info) |= (v) -#define PFM_CPUINFO_GET() pfm_get_cpu_var(pfm_syst_info) - -#define RDEP(x) (1UL<<(x)) - -/* - * context protection macros - * in SMP: - * - we need to protect against CPU concurrency (spin_lock) - * - we need to protect against PMU overflow interrupts (local_irq_disable) - * in UP: - * - we need to protect against PMU overflow interrupts (local_irq_disable) - * - * spin_lock_irqsave()/spin_unlock_irqrestore(): - * in SMP: local_irq_disable + spin_lock - * in UP : local_irq_disable - * - * spin_lock()/spin_lock(): - * in UP : removed automatically - * in SMP: protect 
against context accesses from other CPU. interrupts - * are not masked. This is useful for the PMU interrupt handler - * because we know we will not get PMU concurrency in that code. - */ -#define PROTECT_CTX(c, f) \ - do { \ - DPRINT(("spinlock_irq_save ctx %p by [%d]\n", c, current->pid)); \ - spin_lock_irqsave(&(c)->ctx_lock, f); \ - DPRINT(("spinlocked ctx %p by [%d]\n", c, current->pid)); \ - } while(0) - -#define UNPROTECT_CTX(c, f) \ - do { \ - DPRINT(("spinlock_irq_restore ctx %p by [%d]\n", c, current->pid)); \ - spin_unlock_irqrestore(&(c)->ctx_lock, f); \ - } while(0) - -#define PROTECT_CTX_NOPRINT(c, f) \ - do { \ - spin_lock_irqsave(&(c)->ctx_lock, f); \ - } while(0) - - -#define UNPROTECT_CTX_NOPRINT(c, f) \ - do { \ - spin_unlock_irqrestore(&(c)->ctx_lock, f); \ - } while(0) - - -#define PROTECT_CTX_NOIRQ(c) \ - do { \ - spin_lock(&(c)->ctx_lock); \ - } while(0) - -#define UNPROTECT_CTX_NOIRQ(c) \ - do { \ - spin_unlock(&(c)->ctx_lock); \ - } while(0) - - -#ifdef CONFIG_SMP - -#define GET_ACTIVATION() pfm_get_cpu_var(pmu_activation_number) -#define INC_ACTIVATION() pfm_get_cpu_var(pmu_activation_number)++ -#define SET_ACTIVATION(c) (c)->ctx_last_activation = GET_ACTIVATION() - -#else /* !CONFIG_SMP */ -#define SET_ACTIVATION(t) do {} while(0) -#define GET_ACTIVATION(t) do {} while(0) -#define INC_ACTIVATION(t) do {} while(0) -#endif /* CONFIG_SMP */ - -#define SET_PMU_OWNER(t, c) do { pfm_get_cpu_var(pmu_owner) = (t); pfm_get_cpu_var(pmu_ctx) = (c); } while(0) -#define GET_PMU_OWNER() pfm_get_cpu_var(pmu_owner) -#define GET_PMU_CTX() pfm_get_cpu_var(pmu_ctx) - -#define LOCK_PFS(g) spin_lock_irqsave(&pfm_sessions.pfs_lock, g) -#define UNLOCK_PFS(g) spin_unlock_irqrestore(&pfm_sessions.pfs_lock, g) - -#define PFM_REG_RETFLAG_SET(flags, val) do { flags &= ~PFM_REG_RETFL_MASK; flags |= (val); } while(0) - -/* - * cmp0 must be the value of pmc0 - */ -#define PMC0_HAS_OVFL(cmp0) (cmp0 & ~0x1UL) - -#define PFMFS_MAGIC 0xa0b4d889 - -/* - * debugging - */ -#define PFM_DEBUGGING 1 -#ifdef PFM_DEBUGGING -#define DPRINT(a) \ - do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ - } while (0) - -#define DPRINT_ovfl(a) \ - do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d [%d] ", __FUNCTION__, __LINE__, smp_processor_id(), current->pid); printk a; } \ - } while (0) -#endif - -/* - * 64-bit software counter structure - * - * the next_reset_type is applied to the next call to pfm_reset_regs() - */ -typedef struct { - unsigned long val; /* virtual 64bit counter value */ - unsigned long lval; /* last reset value */ - unsigned long long_reset; /* reset value on sampling overflow */ - unsigned long short_reset; /* reset value on overflow */ - unsigned long reset_pmds[4]; /* which other pmds to reset when this counter overflows */ - unsigned long smpl_pmds[4]; /* which pmds are accessed when counter overflow */ - unsigned long seed; /* seed for random-number generator */ - unsigned long mask; /* mask for random-number generator */ - unsigned int flags; /* notify/do not notify */ - unsigned long eventid; /* overflow event identifier */ -} pfm_counter_t; - -/* - * context flags - */ -typedef struct { - unsigned int block:1; /* when 1, task will blocked on user notifications */ - unsigned int system:1; /* do system wide monitoring */ - unsigned int using_dbreg:1; /* using range restrictions (debug registers) */ - unsigned int is_sampling:1; /* true if using a 
custom format */ - unsigned int excl_idle:1; /* exclude idle task in system wide session */ - unsigned int going_zombie:1; /* context is zombie (MASKED+blocking) */ - unsigned int trap_reason:2; /* reason for going into pfm_handle_work() */ - unsigned int no_msg:1; /* no message sent on overflow */ - unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ - unsigned int reserved:22; -} pfm_context_flags_t; - -#define PFM_TRAP_REASON_NONE 0x0 /* default value */ -#define PFM_TRAP_REASON_BLOCK 0x1 /* we need to block on overflow */ -#define PFM_TRAP_REASON_RESET 0x2 /* we need to reset PMDs */ - - -/* - * perfmon context: encapsulates all the state of a monitoring session - */ - -typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ - - pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ - unsigned int ctx_state; /* state: active/inactive (no bitfield) */ - - struct task_struct *ctx_task; /* task to which context is attached */ - - unsigned long ctx_ovfl_regs[4]; /* which registers overflowed (notification) */ - - struct completion ctx_restart_done; /* use for blocking notification mode */ - - unsigned long ctx_used_pmds[4]; /* bitmask of PMD used */ - unsigned long ctx_all_pmds[4]; /* bitmask of all accessible PMDs */ - unsigned long ctx_reload_pmds[4]; /* bitmask of force reload PMD on ctxsw in */ - - unsigned long ctx_all_pmcs[4]; /* bitmask of all accessible PMCs */ - unsigned long ctx_reload_pmcs[4]; /* bitmask of force reload PMC on ctxsw in */ - unsigned long ctx_used_monitors[4]; /* bitmask of monitor PMC being used */ - - unsigned long ctx_pmcs[PFM_NUM_PMC_REGS]; /* saved copies of PMC values */ - - unsigned int ctx_used_ibrs[1]; /* bitmask of used IBR (speedup ctxsw in) */ - unsigned int ctx_used_dbrs[1]; /* bitmask of used DBR (speedup ctxsw in) */ - unsigned long ctx_dbrs[IA64_NUM_DBG_REGS]; /* DBR values (cache) when not loaded */ - unsigned long ctx_ibrs[IA64_NUM_DBG_REGS]; /* IBR values (cache) when not loaded */ - - pfm_counter_t ctx_pmds[PFM_NUM_PMD_REGS]; /* software state for PMDS */ - - unsigned long th_pmcs[PFM_NUM_PMC_REGS]; /* PMC thread save state */ - unsigned long th_pmds[PFM_NUM_PMD_REGS]; /* PMD thread save state */ - - u64 ctx_saved_psr_up; /* only contains psr.up value */ - - unsigned long ctx_last_activation; /* context last activation number for last_cpu */ - unsigned int ctx_last_cpu; /* CPU id of current or last CPU used (SMP only) */ - unsigned int ctx_cpu; /* cpu to which perfmon is applied (system wide) */ - - int ctx_fd; /* file descriptor used my this context */ - pfm_ovfl_arg_t ctx_ovfl_arg; /* argument to custom buffer format handler */ - - pfm_buffer_fmt_t *ctx_buf_fmt; /* buffer format callbacks */ - void *ctx_smpl_hdr; /* points to sampling buffer header kernel vaddr */ - unsigned long ctx_smpl_size; /* size of sampling buffer */ - void *ctx_smpl_vaddr; /* user level virtual address of smpl buffer */ - - wait_queue_head_t ctx_msgq_wait; - pfm_msg_t ctx_msgq[PFM_MAX_MSGS]; - int ctx_msgq_head; - int ctx_msgq_tail; - struct fasync_struct *ctx_async_queue; - - wait_queue_head_t ctx_zombieq; /* termination cleanup wait queue */ -} pfm_context_t; - -/* - * magic number used to verify that structure is really - * a perfmon context - */ -#define PFM_IS_FILE(f) ((f)->f_op == &pfm_file_ops) - -#define PFM_GET_CTX(t) ((pfm_context_t *)(t)->thread.pfm_context) - -#ifdef CONFIG_SMP -#define SET_LAST_CPU(ctx, v) (ctx)->ctx_last_cpu = (v) -#define GET_LAST_CPU(ctx) (ctx)->ctx_last_cpu -#else -#define 
SET_LAST_CPU(ctx, v) do {} while(0) -#define GET_LAST_CPU(ctx) do {} while(0) -#endif - - -#define ctx_fl_block ctx_flags.block -#define ctx_fl_system ctx_flags.system -#define ctx_fl_using_dbreg ctx_flags.using_dbreg -#define ctx_fl_is_sampling ctx_flags.is_sampling -#define ctx_fl_excl_idle ctx_flags.excl_idle -#define ctx_fl_going_zombie ctx_flags.going_zombie -#define ctx_fl_trap_reason ctx_flags.trap_reason -#define ctx_fl_no_msg ctx_flags.no_msg -#define ctx_fl_can_restart ctx_flags.can_restart - -#define PFM_SET_WORK_PENDING(t, v) do { (t)->thread.pfm_needs_checking = v; } while(0); -#define PFM_GET_WORK_PENDING(t) (t)->thread.pfm_needs_checking - -/* - * global information about all sessions - * mostly used to synchronize between system wide and per-process - */ -typedef struct { - spinlock_t pfs_lock; /* lock the structure */ - - unsigned int pfs_task_sessions; /* number of per task sessions */ - unsigned int pfs_sys_sessions; /* number of per system wide sessions */ - unsigned int pfs_sys_use_dbregs; /* incremented when a system wide session uses debug regs */ - unsigned int pfs_ptrace_use_dbregs; /* incremented when a process uses debug regs */ - struct task_struct *pfs_sys_session[NR_CPUS]; /* point to task owning a system-wide session */ -} pfm_session_t; - -/* - * information about a PMC or PMD. - * dep_pmd[]: a bitmask of dependent PMD registers - * dep_pmc[]: a bitmask of dependent PMC registers - */ -typedef int (*pfm_reg_check_t)(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); -typedef struct { - unsigned int type; - int pm_pos; - unsigned long default_value; /* power-on default value */ - unsigned long reserved_mask; /* bitmask of reserved bits */ - pfm_reg_check_t read_check; - pfm_reg_check_t write_check; - unsigned long dep_pmd[4]; - unsigned long dep_pmc[4]; -} pfm_reg_desc_t; - -/* assume cnum is a valid monitor */ -#define PMC_PM(cnum, val) (((val) >> (pmu_conf->pmc_desc[cnum].pm_pos)) & 0x1) - -/* - * This structure is initialized at boot time and contains - * a description of the PMU main characteristics. 
- * - * If the probe function is defined, detection is based - * on its return value: - * - 0 means recognized PMU - * - anything else means not supported - * When the probe function is not defined, then the pmu_family field - * is used and it must match the host CPU family such that: - * - cpu->family & config->pmu_family != 0 - */ -typedef struct { - unsigned long ovfl_val; /* overflow value for counters */ - - pfm_reg_desc_t *pmc_desc; /* detailed PMC register dependencies descriptions */ - pfm_reg_desc_t *pmd_desc; /* detailed PMD register dependencies descriptions */ - - unsigned int num_pmcs; /* number of PMCS: computed at init time */ - unsigned int num_pmds; /* number of PMDS: computed at init time */ - unsigned long impl_pmcs[4]; /* bitmask of implemented PMCS */ - unsigned long impl_pmds[4]; /* bitmask of implemented PMDS */ - - char *pmu_name; /* PMU family name */ - unsigned int pmu_family; /* cpuid family pattern used to identify pmu */ - unsigned int flags; /* pmu specific flags */ - unsigned int num_ibrs; /* number of IBRS: computed at init time */ - unsigned int num_dbrs; /* number of DBRS: computed at init time */ - unsigned int num_counters; /* PMC/PMD counting pairs : computed at init time */ - int (*probe)(void); /* customized probe routine */ - unsigned int use_rr_dbregs:1; /* set if debug registers used for range restriction */ -} pmu_config_t; -/* - * PMU specific flags - */ -#define PFM_PMU_IRQ_RESEND 1 /* PMU needs explicit IRQ resend */ - -/* - * debug register related type definitions - */ -typedef struct { - unsigned long ibr_mask:56; - unsigned long ibr_plm:4; - unsigned long ibr_ig:3; - unsigned long ibr_x:1; -} ibr_mask_reg_t; - -typedef struct { - unsigned long dbr_mask:56; - unsigned long dbr_plm:4; - unsigned long dbr_ig:2; - unsigned long dbr_w:1; - unsigned long dbr_r:1; -} dbr_mask_reg_t; - -typedef union { - unsigned long val; - ibr_mask_reg_t ibr; - dbr_mask_reg_t dbr; -} dbreg_t; - - -/* - * perfmon command descriptions - */ -typedef struct { - int (*cmd_func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - char *cmd_name; - int cmd_flags; - unsigned int cmd_narg; - size_t cmd_argsize; - int (*cmd_getsize)(void *arg, size_t *sz); -} pfm_cmd_desc_t; - -#define PFM_CMD_FD 0x01 /* command requires a file descriptor */ -#define PFM_CMD_ARG_READ 0x02 /* command must read argument(s) */ -#define PFM_CMD_ARG_RW 0x04 /* command must read/write argument(s) */ -#define PFM_CMD_STOP 0x08 /* command does not work on zombie context */ - - -#define PFM_CMD_NAME(cmd) pfm_cmd_tab[(cmd)].cmd_name -#define PFM_CMD_READ_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_READ) -#define PFM_CMD_RW_ARG(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_ARG_RW) -#define PFM_CMD_USE_FD(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_FD) -#define PFM_CMD_STOPPED(cmd) (pfm_cmd_tab[(cmd)].cmd_flags & PFM_CMD_STOP) - -#define PFM_CMD_ARG_MANY -1 /* cannot be zero */ - -typedef struct { - unsigned long pfm_spurious_ovfl_intr_count; /* keep track of spurious ovfl interrupts */ - unsigned long pfm_replay_ovfl_intr_count; /* keep track of replayed ovfl interrupts */ - unsigned long pfm_ovfl_intr_count; /* keep track of ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles; /* cycles spent processing ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles_min; /* min cycles spent processing ovfl interrupts */ - unsigned long pfm_ovfl_intr_cycles_max; /* max cycles spent processing ovfl interrupts */ - unsigned long pfm_smpl_handler_calls; - unsigned long 
pfm_smpl_handler_cycles; - char pad[SMP_CACHE_BYTES] ____cacheline_aligned; -} pfm_stats_t; - -/* - * perfmon internal variables - */ -static pfm_stats_t pfm_stats[NR_CPUS]; -static pfm_session_t pfm_sessions; /* global sessions information */ - -static DEFINE_SPINLOCK(pfm_alt_install_check); -static pfm_intr_handler_desc_t *pfm_alt_intr_handler; - -static struct proc_dir_entry *perfmon_dir; -static pfm_uuid_t pfm_null_uuid = {0,}; - -static spinlock_t pfm_buffer_fmt_lock; -static LIST_HEAD(pfm_buffer_fmt_list); - -static pmu_config_t *pmu_conf; - -/* sysctl() controls */ -pfm_sysctl_t pfm_sysctl; -EXPORT_SYMBOL(pfm_sysctl); - -static ctl_table pfm_ctl_table[]={ - { - .ctl_name = CTL_UNNUMBERED, - .procname = "debug", - .data = &pfm_sysctl.debug, - .maxlen = sizeof(int), - .mode = 0666, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "debug_ovfl", - .data = &pfm_sysctl.debug_ovfl, - .maxlen = sizeof(int), - .mode = 0666, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "fastctxsw", - .data = &pfm_sysctl.fastctxsw, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "expert_mode", - .data = &pfm_sysctl.expert_mode, - .maxlen = sizeof(int), - .mode = 0600, - .proc_handler = &proc_dointvec, - }, - {} -}; -static ctl_table pfm_sysctl_dir[] = { - { - .ctl_name = CTL_UNNUMBERED, - .procname = "perfmon", - .mode = 0755, - .child = pfm_ctl_table, - }, - {} -}; -static ctl_table pfm_sysctl_root[] = { - { - .ctl_name = CTL_KERN, - .procname = "kernel", - .mode = 0755, - .child = pfm_sysctl_dir, - }, - {} -}; -static struct ctl_table_header *pfm_sysctl_header; - -static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - -#define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) -#define pfm_get_cpu_data(a,b) per_cpu(a, b) - -static inline void -pfm_put_task(struct task_struct *task) -{ - if (task != current) put_task_struct(task); -} - -static inline void -pfm_set_task_notify(struct task_struct *task) -{ - struct thread_info *info; - - info = (struct thread_info *) ((char *) task + IA64_TASK_SIZE); - set_bit(TIF_NOTIFY_RESUME, &info->flags); -} - -static inline void -pfm_clear_task_notify(void) -{ - clear_thread_flag(TIF_NOTIFY_RESUME); -} - -static inline void -pfm_reserve_page(unsigned long a) -{ - SetPageReserved(vmalloc_to_page((void *)a)); -} -static inline void -pfm_unreserve_page(unsigned long a) -{ - ClearPageReserved(vmalloc_to_page((void*)a)); -} - -static inline unsigned long -pfm_protect_ctx_ctxsw(pfm_context_t *x) -{ - spin_lock(&(x)->ctx_lock); - return 0UL; -} - -static inline void -pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f) -{ - spin_unlock(&(x)->ctx_lock); -} - -static inline unsigned int -pfm_do_munmap(struct mm_struct *mm, unsigned long addr, size_t len, int acct) -{ - return do_munmap(mm, addr, len); -} - -static inline unsigned long -pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec) -{ - return get_unmapped_area(file, addr, len, pgoff, flags); -} - - -static int -pfmfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); -} - -static struct file_system_type pfm_fs_type = { - .name = "pfmfs", - .get_sb = pfmfs_get_sb, - .kill_sb = kill_anon_super, -}; - 
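
The soft-counter helpers further down in this file (pfm_read_soft_counter()/pfm_write_soft_counter()) implement the virtual 64-bit counter described by pfm_counter_t above: the hardware PMD only holds the low, implemented bits (masked by pmu_conf->ovfl_val) while the upper bits are kept in software in ctx_pmds[].val. A minimal standalone sketch of that scheme, with hypothetical names and an assumed 47-bit PMD width (not part of the patch), is:

/*
 * Sketch only: virtual 64-bit counter split between a narrow hardware
 * register and a software-maintained upper part. "ovfl_val" plays the
 * role of pmu_conf->ovfl_val (mask of bits the PMD actually implements);
 * hw_pmd stands in for ia64_get_pmd()/ia64_set_pmd().
 */
#include <stdio.h>

static unsigned long soft_val;   /* software part: bits above the PMD width */
static unsigned long hw_pmd;     /* stand-in for the live hardware PMD */
static const unsigned long ovfl_val = (1UL << 47) - 1;  /* assumed width */

static unsigned long read_soft_counter(void)
{
	/* full 64-bit value = software upper part + live hardware low part */
	return soft_val + (hw_pmd & ovfl_val);
}

static void write_soft_counter(unsigned long val)
{
	soft_val = val & ~ovfl_val;  /* keep the upper bits in software     */
	hw_pmd   = val &  ovfl_val;  /* program only the implemented bits   */
}

int main(void)
{
	write_soft_counter(0x0123456789abcdefUL);
	hw_pmd += 5;                 /* pretend the counter ticked 5 times  */
	printf("count=0x%lx\n", read_soft_counter());
	return 0;
}

On overflow the real code adds the wrapped hardware value into ctx_pmds[].val (see pfm_mask_monitoring() below), which is why only the low ovfl_val bits are ever written back to the PMD.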
-DEFINE_PER_CPU(unsigned long, pfm_syst_info); -DEFINE_PER_CPU(struct task_struct *, pmu_owner); -DEFINE_PER_CPU(pfm_context_t *, pmu_ctx); -DEFINE_PER_CPU(unsigned long, pmu_activation_number); -EXPORT_PER_CPU_SYMBOL_GPL(pfm_syst_info); - - -/* forward declaration */ -static const struct file_operations pfm_file_ops; - -/* - * forward declarations - */ -#ifndef CONFIG_SMP -static void pfm_lazy_save_regs (struct task_struct *ta); -#endif - -void dump_pmu_state(const char *); -static int pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - -#include "perfmon_itanium.h" -#include "perfmon_mckinley.h" -#include "perfmon_montecito.h" -#include "perfmon_generic.h" - -static pmu_config_t *pmu_confs[]={ - &pmu_conf_mont, - &pmu_conf_mck, - &pmu_conf_ita, - &pmu_conf_gen, /* must be last */ - NULL -}; - - -static int pfm_end_notify_user(pfm_context_t *ctx); - -static inline void -pfm_clear_psr_pp(void) -{ - ia64_rsm(IA64_PSR_PP); - ia64_srlz_i(); -} - -static inline void -pfm_set_psr_pp(void) -{ - ia64_ssm(IA64_PSR_PP); - ia64_srlz_i(); -} - -static inline void -pfm_clear_psr_up(void) -{ - ia64_rsm(IA64_PSR_UP); - ia64_srlz_i(); -} - -static inline void -pfm_set_psr_up(void) -{ - ia64_ssm(IA64_PSR_UP); - ia64_srlz_i(); -} - -static inline unsigned long -pfm_get_psr(void) -{ - unsigned long tmp; - tmp = ia64_getreg(_IA64_REG_PSR); - ia64_srlz_i(); - return tmp; -} - -static inline void -pfm_set_psr_l(unsigned long val) -{ - ia64_setreg(_IA64_REG_PSR_L, val); - ia64_srlz_i(); -} - -static inline void -pfm_freeze_pmu(void) -{ - ia64_set_pmc(0,1UL); - ia64_srlz_d(); -} - -static inline void -pfm_unfreeze_pmu(void) -{ - ia64_set_pmc(0,0UL); - ia64_srlz_d(); -} - -static inline void -pfm_restore_ibrs(unsigned long *ibrs, unsigned int nibrs) -{ - int i; - - for (i=0; i < nibrs; i++) { - ia64_set_ibr(i, ibrs[i]); - ia64_dv_serialize_instruction(); - } - ia64_srlz_i(); -} - -static inline void -pfm_restore_dbrs(unsigned long *dbrs, unsigned int ndbrs) -{ - int i; - - for (i=0; i < ndbrs; i++) { - ia64_set_dbr(i, dbrs[i]); - ia64_dv_serialize_data(); - } - ia64_srlz_d(); -} - -/* - * PMD[i] must be a counter. no check is made - */ -static inline unsigned long -pfm_read_soft_counter(pfm_context_t *ctx, int i) -{ - return ctx->ctx_pmds[i].val + (ia64_get_pmd(i) & pmu_conf->ovfl_val); -} - -/* - * PMD[i] must be a counter. 
no check is made - */ -static inline void -pfm_write_soft_counter(pfm_context_t *ctx, int i, unsigned long val) -{ - unsigned long ovfl_val = pmu_conf->ovfl_val; - - ctx->ctx_pmds[i].val = val & ~ovfl_val; - /* - * writing to unimplemented part is ignore, so we do not need to - * mask off top part - */ - ia64_set_pmd(i, val & ovfl_val); -} - -static pfm_msg_t * -pfm_get_new_msg(pfm_context_t *ctx) -{ - int idx, next; - - next = (ctx->ctx_msgq_tail+1) % PFM_MAX_MSGS; - - DPRINT(("ctx_fd=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - if (next == ctx->ctx_msgq_head) return NULL; - - idx = ctx->ctx_msgq_tail; - ctx->ctx_msgq_tail = next; - - DPRINT(("ctx=%p head=%d tail=%d msg=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, idx)); - - return ctx->ctx_msgq+idx; -} - -static pfm_msg_t * -pfm_get_next_msg(pfm_context_t *ctx) -{ - pfm_msg_t *msg; - - DPRINT(("ctx=%p head=%d tail=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - - if (PFM_CTXQ_EMPTY(ctx)) return NULL; - - /* - * get oldest message - */ - msg = ctx->ctx_msgq+ctx->ctx_msgq_head; - - /* - * and move forward - */ - ctx->ctx_msgq_head = (ctx->ctx_msgq_head+1) % PFM_MAX_MSGS; - - DPRINT(("ctx=%p head=%d tail=%d type=%d\n", ctx, ctx->ctx_msgq_head, ctx->ctx_msgq_tail, msg->pfm_gen_msg.msg_type)); - - return msg; -} - -static void -pfm_reset_msgq(pfm_context_t *ctx) -{ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - DPRINT(("ctx=%p msgq reset\n", ctx)); -} - -static void * -pfm_rvmalloc(unsigned long size) -{ - void *mem; - unsigned long addr; - - size = PAGE_ALIGN(size); - mem = vmalloc(size); - if (mem) { - //printk("perfmon: CPU%d pfm_rvmalloc(%ld)=%p\n", smp_processor_id(), size, mem); - memset(mem, 0, size); - addr = (unsigned long)mem; - while (size > 0) { - pfm_reserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - } - return mem; -} - -static void -pfm_rvfree(void *mem, unsigned long size) -{ - unsigned long addr; - - if (mem) { - DPRINT(("freeing physical buffer @%p size=%lu\n", mem, size)); - addr = (unsigned long) mem; - while ((long) size > 0) { - pfm_unreserve_page(addr); - addr+=PAGE_SIZE; - size-=PAGE_SIZE; - } - vfree(mem); - } - return; -} - -static pfm_context_t * -pfm_context_alloc(void) -{ - pfm_context_t *ctx; - - /* - * allocate context descriptor - * must be able to free with interrupts disabled - */ - ctx = kzalloc(sizeof(pfm_context_t), GFP_KERNEL); - if (ctx) { - DPRINT(("alloc ctx @%p\n", ctx)); - } - return ctx; -} - -static void -pfm_context_free(pfm_context_t *ctx) -{ - if (ctx) { - DPRINT(("free ctx @%p\n", ctx)); - kfree(ctx); - } -} - -static void -pfm_mask_monitoring(struct task_struct *task) -{ - pfm_context_t *ctx = PFM_GET_CTX(task); - unsigned long mask, val, ovfl_mask; - int i; - - DPRINT_ovfl(("masking monitoring for [%d]\n", task->pid)); - - ovfl_mask = pmu_conf->ovfl_val; - /* - * monitoring can only be masked as a result of a valid - * counter overflow. In UP, it means that the PMU still - * has an owner. Note that the owner can be different - * from the current task. However the PMU state belongs - * to the owner. - * In SMP, a valid overflow only happens when task is - * current. Therefore if we come here, we know that - * the PMU state belongs to the current task, therefore - * we can access the live registers. - * - * So in both cases, the live register contains the owner's - * state. We can ONLY touch the PMU registers and NOT the PSR. 
- * - * As a consequence to this call, the ctx->th_pmds[] array - * contains stale information which must be ignored - * when context is reloaded AND monitoring is active (see - * pfm_restart). - */ - mask = ctx->ctx_used_pmds[0]; - for (i = 0; mask; i++, mask>>=1) { - /* skip non used pmds */ - if ((mask & 0x1) == 0) continue; - val = ia64_get_pmd(i); - - if (PMD_IS_COUNTING(i)) { - /* - * we rebuild the full 64 bit value of the counter - */ - ctx->ctx_pmds[i].val += (val & ovfl_mask); - } else { - ctx->ctx_pmds[i].val = val; - } - DPRINT_ovfl(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", - i, - ctx->ctx_pmds[i].val, - val & ovfl_mask)); - } - /* - * mask monitoring by setting the privilege level to 0 - * we cannot use psr.pp/psr.up for this, it is controlled by - * the user - * - * if task is current, modify actual registers, otherwise modify - * thread save state, i.e., what will be restored in pfm_load_regs() - */ - mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; - for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0UL) continue; - ia64_set_pmc(i, ctx->th_pmcs[i] & ~0xfUL); - ctx->th_pmcs[i] &= ~0xfUL; - DPRINT_ovfl(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); - } - /* - * make all of this visible - */ - ia64_srlz_d(); -} - -/* - * must always be done with task == current - * - * context must be in MASKED state when calling - */ -static void -pfm_restore_monitoring(struct task_struct *task) -{ - pfm_context_t *ctx = PFM_GET_CTX(task); - unsigned long mask, ovfl_mask; - unsigned long psr, val; - int i, is_system; - - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - - if (task != current) { - printk(KERN_ERR "perfmon.%d: invalid task[%d] current[%d]\n", __LINE__, task->pid, current->pid); - return; - } - if (ctx->ctx_state != PFM_CTX_MASKED) { - printk(KERN_ERR "perfmon.%d: task[%d] current[%d] invalid state=%d\n", __LINE__, - task->pid, current->pid, ctx->ctx_state); - return; - } - psr = pfm_get_psr(); - /* - * monitoring is masked via the PMC. - * As we restore their value, we do not want each counter to - * restart right away. We stop monitoring using the PSR, - * restore the PMC (and PMD) and then re-establish the psr - * as it was. Note that there can be no pending overflow at - * this point, because monitoring was MASKED. 
- * - * system-wide session are pinned and self-monitoring - */ - if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { - /* disable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); - pfm_clear_psr_pp(); - } else { - pfm_clear_psr_up(); - } - /* - * first, we restore the PMD - */ - mask = ctx->ctx_used_pmds[0]; - for (i = 0; mask; i++, mask>>=1) { - /* skip non used pmds */ - if ((mask & 0x1) == 0) continue; - - if (PMD_IS_COUNTING(i)) { - /* - * we split the 64bit value according to - * counter width - */ - val = ctx->ctx_pmds[i].val & ovfl_mask; - ctx->ctx_pmds[i].val &= ~ovfl_mask; - } else { - val = ctx->ctx_pmds[i].val; - } - ia64_set_pmd(i, val); - - DPRINT(("pmd[%d]=0x%lx hw_pmd=0x%lx\n", - i, - ctx->ctx_pmds[i].val, - val)); - } - /* - * restore the PMCs - */ - mask = ctx->ctx_used_monitors[0] >> PMU_FIRST_COUNTER; - for(i= PMU_FIRST_COUNTER; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0UL) continue; - ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; - ia64_set_pmc(i, ctx->th_pmcs[i]); - DPRINT(("[%d] pmc[%d]=0x%lx\n", task->pid, i, ctx->th_pmcs[i])); - } - ia64_srlz_d(); - - /* - * must restore DBR/IBR because could be modified while masked - * XXX: need to optimize - */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - - /* - * now restore PSR - */ - if (is_system && (PFM_CPUINFO_GET() & PFM_CPUINFO_DCR_PP)) { - /* enable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); - ia64_srlz_i(); - } - pfm_set_psr_l(psr); -} - -static inline void -pfm_save_pmds(unsigned long *pmds, unsigned long mask) -{ - int i; - - ia64_srlz_d(); - - for (i=0; mask; i++, mask>>=1) { - if (mask & 0x1) pmds[i] = ia64_get_pmd(i); - } -} - -/* - * reload from thread state (used for ctxw only) - */ -static inline void -pfm_restore_pmds(unsigned long *pmds, unsigned long mask) -{ - int i; - unsigned long val, ovfl_val = pmu_conf->ovfl_val; - - for (i=0; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0) continue; - val = PMD_IS_COUNTING(i) ? pmds[i] & ovfl_val : pmds[i]; - ia64_set_pmd(i, val); - } - ia64_srlz_d(); -} - -/* - * propagate PMD from context to thread-state - */ -static inline void -pfm_copy_pmds(struct task_struct *task, pfm_context_t *ctx) -{ - unsigned long ovfl_val = pmu_conf->ovfl_val; - unsigned long mask = ctx->ctx_all_pmds[0]; - unsigned long val; - int i; - - DPRINT(("mask=0x%lx\n", mask)); - - for (i=0; mask; i++, mask>>=1) { - - val = ctx->ctx_pmds[i].val; - - /* - * We break up the 64 bit value into 2 pieces - * the lower bits go to the machine state in the - * thread (will be reloaded on ctxsw in). - * The upper part stays in the soft-counter. 
- */ - if (PMD_IS_COUNTING(i)) { - ctx->ctx_pmds[i].val = val & ~ovfl_val; - val &= ovfl_val; - } - ctx->th_pmds[i] = val; - - DPRINT(("pmd[%d]=0x%lx soft_val=0x%lx\n", - i, - ctx->th_pmds[i], - ctx->ctx_pmds[i].val)); - } -} - -/* - * propagate PMC from context to thread-state - */ -static inline void -pfm_copy_pmcs(struct task_struct *task, pfm_context_t *ctx) -{ - unsigned long mask = ctx->ctx_all_pmcs[0]; - int i; - - DPRINT(("mask=0x%lx\n", mask)); - - for (i=0; mask; i++, mask>>=1) { - /* masking 0 with ovfl_val yields 0 */ - ctx->th_pmcs[i] = ctx->ctx_pmcs[i]; - DPRINT(("pmc[%d]=0x%lx\n", i, ctx->th_pmcs[i])); - } -} - - - -static inline void -pfm_restore_pmcs(unsigned long *pmcs, unsigned long mask) -{ - int i; - - for (i=0; mask; i++, mask>>=1) { - if ((mask & 0x1) == 0) continue; - ia64_set_pmc(i, pmcs[i]); - } - ia64_srlz_d(); -} - -static inline int -pfm_uuid_cmp(pfm_uuid_t a, pfm_uuid_t b) -{ - return memcmp(a, b, sizeof(pfm_uuid_t)); -} - -static inline int -pfm_buf_fmt_exit(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_exit) ret = (*fmt->fmt_exit)(task, buf, regs); - return ret; -} - -static inline int -pfm_buf_fmt_getsize(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size) -{ - int ret = 0; - if (fmt->fmt_getsize) ret = (*fmt->fmt_getsize)(task, flags, cpu, arg, size); - return ret; -} - - -static inline int -pfm_buf_fmt_validate(pfm_buffer_fmt_t *fmt, struct task_struct *task, unsigned int flags, - int cpu, void *arg) -{ - int ret = 0; - if (fmt->fmt_validate) ret = (*fmt->fmt_validate)(task, flags, cpu, arg); - return ret; -} - -static inline int -pfm_buf_fmt_init(pfm_buffer_fmt_t *fmt, struct task_struct *task, void *buf, unsigned int flags, - int cpu, void *arg) -{ - int ret = 0; - if (fmt->fmt_init) ret = (*fmt->fmt_init)(task, buf, flags, cpu, arg); - return ret; -} - -static inline int -pfm_buf_fmt_restart(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_restart) ret = (*fmt->fmt_restart)(task, ctrl, buf, regs); - return ret; -} - -static inline int -pfm_buf_fmt_restart_active(pfm_buffer_fmt_t *fmt, struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - int ret = 0; - if (fmt->fmt_restart_active) ret = (*fmt->fmt_restart_active)(task, ctrl, buf, regs); - return ret; -} - -static pfm_buffer_fmt_t * -__pfm_find_buffer_fmt(pfm_uuid_t uuid) -{ - struct list_head * pos; - pfm_buffer_fmt_t * entry; - - list_for_each(pos, &pfm_buffer_fmt_list) { - entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - if (pfm_uuid_cmp(uuid, entry->fmt_uuid) == 0) - return entry; - } - return NULL; -} - -/* - * find a buffer format based on its uuid - */ -static pfm_buffer_fmt_t * -pfm_find_buffer_fmt(pfm_uuid_t uuid) -{ - pfm_buffer_fmt_t * fmt; - spin_lock(&pfm_buffer_fmt_lock); - fmt = __pfm_find_buffer_fmt(uuid); - spin_unlock(&pfm_buffer_fmt_lock); - return fmt; -} - -int -pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt) -{ - int ret = 0; - - /* some sanity checks */ - if (fmt == NULL || fmt->fmt_name == NULL) return -EINVAL; - - /* we need at least a handler */ - if (fmt->fmt_handler == NULL) return -EINVAL; - - /* - * XXX: need check validity of fmt_arg_size - */ - - spin_lock(&pfm_buffer_fmt_lock); - - if (__pfm_find_buffer_fmt(fmt->fmt_uuid)) { - printk(KERN_ERR "perfmon: duplicate sampling format: %s\n", fmt->fmt_name); - ret = -EBUSY; - 
goto out; - } - list_add(&fmt->fmt_list, &pfm_buffer_fmt_list); - printk(KERN_INFO "perfmon: added sampling format %s\n", fmt->fmt_name); - -out: - spin_unlock(&pfm_buffer_fmt_lock); - return ret; -} -EXPORT_SYMBOL(pfm_register_buffer_fmt); - -int -pfm_unregister_buffer_fmt(pfm_uuid_t uuid) -{ - pfm_buffer_fmt_t *fmt; - int ret = 0; - - spin_lock(&pfm_buffer_fmt_lock); - - fmt = __pfm_find_buffer_fmt(uuid); - if (!fmt) { - printk(KERN_ERR "perfmon: cannot unregister format, not found\n"); - ret = -EINVAL; - goto out; - } - list_del_init(&fmt->fmt_list); - printk(KERN_INFO "perfmon: removed sampling format: %s\n", fmt->fmt_name); - -out: - spin_unlock(&pfm_buffer_fmt_lock); - return ret; - -} -EXPORT_SYMBOL(pfm_unregister_buffer_fmt); - -extern void update_pal_halt_status(int); - -static int -pfm_reserve_session(struct task_struct *task, int is_syswide, unsigned int cpu) -{ - unsigned long flags; - /* - * validity checks on cpu_mask have been done upstream - */ - LOCK_PFS(flags); - - DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - if (is_syswide) { - /* - * cannot mix system wide and per-task sessions - */ - if (pfm_sessions.pfs_task_sessions > 0UL) { - DPRINT(("system wide not possible, %u conflicting task_sessions\n", - pfm_sessions.pfs_task_sessions)); - goto abort; - } - - if (pfm_sessions.pfs_sys_session[cpu]) goto error_conflict; - - DPRINT(("reserving system wide session on CPU%u currently on CPU%u\n", cpu, smp_processor_id())); - - pfm_sessions.pfs_sys_session[cpu] = task; - - pfm_sessions.pfs_sys_sessions++ ; - - } else { - if (pfm_sessions.pfs_sys_sessions) goto abort; - pfm_sessions.pfs_task_sessions++; - } - - DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - /* - * disable default_idle() to go to PAL_HALT - */ - update_pal_halt_status(0); - - UNLOCK_PFS(flags); - - return 0; - -error_conflict: - DPRINT(("system wide not possible, conflicting session [%d] on CPU%d\n", - pfm_sessions.pfs_sys_session[cpu]->pid, - cpu)); -abort: - UNLOCK_PFS(flags); - - return -EBUSY; - -} - -static int -pfm_unreserve_session(pfm_context_t *ctx, int is_syswide, unsigned int cpu) -{ - unsigned long flags; - /* - * validity checks on cpu_mask have been done upstream - */ - LOCK_PFS(flags); - - DPRINT(("in sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - - if (is_syswide) { - pfm_sessions.pfs_sys_session[cpu] = NULL; - /* - * would not work with perfmon+more than one bit in cpu_mask - */ - if (ctx && ctx->ctx_fl_using_dbreg) { - if (pfm_sessions.pfs_sys_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for ctx %p sys_use_dbregs=0\n", ctx); - } else { - pfm_sessions.pfs_sys_use_dbregs--; - } - } - pfm_sessions.pfs_sys_sessions--; - } else { - pfm_sessions.pfs_task_sessions--; - } - DPRINT(("out sys_sessions=%u task_sessions=%u dbregs=%u syswide=%d cpu=%u\n", - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_use_dbregs, - is_syswide, - cpu)); - - /* - * if possible, enable default_idle() to go into PAL_HALT - */ - if (pfm_sessions.pfs_task_sessions == 0 && pfm_sessions.pfs_sys_sessions == 0) - 
update_pal_halt_status(1); - - UNLOCK_PFS(flags); - - return 0; -} - -/* - * removes virtual mapping of the sampling buffer. - * IMPORTANT: cannot be called with interrupts disable, e.g. inside - * a PROTECT_CTX() section. - */ -static int -pfm_remove_smpl_mapping(struct task_struct *task, void *vaddr, unsigned long size) -{ - int r; - - /* sanity checks */ - if (task->mm == NULL || size == 0UL || vaddr == NULL) { - printk(KERN_ERR "perfmon: pfm_remove_smpl_mapping [%d] invalid context mm=%p\n", task->pid, task->mm); - return -EINVAL; - } - - DPRINT(("smpl_vaddr=%p size=%lu\n", vaddr, size)); - - /* - * does the actual unmapping - */ - down_write(&task->mm->mmap_sem); - - DPRINT(("down_write done smpl_vaddr=%p size=%lu\n", vaddr, size)); - - r = pfm_do_munmap(task->mm, (unsigned long)vaddr, size, 0); - - up_write(&task->mm->mmap_sem); - if (r !=0) { - printk(KERN_ERR "perfmon: [%d] unable to unmap sampling buffer @%p size=%lu\n", task->pid, vaddr, size); - } - - DPRINT(("do_unmap(%p, %lu)=%d\n", vaddr, size, r)); - - return 0; -} - -/* - * free actual physical storage used by sampling buffer - */ -#if 0 -static int -pfm_free_smpl_buffer(pfm_context_t *ctx) -{ - pfm_buffer_fmt_t *fmt; - - if (ctx->ctx_smpl_hdr == NULL) goto invalid_free; - - /* - * we won't use the buffer format anymore - */ - fmt = ctx->ctx_buf_fmt; - - DPRINT(("sampling buffer @%p size %lu vaddr=%p\n", - ctx->ctx_smpl_hdr, - ctx->ctx_smpl_size, - ctx->ctx_smpl_vaddr)); - - pfm_buf_fmt_exit(fmt, current, NULL, NULL); - - /* - * free the buffer - */ - pfm_rvfree(ctx->ctx_smpl_hdr, ctx->ctx_smpl_size); - - ctx->ctx_smpl_hdr = NULL; - ctx->ctx_smpl_size = 0UL; - - return 0; - -invalid_free: - printk(KERN_ERR "perfmon: pfm_free_smpl_buffer [%d] no buffer\n", current->pid); - return -EINVAL; -} -#endif - -static inline void -pfm_exit_smpl_buffer(pfm_buffer_fmt_t *fmt) -{ - if (fmt == NULL) return; - - pfm_buf_fmt_exit(fmt, current, NULL, NULL); - -} - -/* - * pfmfs should _never_ be mounted by userland - too much of security hassle, - * no real gain from having the whole whorehouse mounted. So we don't need - * any operations on the root directory. However, we need a non-trivial - * d_name - pfm: will go nicely and kill the special-casing in procfs. 
- */ -static struct vfsmount *pfmfs_mnt; - -static int __init -init_pfm_fs(void) -{ - int err = register_filesystem(&pfm_fs_type); - if (!err) { - pfmfs_mnt = kern_mount(&pfm_fs_type); - err = PTR_ERR(pfmfs_mnt); - if (IS_ERR(pfmfs_mnt)) - unregister_filesystem(&pfm_fs_type); - else - err = 0; - } - return err; -} - -static void __exit -exit_pfm_fs(void) -{ - unregister_filesystem(&pfm_fs_type); - mntput(pfmfs_mnt); -} - -static ssize_t -pfm_read(struct file *filp, char __user *buf, size_t size, loff_t *ppos) -{ - pfm_context_t *ctx; - pfm_msg_t *msg; - ssize_t ret; - unsigned long flags; - DECLARE_WAITQUEUE(wait, current); - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); - return -EINVAL; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_read: NULL ctx [%d]\n", current->pid); - return -EINVAL; - } - - /* - * check even when there is no message - */ - if (size < sizeof(pfm_msg_t)) { - DPRINT(("message is too small ctx=%p (>=%ld)\n", ctx, sizeof(pfm_msg_t))); - return -EINVAL; - } - - PROTECT_CTX(ctx, flags); - - /* - * put ourselves on the wait queue - */ - add_wait_queue(&ctx->ctx_msgq_wait, &wait); - - - for(;;) { - /* - * check wait queue - */ - - set_current_state(TASK_INTERRUPTIBLE); - - DPRINT(("head=%d tail=%d\n", ctx->ctx_msgq_head, ctx->ctx_msgq_tail)); - - ret = 0; - if(PFM_CTXQ_EMPTY(ctx) == 0) break; - - UNPROTECT_CTX(ctx, flags); - - /* - * check non-blocking read - */ - ret = -EAGAIN; - if(filp->f_flags & O_NONBLOCK) break; - - /* - * check pending signals - */ - if(signal_pending(current)) { - ret = -EINTR; - break; - } - /* - * no message, so wait - */ - schedule(); - - PROTECT_CTX(ctx, flags); - } - DPRINT(("[%d] back to running ret=%ld\n", current->pid, ret)); - set_current_state(TASK_RUNNING); - remove_wait_queue(&ctx->ctx_msgq_wait, &wait); - - if (ret < 0) goto abort; - - ret = -EINVAL; - msg = pfm_get_next_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_read no msg for ctx=%p [%d]\n", ctx, current->pid); - goto abort_locked; - } - - DPRINT(("fd=%d type=%d\n", msg->pfm_gen_msg.msg_ctx_fd, msg->pfm_gen_msg.msg_type)); - - ret = -EFAULT; - if(copy_to_user(buf, msg, sizeof(pfm_msg_t)) == 0) ret = sizeof(pfm_msg_t); - -abort_locked: - UNPROTECT_CTX(ctx, flags); -abort: - return ret; -} - -static ssize_t -pfm_write(struct file *file, const char __user *ubuf, - size_t size, loff_t *ppos) -{ - DPRINT(("pfm_write called\n")); - return -EINVAL; -} - -static unsigned int -pfm_poll(struct file *filp, poll_table * wait) -{ - pfm_context_t *ctx; - unsigned long flags; - unsigned int mask = 0; - - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_poll: bad magic [%d]\n", current->pid); - return 0; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_poll: NULL ctx [%d]\n", current->pid); - return 0; - } - - - DPRINT(("pfm_poll ctx_fd=%d before poll_wait\n", ctx->ctx_fd)); - - poll_wait(filp, &ctx->ctx_msgq_wait, wait); - - PROTECT_CTX(ctx, flags); - - if (PFM_CTXQ_EMPTY(ctx) == 0) - mask = POLLIN | POLLRDNORM; - - UNPROTECT_CTX(ctx, flags); - - DPRINT(("pfm_poll ctx_fd=%d mask=0x%x\n", ctx->ctx_fd, mask)); - - return mask; -} - -static int -pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) -{ - DPRINT(("pfm_ioctl called\n")); - return -EINVAL; -} - -/* - * interrupt cannot be masked when coming here - */ -static inline int -pfm_do_fasync(int fd, struct file 
*filp, pfm_context_t *ctx, int on) -{ - int ret; - - ret = fasync_helper (fd, filp, on, &ctx->ctx_async_queue); - - DPRINT(("pfm_fasync called by [%d] on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - current->pid, - fd, - on, - ctx->ctx_async_queue, ret)); - - return ret; -} - -static int -pfm_fasync(int fd, struct file *filp, int on) -{ - pfm_context_t *ctx; - int ret; - - if (PFM_IS_FILE(filp) == 0) { - printk(KERN_ERR "perfmon: pfm_fasync bad magic [%d]\n", current->pid); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_fasync NULL ctx [%d]\n", current->pid); - return -EBADF; - } - /* - * we cannot mask interrupts during this call because this may - * may go to sleep if memory is not readily avalaible. - * - * We are protected from the conetxt disappearing by the get_fd()/put_fd() - * done in caller. Serialization of this function is ensured by caller. - */ - ret = pfm_do_fasync(fd, filp, ctx, on); - - - DPRINT(("pfm_fasync called on ctx_fd=%d on=%d async_queue=%p ret=%d\n", - fd, - on, - ctx->ctx_async_queue, ret)); - - return ret; -} - -#ifdef CONFIG_SMP -/* - * this function is exclusively called from pfm_close(). - * The context is not protected at that time, nor are interrupts - * on the remote CPU. That's necessary to avoid deadlocks. - */ -static void -pfm_syswide_force_stop(void *info) -{ - pfm_context_t *ctx = (pfm_context_t *)info; - struct pt_regs *regs = task_pt_regs(current); - struct task_struct *owner; - unsigned long flags; - int ret; - - if (ctx->ctx_cpu != smp_processor_id()) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop for CPU%d but on CPU%d\n", - ctx->ctx_cpu, - smp_processor_id()); - return; - } - owner = GET_PMU_OWNER(); - if (owner != ctx->ctx_task) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected owner [%d] instead of [%d]\n", - smp_processor_id(), - owner->pid, ctx->ctx_task->pid); - return; - } - if (GET_PMU_CTX() != ctx) { - printk(KERN_ERR "perfmon: pfm_syswide_force_stop CPU%d unexpected ctx %p instead of %p\n", - smp_processor_id(), - GET_PMU_CTX(), ctx); - return; - } - - DPRINT(("on CPU%d forcing system wide stop for [%d]\n", smp_processor_id(), ctx->ctx_task->pid)); - /* - * the context is already protected in pfm_close(), we simply - * need to mask interrupts to avoid a PMU interrupt race on - * this CPU - */ - local_irq_save(flags); - - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - DPRINT(("context_unload returned %d\n", ret)); - } - - /* - * unmask interrupts, PMU interrupts are now spurious here - */ - local_irq_restore(flags); -} - -static void -pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) -{ - int ret; - - DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); - ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); - DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); -} -#endif /* CONFIG_SMP */ - -/* - * called for each close(). Partially free resources. - * When caller is self-monitoring, the context is unloaded. 
- */ -static int -pfm_flush(struct file *filp, fl_owner_t id) -{ - pfm_context_t *ctx; - struct task_struct *task; - struct pt_regs *regs; - unsigned long flags; - unsigned long smpl_buf_size = 0UL; - void *smpl_buf_vaddr = NULL; - int state, is_system; - - if (PFM_IS_FILE(filp) == 0) { - DPRINT(("bad magic for\n")); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_flush: NULL ctx [%d]\n", current->pid); - return -EBADF; - } - - /* - * remove our file from the async queue, if we use this mode. - * This can be done without the context being protected. We come - * here when the context has become unreachable by other tasks. - * - * We may still have active monitoring at this point and we may - * end up in pfm_overflow_handler(). However, fasync_helper() - * operates with interrupts disabled and it cleans up the - * queue. If the PMU handler is called prior to entering - * fasync_helper() then it will send a signal. If it is - * invoked after, it will find an empty queue and no - * signal will be sent. In both case, we are safe - */ - if (filp->f_flags & FASYNC) { - DPRINT(("cleaning up async_queue=%p\n", ctx->ctx_async_queue)); - pfm_do_fasync (-1, filp, ctx, 0); - } - - PROTECT_CTX(ctx, flags); - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - task = PFM_CTX_TASK(ctx); - regs = task_pt_regs(task); - - DPRINT(("ctx_state=%d is_current=%d\n", - state, - task == current ? 1 : 0)); - - /* - * if state == UNLOADED, then task is NULL - */ - - /* - * we must stop and unload because we are losing access to the context. - */ - if (task == current) { -#ifdef CONFIG_SMP - /* - * the task IS the owner but it migrated to another CPU: that's bad - * but we must handle this cleanly. Unfortunately, the kernel does - * not provide a mechanism to block migration (while the context is loaded). - * - * We need to release the resource on the ORIGINAL cpu. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - /* - * keep context protected but unmask interrupt for IPI - */ - local_irq_restore(flags); - - pfm_syswide_cleanup_other_cpu(ctx); - - /* - * restore interrupt masking - */ - local_irq_save(flags); - - /* - * context is unloaded at this point - */ - } else -#endif /* CONFIG_SMP */ - { - - DPRINT(("forcing unload\n")); - /* - * stop and unload, returning with state UNLOADED - * and session unreserved. - */ - pfm_context_unload(ctx, NULL, 0, regs); - - DPRINT(("ctx_state=%d\n", ctx->ctx_state)); - } - } - - /* - * remove virtual mapping, if any, for the calling task. - * cannot reset ctx field until last user is calling close(). - * - * ctx_smpl_vaddr must never be cleared because it is needed - * by every task with access to the context - * - * When called from do_exit(), the mm context is gone already, therefore - * mm is NULL, i.e., the VMA is already gone and we do not have to - * do anything here - */ - if (ctx->ctx_smpl_vaddr && current->mm) { - smpl_buf_vaddr = ctx->ctx_smpl_vaddr; - smpl_buf_size = ctx->ctx_smpl_size; - } - - UNPROTECT_CTX(ctx, flags); - - /* - * if there was a mapping, then we systematically remove it - * at this point. Cannot be done inside critical section - * because some VM function reenables interrupts. - * - */ - if (smpl_buf_vaddr) pfm_remove_smpl_mapping(current, smpl_buf_vaddr, smpl_buf_size); - - return 0; -} -/* - * called either on explicit close() or from exit_files(). 
- * Only the LAST user of the file gets to this point, i.e., it is - * called only ONCE. - * - * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero - * (fput()),i.e, last task to access the file. Nobody else can access the - * file at this point. - * - * When called from exit_files(), the VMA has been freed because exit_mm() - * is executed before exit_files(). - * - * When called from exit_files(), the current task is not yet ZOMBIE but we - * flush the PMU state to the context. - */ -static int -pfm_close(struct inode *inode, struct file *filp) -{ - pfm_context_t *ctx; - struct task_struct *task; - struct pt_regs *regs; - DECLARE_WAITQUEUE(wait, current); - unsigned long flags; - unsigned long smpl_buf_size = 0UL; - void *smpl_buf_addr = NULL; - int free_possible = 1; - int state, is_system; - - DPRINT(("pfm_close called private=%p\n", filp->private_data)); - - if (PFM_IS_FILE(filp) == 0) { - DPRINT(("bad magic\n")); - return -EBADF; - } - - ctx = (pfm_context_t *)filp->private_data; - if (ctx == NULL) { - printk(KERN_ERR "perfmon: pfm_close: NULL ctx [%d]\n", current->pid); - return -EBADF; - } - - PROTECT_CTX(ctx, flags); - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - task = PFM_CTX_TASK(ctx); - regs = task_pt_regs(task); - - DPRINT(("ctx_state=%d is_current=%d\n", - state, - task == current ? 1 : 0)); - - /* - * if task == current, then pfm_flush() unloaded the context - */ - if (state == PFM_CTX_UNLOADED) goto doit; - - /* - * context is loaded/masked and task != current, we need to - * either force an unload or go zombie - */ - - /* - * The task is currently blocked or will block after an overflow. - * we must force it to wakeup to get out of the - * MASKED state and transition to the unloaded state by itself. - * - * This situation is only possible for per-task mode - */ - if (state == PFM_CTX_MASKED && CTX_OVFL_NOBLOCK(ctx) == 0) { - - /* - * set a "partial" zombie state to be checked - * upon return from down() in pfm_handle_work(). - * - * We cannot use the ZOMBIE state, because it is checked - * by pfm_load_regs() which is called upon wakeup from down(). - * In such case, it would free the context and then we would - * return to pfm_handle_work() which would access the - * stale context. Instead, we set a flag invisible to pfm_load_regs() - * but visible to pfm_handle_work(). - * - * For some window of time, we have a zombie context with - * ctx_state = MASKED and not ZOMBIE - */ - ctx->ctx_fl_going_zombie = 1; - - /* - * force task to wake up from MASKED state - */ - complete(&ctx->ctx_restart_done); - - DPRINT(("waking up ctx_state=%d\n", state)); - - /* - * put ourself to sleep waiting for the other - * task to report completion - * - * the context is protected by mutex, therefore there - * is no risk of being notified of completion before - * begin actually on the waitq. 
- */ - set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&ctx->ctx_zombieq, &wait); - - UNPROTECT_CTX(ctx, flags); - - /* - * XXX: check for signals : - * - ok for explicit close - * - not ok when coming from exit_files() - */ - schedule(); - - - PROTECT_CTX(ctx, flags); - - - remove_wait_queue(&ctx->ctx_zombieq, &wait); - set_current_state(TASK_RUNNING); - - /* - * context is unloaded at this point - */ - DPRINT(("after zombie wakeup ctx_state=%d for\n", state)); - } - else if (task != current) { -#ifdef CONFIG_SMP - /* - * switch context to zombie state - */ - ctx->ctx_state = PFM_CTX_ZOMBIE; - - DPRINT(("zombie ctx for [%d]\n", task->pid)); - /* - * cannot free the context on the spot. deferred until - * the task notices the ZOMBIE state - */ - free_possible = 0; -#else - pfm_context_unload(ctx, NULL, 0, regs); -#endif - } - -doit: - /* reload state, may have changed during opening of critical section */ - state = ctx->ctx_state; - - /* - * the context is still attached to a task (possibly current) - * we cannot destroy it right now - */ - - /* - * we must free the sampling buffer right here because - * we cannot rely on it being cleaned up later by the - * monitored task. It is not possible to free vmalloc'ed - * memory in pfm_load_regs(). Instead, we remove the buffer - * now. should there be subsequent PMU overflow originally - * meant for sampling, the will be converted to spurious - * and that's fine because the monitoring tools is gone anyway. - */ - if (ctx->ctx_smpl_hdr) { - smpl_buf_addr = ctx->ctx_smpl_hdr; - smpl_buf_size = ctx->ctx_smpl_size; - /* no more sampling */ - ctx->ctx_smpl_hdr = NULL; - ctx->ctx_fl_is_sampling = 0; - } - - DPRINT(("ctx_state=%d free_possible=%d addr=%p size=%lu\n", - state, - free_possible, - smpl_buf_addr, - smpl_buf_size)); - - if (smpl_buf_addr) pfm_exit_smpl_buffer(ctx->ctx_buf_fmt); - - /* - * UNLOADED that the session has already been unreserved. - */ - if (state == PFM_CTX_ZOMBIE) { - pfm_unreserve_session(ctx, ctx->ctx_fl_system , ctx->ctx_cpu); - } - - /* - * disconnect file descriptor from context must be done - * before we unlock. - */ - filp->private_data = NULL; - - /* - * if we free on the spot, the context is now completely unreachable - * from the callers side. The monitored task side is also cut, so we - * can freely cut. - * - * If we have a deferred free, only the caller side is disconnected. - */ - UNPROTECT_CTX(ctx, flags); - - /* - * All memory free operations (especially for vmalloc'ed memory) - * MUST be done with interrupts ENABLED. 
- */ - if (smpl_buf_addr) pfm_rvfree(smpl_buf_addr, smpl_buf_size); - - /* - * return the memory used by the context - */ - if (free_possible) pfm_context_free(ctx); - - return 0; -} - -static int -pfm_no_open(struct inode *irrelevant, struct file *dontcare) -{ - DPRINT(("pfm_no_open called\n")); - return -ENXIO; -} - - - -static const struct file_operations pfm_file_ops = { - .llseek = no_llseek, - .read = pfm_read, - .write = pfm_write, - .poll = pfm_poll, - .ioctl = pfm_ioctl, - .open = pfm_no_open, /* special open code to disallow open via /proc */ - .fasync = pfm_fasync, - .release = pfm_close, - .flush = pfm_flush -}; - -static int -pfmfs_delete_dentry(struct dentry *dentry) -{ - return 1; -} - -static struct dentry_operations pfmfs_dentry_operations = { - .d_delete = pfmfs_delete_dentry, -}; - - -static int -pfm_alloc_fd(struct file **cfile) -{ - int fd, ret = 0; - struct file *file = NULL; - struct inode * inode; - char name[32]; - struct qstr this; - - fd = get_unused_fd(); - if (fd < 0) return -ENFILE; - - ret = -ENFILE; - - file = get_empty_filp(); - if (!file) goto out; - - /* - * allocate a new inode - */ - inode = new_inode(pfmfs_mnt->mnt_sb); - if (!inode) goto out; - - DPRINT(("new inode ino=%ld @%p\n", inode->i_ino, inode)); - - inode->i_mode = S_IFCHR|S_IRUGO; - inode->i_uid = current->fsuid; - inode->i_gid = current->fsgid; - - sprintf(name, "[%lu]", inode->i_ino); - this.name = name; - this.len = strlen(name); - this.hash = inode->i_ino; - - ret = -ENOMEM; - - /* - * allocate a new dcache entry - */ - file->f_path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!file->f_path.dentry) goto out; - - file->f_path.dentry->d_op = &pfmfs_dentry_operations; - - d_add(file->f_path.dentry, inode); - file->f_path.mnt = mntget(pfmfs_mnt); - file->f_mapping = inode->i_mapping; - - file->f_op = &pfm_file_ops; - file->f_mode = FMODE_READ; - file->f_flags = O_RDONLY; - file->f_pos = 0; - - /* - * may have to delay until context is attached? 
- */ - fd_install(fd, file); - - /* - * the file structure we will use - */ - *cfile = file; - - return fd; -out: - if (file) put_filp(file); - put_unused_fd(fd); - return ret; -} - -static void -pfm_free_fd(int fd, struct file *file) -{ - struct files_struct *files = current->files; - struct fdtable *fdt; - - /* - * there ie no fd_uninstall(), so we do it here - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - rcu_assign_pointer(fdt->fd[fd], NULL); - spin_unlock(&files->file_lock); - - if (file) - put_filp(file); - put_unused_fd(fd); -} - -static int -pfm_remap_buffer(struct vm_area_struct *vma, unsigned long buf, unsigned long addr, unsigned long size) -{ - DPRINT(("CPU%d buf=0x%lx addr=0x%lx size=%ld\n", smp_processor_id(), buf, addr, size)); - - while (size > 0) { - unsigned long pfn = ia64_tpa(buf) >> PAGE_SHIFT; - - - if (remap_pfn_range(vma, addr, pfn, PAGE_SIZE, PAGE_READONLY)) - return -ENOMEM; - - addr += PAGE_SIZE; - buf += PAGE_SIZE; - size -= PAGE_SIZE; - } - return 0; -} - -/* - * allocate a sampling buffer and remaps it into the user address space of the task - */ -static int -pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned long rsize, void **user_vaddr) -{ - struct mm_struct *mm = task->mm; - struct vm_area_struct *vma = NULL; - unsigned long size; - void *smpl_buf; - - - /* - * the fixed header + requested size and align to page boundary - */ - size = PAGE_ALIGN(rsize); - - DPRINT(("sampling buffer rsize=%lu size=%lu bytes\n", rsize, size)); - - /* - * check requested size to avoid Denial-of-service attacks - * XXX: may have to refine this test - * Check against address space limit. - * - * if ((mm->total_vm << PAGE_SHIFT) + len> task->rlim[RLIMIT_AS].rlim_cur) - * return -ENOMEM; - */ - if (size > task->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) - return -ENOMEM; - - /* - * We do the easy to undo allocations first. - * - * pfm_rvmalloc(), clears the buffer, so there is no leak - */ - smpl_buf = pfm_rvmalloc(size); - if (smpl_buf == NULL) { - DPRINT(("Can't allocate sampling buffer\n")); - return -ENOMEM; - } - - DPRINT(("smpl_buf @%p\n", smpl_buf)); - - /* allocate vma */ - vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); - if (!vma) { - DPRINT(("Cannot allocate vma\n")); - goto error_kmem; - } - - /* - * partially initialize the vma for the sampling buffer - */ - vma->vm_mm = mm; - vma->vm_file = filp; - vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; - vma->vm_page_prot = PAGE_READONLY; /* XXX may need to change */ - - /* - * Now we have everything we need and we can initialize - * and connect all the data structures - */ - - ctx->ctx_smpl_hdr = smpl_buf; - ctx->ctx_smpl_size = size; /* aligned size */ - - /* - * Let's do the difficult operations next. - * - * now we atomically find some area in the address space and - * remap the buffer in it. 
- */ - down_write(&task->mm->mmap_sem); - - /* find some free area in address space, must have mmap sem held */ - vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0); - if (vma->vm_start == 0UL) { - DPRINT(("Cannot find unmapped area for size %ld\n", size)); - up_write(&task->mm->mmap_sem); - goto error; - } - vma->vm_end = vma->vm_start + size; - vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; - - DPRINT(("aligned size=%ld, hdr=%p mapped @0x%lx\n", size, ctx->ctx_smpl_hdr, vma->vm_start)); - - /* can only be applied to current task, need to have the mm semaphore held when called */ - if (pfm_remap_buffer(vma, (unsigned long)smpl_buf, vma->vm_start, size)) { - DPRINT(("Can't remap buffer\n")); - up_write(&task->mm->mmap_sem); - goto error; - } - - get_file(filp); - - /* - * now insert the vma in the vm list for the process, must be - * done with mmap lock held - */ - insert_vm_struct(mm, vma); - - mm->total_vm += size >> PAGE_SHIFT; - vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, - vma_pages(vma)); - up_write(&task->mm->mmap_sem); - - /* - * keep track of user level virtual address - */ - ctx->ctx_smpl_vaddr = (void *)vma->vm_start; - *(unsigned long *)user_vaddr = vma->vm_start; - - return 0; - -error: - kmem_cache_free(vm_area_cachep, vma); -error_kmem: - pfm_rvfree(smpl_buf, size); - - return -ENOMEM; -} - -/* - * XXX: do something better here - */ -static int -pfm_bad_permissions(struct task_struct *task) -{ - /* inspired by ptrace_attach() */ - DPRINT(("cur: uid=%d gid=%d task: euid=%d suid=%d uid=%d egid=%d sgid=%d\n", - current->uid, - current->gid, - task->euid, - task->suid, - task->uid, - task->egid, - task->sgid)); - - return ((current->uid != task->euid) - || (current->uid != task->suid) - || (current->uid != task->uid) - || (current->gid != task->egid) - || (current->gid != task->sgid) - || (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE); -} - -static int -pfarg_is_sane(struct task_struct *task, pfarg_context_t *pfx) -{ - int ctx_flags; - - /* valid signal */ - - ctx_flags = pfx->ctx_flags; - - if (ctx_flags & PFM_FL_SYSTEM_WIDE) { - - /* - * cannot block in this mode - */ - if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { - DPRINT(("cannot use blocking mode when in system wide monitoring\n")); - return -EINVAL; - } - } else { - } - /* probably more to add here */ - - return 0; -} - -static int -pfm_setup_buffer_fmt(struct task_struct *task, struct file *filp, pfm_context_t *ctx, unsigned int ctx_flags, - unsigned int cpu, pfarg_context_t *arg) -{ - pfm_buffer_fmt_t *fmt = NULL; - unsigned long size = 0UL; - void *uaddr = NULL; - void *fmt_arg = NULL; - int ret = 0; -#define PFM_CTXARG_BUF_ARG(a) (pfm_buffer_fmt_t *)(a+1) - - /* invoke and lock buffer format, if found */ - fmt = pfm_find_buffer_fmt(arg->ctx_smpl_buf_id); - if (fmt == NULL) { - DPRINT(("[%d] cannot find buffer format\n", task->pid)); - return -EINVAL; - } - - /* - * buffer argument MUST be contiguous to pfarg_context_t - */ - if (fmt->fmt_arg_size) fmt_arg = PFM_CTXARG_BUF_ARG(arg); - - ret = pfm_buf_fmt_validate(fmt, task, ctx_flags, cpu, fmt_arg); - - DPRINT(("[%d] after validate(0x%x,%d,%p)=%d\n", task->pid, ctx_flags, cpu, fmt_arg, ret)); - - if (ret) goto error; - - /* link buffer format and context */ - ctx->ctx_buf_fmt = fmt; - - /* - * check if buffer format wants to use perfmon buffer allocation/mapping service - */ - ret = pfm_buf_fmt_getsize(fmt, task, ctx_flags, cpu, fmt_arg, &size); - if (ret) goto error; - - if (size) { - /* - * buffer is always remapped 
into the caller's address space - */ - ret = pfm_smpl_buffer_alloc(current, filp, ctx, size, &uaddr); - if (ret) goto error; - - /* keep track of user address of buffer */ - arg->ctx_smpl_vaddr = uaddr; - } - ret = pfm_buf_fmt_init(fmt, task, ctx->ctx_smpl_hdr, ctx_flags, cpu, fmt_arg); - -error: - return ret; -} - -static void -pfm_reset_pmu_state(pfm_context_t *ctx) -{ - int i; - - /* - * install reset values for PMC. - */ - for (i=1; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - ctx->ctx_pmcs[i] = PMC_DFL_VAL(i); - DPRINT(("pmc[%d]=0x%lx\n", i, ctx->ctx_pmcs[i])); - } - /* - * PMD registers are set to 0UL when the context is memset() - */ - - /* - * On context switched restore, we must restore ALL pmc and ALL pmd even - * when they are not actively used by the task. In UP, the incoming process - * may otherwise pick up left over PMC, PMD state from the previous process. - * As opposed to PMD, stale PMC can cause harm to the incoming - * process because they may change what is being measured. - * Therefore, we must systematically reinstall the entire - * PMC state. In SMP, the same thing is possible on the - * same CPU but also between 2 CPUs. - * - * The problem with PMD is information leaking especially - * to user level when psr.sp=0 - * - * There is unfortunately no easy way to avoid this problem - * on either UP or SMP. This definitely slows down the - * pfm_load_regs() function. - */ - - /* - * bitmask of all PMCs accessible to this context - * - * PMC0 is treated differently. - */ - ctx->ctx_all_pmcs[0] = pmu_conf->impl_pmcs[0] & ~0x1; - - /* - * bitmask of all PMDs that are accessible to this context - */ - ctx->ctx_all_pmds[0] = pmu_conf->impl_pmds[0]; - - DPRINT(("<%d> all_pmcs=0x%lx all_pmds=0x%lx\n", ctx->ctx_fd, ctx->ctx_all_pmcs[0],ctx->ctx_all_pmds[0])); - - /* - * useful in case of re-enable after disable - */ - ctx->ctx_used_ibrs[0] = 0UL; - ctx->ctx_used_dbrs[0] = 0UL; -} - -static int -pfm_ctx_getsize(void *arg, size_t *sz) -{ - pfarg_context_t *req = (pfarg_context_t *)arg; - pfm_buffer_fmt_t *fmt; - - *sz = 0; - - if (!pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) return 0; - - fmt = pfm_find_buffer_fmt(req->ctx_smpl_buf_id); - if (fmt == NULL) { - DPRINT(("cannot find buffer format\n")); - return -EINVAL; - } - /* get just enough to copy in user parameters */ - *sz = fmt->fmt_arg_size; - DPRINT(("arg_size=%lu\n", *sz)); - - return 0; -} - - - -/* - * cannot attach if : - * - kernel task - * - task not owned by caller - * - task incompatible with context mode - */ -static int -pfm_task_incompatible(pfm_context_t *ctx, struct task_struct *task) -{ - /* - * no kernel task or task not owned by caller - */ - if (task->mm == NULL) { - DPRINT(("task [%d] has no memory context (kernel thread)\n", task->pid)); - return -EPERM; - } - if (pfm_bad_permissions(task)) { - DPRINT(("no permission to attach to [%d]\n", task->pid)); - return -EPERM; - } - /* - * cannot block in self-monitoring mode - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && task == current) { - DPRINT(("cannot load a blocking context on self for [%d]\n", task->pid)); - return -EINVAL; - } - - if (task->exit_state == EXIT_ZOMBIE) { - DPRINT(("cannot attach to zombie task [%d]\n", task->pid)); - return -EBUSY; - } - - /* - * always ok for self - */ - if (task == current) return 0; - - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("cannot attach to non-stopped task [%d] state=%ld\n", task->pid, task->state)); - return -EBUSY; - } - /* - * make sure the task
is off any CPU - */ - wait_task_inactive(task); - - /* more to come... */ - - return 0; -} - -static int -pfm_get_task(pfm_context_t *ctx, pid_t pid, struct task_struct **task) -{ - struct task_struct *p = current; - int ret; - - /* XXX: need to add more checks here */ - if (pid < 2) return -EPERM; - - if (pid != current->pid) { - - read_lock(&tasklist_lock); - - p = find_task_by_pid(pid); - - /* make sure task cannot go away while we operate on it */ - if (p) get_task_struct(p); - - read_unlock(&tasklist_lock); - - if (p == NULL) return -ESRCH; - } - - ret = pfm_task_incompatible(ctx, p); - if (ret == 0) { - *task = p; - } else if (p != current) { - pfm_put_task(p); - } - return ret; -} - - - -static int -pfm_context_create(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_context_t *req = (pfarg_context_t *)arg; - struct file *filp; - int ctx_flags; - int ret; - - /* let's check the arguments first */ - ret = pfarg_is_sane(current, req); - if (ret < 0) return ret; - - ctx_flags = req->ctx_flags; - - ret = -ENOMEM; - - ctx = pfm_context_alloc(); - if (!ctx) goto error; - - ret = pfm_alloc_fd(&filp); - if (ret < 0) goto error_file; - - req->ctx_fd = ctx->ctx_fd = ret; - - /* - * attach context to file - */ - filp->private_data = ctx; - - /* - * does the user want to sample? - */ - if (pfm_uuid_cmp(req->ctx_smpl_buf_id, pfm_null_uuid)) { - ret = pfm_setup_buffer_fmt(current, filp, ctx, ctx_flags, 0, req); - if (ret) goto buffer_error; - } - - /* - * init context protection lock - */ - spin_lock_init(&ctx->ctx_lock); - - /* - * context is unloaded - */ - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * initialization of context's flags - */ - ctx->ctx_fl_block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; - ctx->ctx_fl_system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; - ctx->ctx_fl_is_sampling = ctx->ctx_buf_fmt ? 1 : 0; /* assume record() is defined */ - ctx->ctx_fl_no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; - /* - * will move to set properties - * ctx->ctx_fl_excl_idle = (ctx_flags & PFM_FL_EXCL_IDLE) ? 1: 0; - */ - - /* - * init restart semaphore to locked - */ - init_completion(&ctx->ctx_restart_done); - - /* - * activation is used in SMP only - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * initialize notification message queue - */ - ctx->ctx_msgq_head = ctx->ctx_msgq_tail = 0; - init_waitqueue_head(&ctx->ctx_msgq_wait); - init_waitqueue_head(&ctx->ctx_zombieq); - - DPRINT(("ctx=%p flags=0x%x system=%d notify_block=%d excl_idle=%d no_msg=%d ctx_fd=%d \n", - ctx, - ctx_flags, - ctx->ctx_fl_system, - ctx->ctx_fl_block, - ctx->ctx_fl_excl_idle, - ctx->ctx_fl_no_msg, - ctx->ctx_fd)); - - /* - * initialize soft PMU state - */ - pfm_reset_pmu_state(ctx); - - return 0; - -buffer_error: - pfm_free_fd(ctx->ctx_fd, filp); - - if (ctx->ctx_buf_fmt) { - pfm_buf_fmt_exit(ctx->ctx_buf_fmt, current, NULL, regs); - } -error_file: - pfm_context_free(ctx); - -error: - return ret; -} - -static inline unsigned long -pfm_new_counter_value (pfm_counter_t *reg, int is_long_reset) -{ - unsigned long val = is_long_reset ? reg->long_reset : reg->short_reset; - unsigned long new_seed, old_seed = reg->seed, mask = reg->mask; - extern unsigned long carta_random32 (unsigned long seed); - - if (reg->flags & PFM_REGFL_RANDOM) { - new_seed = carta_random32(old_seed); - val -= (old_seed & mask); /* counter values are negative numbers! 
*/ - if ((mask >> 32) != 0) - /* construct a full 64-bit random value: */ - new_seed |= carta_random32(old_seed >> 32) << 32; - reg->seed = new_seed; - } - reg->lval = val; - return val; -} - -static void -pfm_reset_regs_masked(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) -{ - unsigned long mask = ovfl_regs[0]; - unsigned long reset_others = 0UL; - unsigned long val; - int i; - - /* - * now restore reset value on sampling overflowed counters - */ - mask >>= PMU_FIRST_COUNTER; - for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { - - if ((mask & 0x1UL) == 0UL) continue; - - ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); - reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; - - DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); - } - - /* - * Now take care of resetting the other registers - */ - for(i = 0; reset_others; i++, reset_others >>= 1) { - - if ((reset_others & 0x1) == 0) continue; - - ctx->ctx_pmds[i].val = val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); - - DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", - is_long_reset ? "long" : "short", i, val)); - } -} - -static void -pfm_reset_regs(pfm_context_t *ctx, unsigned long *ovfl_regs, int is_long_reset) -{ - unsigned long mask = ovfl_regs[0]; - unsigned long reset_others = 0UL; - unsigned long val; - int i; - - DPRINT_ovfl(("ovfl_regs=0x%lx is_long_reset=%d\n", ovfl_regs[0], is_long_reset)); - - if (ctx->ctx_state == PFM_CTX_MASKED) { - pfm_reset_regs_masked(ctx, ovfl_regs, is_long_reset); - return; - } - - /* - * now restore reset value on sampling overflowed counters - */ - mask >>= PMU_FIRST_COUNTER; - for(i = PMU_FIRST_COUNTER; mask; i++, mask >>= 1) { - - if ((mask & 0x1UL) == 0UL) continue; - - val = pfm_new_counter_value(ctx->ctx_pmds+ i, is_long_reset); - reset_others |= ctx->ctx_pmds[i].reset_pmds[0]; - - DPRINT_ovfl((" %s reset ctx_pmds[%d]=%lx\n", is_long_reset ? "long" : "short", i, val)); - - pfm_write_soft_counter(ctx, i, val); - } - - /* - * Now take care of resetting the other registers - */ - for(i = 0; reset_others; i++, reset_others >>= 1) { - - if ((reset_others & 0x1) == 0) continue; - - val = pfm_new_counter_value(ctx->ctx_pmds + i, is_long_reset); - - if (PMD_IS_COUNTING(i)) { - pfm_write_soft_counter(ctx, i, val); - } else { - ia64_set_pmd(i, val); - } - DPRINT_ovfl(("%s reset_others pmd[%d]=%lx\n", - is_long_reset ? "long" : "short", i, val)); - } - ia64_srlz_d(); -} - -static int -pfm_write_pmcs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, pmc_pm; - unsigned long smpl_pmds, reset_pmds, impl_pmds; - unsigned int cnum, reg_flags, flags, pmc_type; - int i, can_access_pmu = 0, is_loaded, is_system, expert_mode; - int is_monitor, is_counting, state; - int ret = -EINVAL; - pfm_reg_check_t wr_func; -#define PFM_CHECK_PMC_PM(x, y, z) ((x)->ctx_fl_system ^ PMC_PM(y, z)) - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - task = ctx->ctx_task; - impl_pmds = pmu_conf->impl_pmds[0]; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - if (is_loaded) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. 
- */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - } - expert_mode = pfm_sysctl.expert_mode; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - reg_flags = req->reg_flags; - value = req->reg_value; - smpl_pmds = req->reg_smpl_pmds[0]; - reset_pmds = req->reg_reset_pmds[0]; - flags = 0; - - - if (cnum >= PMU_MAX_PMCS) { - DPRINT(("pmc%u is invalid\n", cnum)); - goto error; - } - - pmc_type = pmu_conf->pmc_desc[cnum].type; - pmc_pm = (value >> pmu_conf->pmc_desc[cnum].pm_pos) & 0x1; - is_counting = (pmc_type & PFM_REG_COUNTING) == PFM_REG_COUNTING ? 1 : 0; - is_monitor = (pmc_type & PFM_REG_MONITOR) == PFM_REG_MONITOR ? 1 : 0; - - /* - * we reject all non implemented PMC as well - * as attempts to modify PMC[0-3] which are used - * as status registers by the PMU - */ - if ((pmc_type & PFM_REG_IMPL) == 0 || (pmc_type & PFM_REG_CONTROL) == PFM_REG_CONTROL) { - DPRINT(("pmc%u is unimplemented or no-access pmc_type=%x\n", cnum, pmc_type)); - goto error; - } - wr_func = pmu_conf->pmc_desc[cnum].write_check; - /* - * If the PMC is a monitor, then if the value is not the default: - * - system-wide session: PMCx.pm=1 (privileged monitor) - * - per-task : PMCx.pm=0 (user monitor) - */ - if (is_monitor && value != PMC_DFL_VAL(cnum) && is_system ^ pmc_pm) { - DPRINT(("pmc%u pmc_pm=%lu is_system=%d\n", - cnum, - pmc_pm, - is_system)); - goto error; - } - - if (is_counting) { - /* - * enforce generation of overflow interrupt. Necessary on all - * CPUs. - */ - value |= 1 << PMU_PMC_OI; - - if (reg_flags & PFM_REGFL_OVFL_NOTIFY) { - flags |= PFM_REGFL_OVFL_NOTIFY; - } - - if (reg_flags & PFM_REGFL_RANDOM) flags |= PFM_REGFL_RANDOM; - - /* verify validity of smpl_pmds */ - if ((smpl_pmds & impl_pmds) != smpl_pmds) { - DPRINT(("invalid smpl_pmds 0x%lx for pmc%u\n", smpl_pmds, cnum)); - goto error; - } - - /* verify validity of reset_pmds */ - if ((reset_pmds & impl_pmds) != reset_pmds) { - DPRINT(("invalid reset_pmds 0x%lx for pmc%u\n", reset_pmds, cnum)); - goto error; - } - } else { - if (reg_flags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { - DPRINT(("cannot set ovfl_notify or random on pmc%u\n", cnum)); - goto error; - } - /* eventid on non-counting monitors are ignored */ - } - - /* - * execute write checker, if any - */ - if (likely(expert_mode == 0 && wr_func)) { - ret = (*wr_func)(task, ctx, cnum, &value, regs); - if (ret) goto error; - ret = -EINVAL; - } - - /* - * no error on this register - */ - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - /* - * Now we commit the changes to the software state - */ - - /* - * update overflow information - */ - if (is_counting) { - /* - * full flag update each time a register is programmed - */ - ctx->ctx_pmds[cnum].flags = flags; - - ctx->ctx_pmds[cnum].reset_pmds[0] = reset_pmds; - ctx->ctx_pmds[cnum].smpl_pmds[0] = smpl_pmds; - ctx->ctx_pmds[cnum].eventid = req->reg_smpl_eventid; - - /* - * Mark all PMDS to be accessed as used. - * - * We do not keep track of PMC because we have to - * systematically restore ALL of them. - * - * We do not update the used_monitors mask, because - * if we have not programmed them, then will be in - * a quiescent state, therefore we will not need to - * mask/restore then when context is MASKED. 
- */ - CTX_USED_PMD(ctx, reset_pmds); - CTX_USED_PMD(ctx, smpl_pmds); - /* - * make sure we do not try to reset on - * restart because we have established new values - */ - if (state == PFM_CTX_MASKED) ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; - } - /* - * Needed in case the user does not initialize the equivalent - * PMD. Clearing is done indirectly via pfm_reset_pmu_state() so there is no - * possible leak here. - */ - CTX_USED_PMD(ctx, pmu_conf->pmc_desc[cnum].dep_pmd[0]); - - /* - * keep track of the monitor PMC that we are using. - * we save the value of the pmc in ctx_pmcs[] and if - * the monitoring is not stopped for the context we also - * place it in the saved state area so that it will be - * picked up later by the context switch code. - * - * The value in ctx_pmcs[] can only be changed in pfm_write_pmcs(). - * - * The value in th_pmcs[] may be modified on overflow, i.e., when - * monitoring needs to be stopped. - */ - if (is_monitor) CTX_USED_MONITOR(ctx, 1UL << cnum); - - /* - * update context state - */ - ctx->ctx_pmcs[cnum] = value; - - if (is_loaded) { - /* - * write thread state - */ - if (is_system == 0) ctx->th_pmcs[cnum] = value; - - /* - * write hardware register if we can - */ - if (can_access_pmu) { - ia64_set_pmc(cnum, value); - } -#ifdef CONFIG_SMP - else { - /* - * per-task SMP only here - * - * we are guaranteed that the task is not running on the other CPU, - * we indicate that this PMD will need to be reloaded if the task - * is rescheduled on the CPU it ran last on. - */ - ctx->ctx_reload_pmcs[0] |= 1UL << cnum; - } -#endif - } - - DPRINT(("pmc[%u]=0x%lx ld=%d apmu=%d flags=0x%x all_pmcs=0x%lx used_pmds=0x%lx eventid=%ld smpl_pmds=0x%lx reset_pmds=0x%lx reloads_pmcs=0x%lx used_monitors=0x%lx ovfl_regs=0x%lx\n", - cnum, - value, - is_loaded, - can_access_pmu, - flags, - ctx->ctx_all_pmcs[0], - ctx->ctx_used_pmds[0], - ctx->ctx_pmds[cnum].eventid, - smpl_pmds, - reset_pmds, - ctx->ctx_reload_pmcs[0], - ctx->ctx_used_monitors[0], - ctx->ctx_ovfl_regs[0])); - } - - /* - * make sure the changes are visible - */ - if (can_access_pmu) ia64_srlz_d(); - - return 0; -error: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -static int -pfm_write_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned long value, hw_value, ovfl_mask; - unsigned int cnum; - int i, can_access_pmu = 0, state; - int is_counting, is_loaded, is_system, expert_mode; - int ret = -EINVAL; - pfm_reg_check_t wr_func; - - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - task = ctx->ctx_task; - - if (unlikely(state == PFM_CTX_ZOMBIE)) return -EINVAL; - - /* - * on both UP and SMP, we can only write to the PMC when the task is - * the owner of the local PMU. - */ - if (likely(is_loaded)) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 
1 : 0; - } - expert_mode = pfm_sysctl.expert_mode; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - value = req->reg_value; - - if (!PMD_IS_IMPL(cnum)) { - DPRINT(("pmd[%u] is unimplemented or invalid\n", cnum)); - goto abort_mission; - } - is_counting = PMD_IS_COUNTING(cnum); - wr_func = pmu_conf->pmd_desc[cnum].write_check; - - /* - * execute write checker, if any - */ - if (unlikely(expert_mode == 0 && wr_func)) { - unsigned long v = value; - - ret = (*wr_func)(task, ctx, cnum, &v, regs); - if (ret) goto abort_mission; - - value = v; - ret = -EINVAL; - } - - /* - * no error on this register - */ - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - /* - * now commit changes to software state - */ - hw_value = value; - - /* - * update virtualized (64bits) counter - */ - if (is_counting) { - /* - * write context state - */ - ctx->ctx_pmds[cnum].lval = value; - - /* - * when context is load we use the split value - */ - if (is_loaded) { - hw_value = value & ovfl_mask; - value = value & ~ovfl_mask; - } - } - /* - * update reset values (not just for counters) - */ - ctx->ctx_pmds[cnum].long_reset = req->reg_long_reset; - ctx->ctx_pmds[cnum].short_reset = req->reg_short_reset; - - /* - * update randomization parameters (not just for counters) - */ - ctx->ctx_pmds[cnum].seed = req->reg_random_seed; - ctx->ctx_pmds[cnum].mask = req->reg_random_mask; - - /* - * update context value - */ - ctx->ctx_pmds[cnum].val = value; - - /* - * Keep track of what we use - * - * We do not keep track of PMC because we have to - * systematically restore ALL of them. - */ - CTX_USED_PMD(ctx, PMD_PMD_DEP(cnum)); - - /* - * mark this PMD register used as well - */ - CTX_USED_PMD(ctx, RDEP(cnum)); - - /* - * make sure we do not try to reset on - * restart because we have established new values - */ - if (is_counting && state == PFM_CTX_MASKED) { - ctx->ctx_ovfl_regs[0] &= ~1UL << cnum; - } - - if (is_loaded) { - /* - * write thread state - */ - if (is_system == 0) ctx->th_pmds[cnum] = hw_value; - - /* - * write hardware register if we can - */ - if (can_access_pmu) { - ia64_set_pmd(cnum, hw_value); - } else { -#ifdef CONFIG_SMP - /* - * we are guaranteed that the task is not running on the other CPU, - * we indicate that this PMD will need to be reloaded if the task - * is rescheduled on the CPU it ran last on. - */ - ctx->ctx_reload_pmds[0] |= 1UL << cnum; -#endif - } - } - - DPRINT(("pmd[%u]=0x%lx ld=%d apmu=%d, hw_value=0x%lx ctx_pmd=0x%lx short_reset=0x%lx " - "long_reset=0x%lx notify=%c seed=0x%lx mask=0x%lx used_pmds=0x%lx reset_pmds=0x%lx reload_pmds=0x%lx all_pmds=0x%lx ovfl_regs=0x%lx\n", - cnum, - value, - is_loaded, - can_access_pmu, - hw_value, - ctx->ctx_pmds[cnum].val, - ctx->ctx_pmds[cnum].short_reset, - ctx->ctx_pmds[cnum].long_reset, - PMC_OVFL_NOTIFY(ctx, cnum) ? 'Y':'N', - ctx->ctx_pmds[cnum].seed, - ctx->ctx_pmds[cnum].mask, - ctx->ctx_used_pmds[0], - ctx->ctx_pmds[cnum].reset_pmds[0], - ctx->ctx_reload_pmds[0], - ctx->ctx_all_pmds[0], - ctx->ctx_ovfl_regs[0])); - } - - /* - * make changes visible - */ - if (can_access_pmu) ia64_srlz_d(); - - return 0; - -abort_mission: - /* - * for now, we have only one possibility for error - */ - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -/* - * By the way of PROTECT_CONTEXT(), interrupts are masked while we are in this function. - * Therefore we know, we do not have to worry about the PMU overflow interrupt. 
If an - * interrupt is delivered during the call, it will be kept pending until we leave, making - * it appears as if it had been generated at the UNPROTECT_CONTEXT(). At least we are - * guaranteed to return consistent data to the user, it may simply be old. It is not - * trivial to treat the overflow while inside the call because you may end up in - * some module sampling buffer code causing deadlocks. - */ -static int -pfm_read_pmds(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - unsigned long val = 0UL, lval, ovfl_mask, sval; - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned int cnum, reg_flags = 0; - int i, can_access_pmu = 0, state; - int is_loaded, is_system, is_counting, expert_mode; - int ret = -EINVAL; - pfm_reg_check_t rd_func; - - /* - * access is possible when loaded only for - * self-monitoring tasks or in UP mode - */ - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - ovfl_mask = pmu_conf->ovfl_val; - task = ctx->ctx_task; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - if (likely(is_loaded)) { - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - /* - * this can be true when not self-monitoring only in UP - */ - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - - if (can_access_pmu) ia64_srlz_d(); - } - expert_mode = pfm_sysctl.expert_mode; - - DPRINT(("ld=%d apmu=%d ctx_state=%d\n", - is_loaded, - can_access_pmu, - state)); - - /* - * on both UP and SMP, we can only read the PMD from the hardware register when - * the task is the owner of the local PMU. - */ - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - reg_flags = req->reg_flags; - - if (unlikely(!PMD_IS_IMPL(cnum))) goto error; - /* - * we can only read the register that we use. That includes - * the one we explicitly initialize AND the one we want included - * in the sampling buffer (smpl_regs). - * - * Having this restriction allows optimization in the ctxsw routine - * without compromising security (leaks) - */ - if (unlikely(!CTX_IS_USED_PMD(ctx, cnum))) goto error; - - sval = ctx->ctx_pmds[cnum].val; - lval = ctx->ctx_pmds[cnum].lval; - is_counting = PMD_IS_COUNTING(cnum); - - /* - * If the task is not the current one, then we check if the - * PMU state is still in the local live register due to lazy ctxsw. - * If true, then we read directly from the registers. - */ - if (can_access_pmu){ - val = ia64_get_pmd(cnum); - } else { - /* - * context has been saved - * if context is zombie, then task does not exist anymore. - * In this case, we use the full value saved in the context (pfm_flush_regs()). - */ - val = is_loaded ? 
ctx->th_pmds[cnum] : 0UL; - } - rd_func = pmu_conf->pmd_desc[cnum].read_check; - - if (is_counting) { - /* - * XXX: need to check for overflow when loaded - */ - val &= ovfl_mask; - val += sval; - } - - /* - * execute read checker, if any - */ - if (unlikely(expert_mode == 0 && rd_func)) { - unsigned long v = val; - ret = (*rd_func)(ctx->ctx_task, ctx, cnum, &v, regs); - if (ret) goto error; - val = v; - ret = -EINVAL; - } - - PFM_REG_RETFLAG_SET(reg_flags, 0); - - DPRINT(("pmd[%u]=0x%lx\n", cnum, val)); - - /* - * update register return value, abort all if problem during copy. - * we only modify the reg_flags field. no check mode is fine because - * access has been verified upfront in sys_perfmonctl(). - */ - req->reg_value = val; - req->reg_flags = reg_flags; - req->reg_last_reset_val = lval; - } - - return 0; - -error: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -int -pfm_mod_write_pmcs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_pmcs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_pmcs); - -int -pfm_mod_read_pmds(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_read_pmds(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_read_pmds); - -/* - * Only call this function when a process it trying to - * write the debug registers (reading is always allowed) - */ -int -pfm_use_debug_registers(struct task_struct *task) -{ - pfm_context_t *ctx = task->thread.pfm_context; - unsigned long flags; - int ret = 0; - - if (pmu_conf->use_rr_dbregs == 0) return 0; - - DPRINT(("called for [%d]\n", task->pid)); - - /* - * do it only once - */ - if (task->thread.flags & IA64_THREAD_DBG_VALID) return 0; - - /* - * Even on SMP, we do not need to use an atomic here because - * the only way in is via ptrace() and this is possible only when the - * process is stopped. Even in the case where the ctxsw out is not totally - * completed by the time we come here, there is no way the 'stopped' process - * could be in the middle of fiddling with the pfm_write_ibr_dbr() routine. - * So this is always safe. - */ - if (ctx && ctx->ctx_fl_using_dbreg == 1) return -1; - - LOCK_PFS(flags); - - /* - * We cannot allow setting breakpoints when system wide monitoring - * sessions are using the debug registers. - */ - if (pfm_sessions.pfs_sys_use_dbregs> 0) - ret = -1; - else - pfm_sessions.pfs_ptrace_use_dbregs++; - - DPRINT(("ptrace_use_dbregs=%u sys_use_dbregs=%u by [%d] ret = %d\n", - pfm_sessions.pfs_ptrace_use_dbregs, - pfm_sessions.pfs_sys_use_dbregs, - task->pid, ret)); - - UNLOCK_PFS(flags); - - return ret; -} - -/* - * This function is called for every task that exits with the - * IA64_THREAD_DBG_VALID set. This indicates a task which was - * able to use the debug registers for debugging purposes via - * ptrace(). 
Therefore we know it was not using them for - performance monitoring, so we only decrement the number - of "ptraced" debug register users to keep the count up to date - */ -int -pfm_release_debug_registers(struct task_struct *task) -{ - unsigned long flags; - int ret; - - if (pmu_conf->use_rr_dbregs == 0) return 0; - - LOCK_PFS(flags); - if (pfm_sessions.pfs_ptrace_use_dbregs == 0) { - printk(KERN_ERR "perfmon: invalid release for [%d] ptrace_use_dbregs=0\n", task->pid); - ret = -1; - } else { - pfm_sessions.pfs_ptrace_use_dbregs--; - ret = 0; - } - UNLOCK_PFS(flags); - - return ret; -} - -static int -pfm_restart(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - pfm_buffer_fmt_t *fmt; - pfm_ovfl_ctrl_t rst_ctrl; - int state, is_system; - int ret = 0; - - state = ctx->ctx_state; - fmt = ctx->ctx_buf_fmt; - is_system = ctx->ctx_fl_system; - task = PFM_CTX_TASK(ctx); - - switch(state) { - case PFM_CTX_MASKED: - break; - case PFM_CTX_LOADED: - if (CTX_HAS_SMPL(ctx) && fmt->fmt_restart_active) break; - /* fall through */ - case PFM_CTX_UNLOADED: - case PFM_CTX_ZOMBIE: - DPRINT(("invalid state=%d\n", state)); - return -EBUSY; - default: - DPRINT(("state=%d, cannot operate (no active_restart handler)\n", state)); - return -EINVAL; - } - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - - /* sanity check */ - if (unlikely(task == NULL)) { - printk(KERN_ERR "perfmon: [%d] pfm_restart no task\n", current->pid); - return -EINVAL; - } - - if (task == current || is_system) { - - fmt = ctx->ctx_buf_fmt; - - DPRINT(("restarting self %d ovfl=0x%lx\n", - task->pid, - ctx->ctx_ovfl_regs[0])); - - if (CTX_HAS_SMPL(ctx)) { - - prefetch(ctx->ctx_smpl_hdr); - - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 0; - - if (state == PFM_CTX_LOADED) - ret = pfm_buf_fmt_restart_active(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - else - ret = pfm_buf_fmt_restart(fmt, task, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - } else { - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; - } - - if (ret == 0) { - if (rst_ctrl.bits.reset_ovfl_pmds) - pfm_reset_regs(ctx, ctx->ctx_ovfl_regs, PFM_PMD_LONG_RESET); - - if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring for [%d]\n", task->pid)); - - if (state == PFM_CTX_MASKED) pfm_restore_monitoring(task); - } else { - DPRINT(("keeping monitoring stopped for [%d]\n", task->pid)); - - // cannot use pfm_stop_monitoring(task, regs); - } - } - /* - * clear overflowed PMD mask to remove any stale information - */ - ctx->ctx_ovfl_regs[0] = 0UL; - - /* - * back to LOADED state - */ - ctx->ctx_state = PFM_CTX_LOADED; - - /* - * XXX: not really useful for self monitoring - */ - ctx->ctx_fl_can_restart = 0; - - return 0; - } - - /* - * restart another task - */ - - /* - * When PFM_CTX_MASKED, we cannot issue a restart before the previous - * one is seen by the task. - */ - if (state == PFM_CTX_MASKED) { - if (ctx->ctx_fl_can_restart == 0) return -EINVAL; - /* - * will prevent subsequent restart before this one is - * seen by the other task - */ - ctx->ctx_fl_can_restart = 0; - } - - /* - * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e.
- * the task is blocked or on its way to block. That's the normal - * restart path. If the monitoring is not masked, then the task - * can be actively monitoring and we cannot directly intervene. - * Therefore we use the trap mechanism to catch the task and - * force it to reset the buffer/reset PMDs. - * - * if non-blocking, then we ensure that the task will go into - * pfm_handle_work() before returning to user mode. - * - * We cannot explicitly reset another task, it MUST always - * be done by the task itself. This works for system wide because - * the tool that is controlling the session is logically doing - * "self-monitoring". - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && state == PFM_CTX_MASKED) { - DPRINT(("unblocking [%d] \n", task->pid)); - complete(&ctx->ctx_restart_done); - } else { - DPRINT(("[%d] armed exit trap\n", task->pid)); - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_RESET; - - PFM_SET_WORK_PENDING(task, 1); - - pfm_set_task_notify(task); - - /* - * XXX: send reschedule if task runs on another CPU - */ - } - return 0; -} - -static int -pfm_debug(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - unsigned int m = *(unsigned int *)arg; - - pfm_sysctl.debug = m == 0 ? 0 : 1; - - printk(KERN_INFO "perfmon debugging %s (timing reset)\n", pfm_sysctl.debug ? "on" : "off"); - - if (m == 0) { - memset(pfm_stats, 0, sizeof(pfm_stats)); - for(m=0; m < NR_CPUS; m++) pfm_stats[m].pfm_ovfl_intr_cycles_min = ~0UL; - } - return 0; -} - -/* - * arg can be NULL and count can be zero for this function - */ -static int -pfm_write_ibr_dbr(int mode, pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct thread_struct *thread = NULL; - struct task_struct *task; - pfarg_dbreg_t *req = (pfarg_dbreg_t *)arg; - unsigned long flags; - dbreg_t dbreg; - unsigned int rnum; - int first_time; - int ret = 0, state; - int i, can_access_pmu = 0; - int is_system, is_loaded; - - if (pmu_conf->use_rr_dbregs == 0) return -EINVAL; - - state = ctx->ctx_state; - is_loaded = state == PFM_CTX_LOADED ? 1 : 0; - is_system = ctx->ctx_fl_system; - task = ctx->ctx_task; - - if (state == PFM_CTX_ZOMBIE) return -EINVAL; - - /* - * on both UP and SMP, we can only write to the PMC when the task is - * the owner of the local PMU. - */ - if (is_loaded) { - thread = &task->thread; - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (unlikely(is_system && ctx->ctx_cpu != smp_processor_id())) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - can_access_pmu = GET_PMU_OWNER() == task || is_system ? 1 : 0; - } - - /* - * we do not need to check for ipsr.db because we do clear ibr.x, dbr.r, and dbr.w - * ensuring that no real breakpoint can be installed via this call. 
- * - * IMPORTANT: regs can be NULL in this function - */ - - first_time = ctx->ctx_fl_using_dbreg == 0; - - /* - * don't bother if we are loaded and task is being debugged - */ - if (is_loaded && (thread->flags & IA64_THREAD_DBG_VALID) != 0) { - DPRINT(("debug registers already in use for [%d]\n", task->pid)); - return -EBUSY; - } - - /* - * check for debug registers in system wide mode - * - * Even though a check is done in pfm_context_load(), - * we must repeat it here, in case the registers are - * written after the context is loaded - */ - if (is_loaded) { - LOCK_PFS(flags); - - if (first_time && is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) - ret = -EBUSY; - else - pfm_sessions.pfs_sys_use_dbregs++; - } - UNLOCK_PFS(flags); - } - - if (ret != 0) return ret; - - /* - * mark ourselves as a user of the debug registers for - * perfmon purposes. - */ - ctx->ctx_fl_using_dbreg = 1; - - /* - * clear hardware registers to make sure we don't - * pick up stale state. - * - * for a system wide session, we do not use - * thread.dbr, thread.ibr because this process - * never leaves the current CPU and the state - * is shared by all processes running on it - */ - if (first_time && can_access_pmu) { - DPRINT(("[%d] clearing ibrs, dbrs\n", task->pid)); - for (i=0; i < pmu_conf->num_ibrs; i++) { - ia64_set_ibr(i, 0UL); - ia64_dv_serialize_instruction(); - } - ia64_srlz_i(); - for (i=0; i < pmu_conf->num_dbrs; i++) { - ia64_set_dbr(i, 0UL); - ia64_dv_serialize_data(); - } - ia64_srlz_d(); - } - - /* - * Now install the values into the registers - */ - for (i = 0; i < count; i++, req++) { - - rnum = req->dbreg_num; - dbreg.val = req->dbreg_value; - - ret = -EINVAL; - - if ((mode == PFM_CODE_RR && rnum >= PFM_NUM_IBRS) || ((mode == PFM_DATA_RR) && rnum >= PFM_NUM_DBRS)) { - DPRINT(("invalid register %u val=0x%lx mode=%d i=%d count=%d\n", - rnum, dbreg.val, mode, i, count)); - - goto abort_mission; - } - - /* - * make sure we do not install an enabled breakpoint - */ - if (rnum & 0x1) { - if (mode == PFM_CODE_RR) - dbreg.ibr.ibr_x = 0; - else - dbreg.dbr.dbr_r = dbreg.dbr.dbr_w = 0; - } - - PFM_REG_RETFLAG_SET(req->dbreg_flags, 0); - - /* - * Debug registers, just like PMC, can only be modified - * by a kernel call. Moreover, perfmon() accesses to those - * registers are centralized in this routine. The hardware - * does not modify the value of these registers, therefore, - * if we save them as they are written, we can avoid having - * to save them on context switch out. This is made possible - * by the fact that when perfmon uses debug registers, ptrace() - * won't be able to modify them concurrently.
- */ - if (mode == PFM_CODE_RR) { - CTX_USED_IBR(ctx, rnum); - - if (can_access_pmu) { - ia64_set_ibr(rnum, dbreg.val); - ia64_dv_serialize_instruction(); - } - - ctx->ctx_ibrs[rnum] = dbreg.val; - - DPRINT(("write ibr%u=0x%lx used_ibrs=0x%x ld=%d apmu=%d\n", - rnum, dbreg.val, ctx->ctx_used_ibrs[0], is_loaded, can_access_pmu)); - } else { - CTX_USED_DBR(ctx, rnum); - - if (can_access_pmu) { - ia64_set_dbr(rnum, dbreg.val); - ia64_dv_serialize_data(); - } - ctx->ctx_dbrs[rnum] = dbreg.val; - - DPRINT(("write dbr%u=0x%lx used_dbrs=0x%x ld=%d apmu=%d\n", - rnum, dbreg.val, ctx->ctx_used_dbrs[0], is_loaded, can_access_pmu)); - } - } - - return 0; - -abort_mission: - /* - * in case it was our first attempt, we undo the global modifications - */ - if (first_time) { - LOCK_PFS(flags); - if (ctx->ctx_fl_system) { - pfm_sessions.pfs_sys_use_dbregs--; - } - UNLOCK_PFS(flags); - ctx->ctx_fl_using_dbreg = 0; - } - /* - * install error return flag - */ - PFM_REG_RETFLAG_SET(req->dbreg_flags, PFM_REG_RETFL_EINVAL); - - return ret; -} - -static int -pfm_write_ibrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - return pfm_write_ibr_dbr(PFM_CODE_RR, ctx, arg, count, regs); -} - -static int -pfm_write_dbrs(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - return pfm_write_ibr_dbr(PFM_DATA_RR, ctx, arg, count, regs); -} - -int -pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_ibrs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_ibrs); - -int -pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs) -{ - pfm_context_t *ctx; - - if (req == NULL) return -EINVAL; - - ctx = GET_PMU_CTX(); - - if (ctx == NULL) return -EINVAL; - - /* - * for now limit to current task, which is enough when calling - * from overflow handler - */ - if (task != current && ctx->ctx_fl_system == 0) return -EBUSY; - - return pfm_write_dbrs(ctx, req, nreq, regs); -} -EXPORT_SYMBOL(pfm_mod_write_dbrs); - - -static int -pfm_get_features(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_features_t *req = (pfarg_features_t *)arg; - - req->ft_version = PFM_VERSION; - return 0; -} - -static int -pfm_stop(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct pt_regs *tregs; - struct task_struct *task = PFM_CTX_TASK(ctx); - int state, is_system; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - /* - * context must be attached to issue the stop command (includes LOADED,MASKED,ZOMBIE) - */ - if (state == PFM_CTX_UNLOADED) return -EINVAL; - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. 
- */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - DPRINT(("task [%d] ctx_state=%d is_system=%d\n", - PFM_CTX_TASK(ctx)->pid, - state, - is_system)); - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. - */ - if (is_system) { - /* - * Update local PMU first - * - * disable dcr pp - */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) & ~IA64_DCR_PP); - ia64_srlz_i(); - - /* - * update local cpuinfo - */ - PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); - - /* - * stop monitoring, does srlz.i - */ - pfm_clear_psr_pp(); - - /* - * stop monitoring in the caller - */ - ia64_psr(regs)->pp = 0; - - return 0; - } - /* - * per-task mode - */ - - if (task == current) { - /* stop monitoring at kernel level */ - pfm_clear_psr_up(); - - /* - * stop monitoring at the user level - */ - ia64_psr(regs)->up = 0; - } else { - tregs = task_pt_regs(task); - - /* - * stop monitoring at the user level - */ - ia64_psr(tregs)->up = 0; - - /* - * monitoring disabled in kernel at next reschedule - */ - ctx->ctx_saved_psr_up = 0; - DPRINT(("task=[%d]\n", task->pid)); - } - return 0; -} - - -static int -pfm_start(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct pt_regs *tregs; - int state, is_system; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - if (state != PFM_CTX_LOADED) return -EINVAL; - - /* - * In system wide and when the context is loaded, access can only happen - * when the caller is running on the CPU being monitored by the session. - * It does not have to be the owner (ctx_task) of the context per se. - */ - if (is_system && ctx->ctx_cpu != smp_processor_id()) { - DPRINT(("should be running on CPU%d\n", ctx->ctx_cpu)); - return -EBUSY; - } - - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. 
- */ - if (is_system) { - - /* - * set user level psr.pp for the caller - */ - ia64_psr(regs)->pp = 1; - - /* - * now update the local PMU and cpuinfo - */ - PFM_CPUINFO_SET(PFM_CPUINFO_DCR_PP); - - /* - * start monitoring at kernel level - */ - pfm_set_psr_pp(); - - /* enable dcr pp */ - ia64_setreg(_IA64_REG_CR_DCR, ia64_getreg(_IA64_REG_CR_DCR) | IA64_DCR_PP); - ia64_srlz_i(); - - return 0; - } - - /* - * per-process mode - */ - - if (ctx->ctx_task == current) { - - /* start monitoring at kernel level */ - pfm_set_psr_up(); - - /* - * activate monitoring at user level - */ - ia64_psr(regs)->up = 1; - - } else { - tregs = task_pt_regs(ctx->ctx_task); - - /* - * start monitoring at the kernel level the next - * time the task is scheduled - */ - ctx->ctx_saved_psr_up = IA64_PSR_UP; - - /* - * activate monitoring at user level - */ - ia64_psr(tregs)->up = 1; - } - return 0; -} - -static int -pfm_get_pmc_reset(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - pfarg_reg_t *req = (pfarg_reg_t *)arg; - unsigned int cnum; - int i; - int ret = -EINVAL; - - for (i = 0; i < count; i++, req++) { - - cnum = req->reg_num; - - if (!PMC_IS_IMPL(cnum)) goto abort_mission; - - req->reg_value = PMC_DFL_VAL(cnum); - - PFM_REG_RETFLAG_SET(req->reg_flags, 0); - - DPRINT(("pmc_reset_val pmc[%u]=0x%lx\n", cnum, req->reg_value)); - } - return 0; - -abort_mission: - PFM_REG_RETFLAG_SET(req->reg_flags, PFM_REG_RETFL_EINVAL); - return ret; -} - -static int -pfm_check_task_exist(pfm_context_t *ctx) -{ - struct task_struct *g, *t; - int ret = -ESRCH; - - read_lock(&tasklist_lock); - - do_each_thread (g, t) { - if (t->thread.pfm_context == ctx) { - ret = 0; - break; - } - } while_each_thread (g, t); - - read_unlock(&tasklist_lock); - - DPRINT(("pfm_check_task_exist: ret=%d ctx=%p\n", ret, ctx)); - - return ret; -} - -static int -pfm_context_load(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task; - struct thread_struct *thread; - struct pfm_context_t *old; - unsigned long flags; -#ifndef CONFIG_SMP - struct task_struct *owner_task = NULL; -#endif - pfarg_load_t *req = (pfarg_load_t *)arg; - unsigned long *pmcs_source, *pmds_source; - int the_cpu; - int ret = 0; - int state, is_system, set_dbregs = 0; - - state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - /* - * can only load from unloaded or terminated state - */ - if (state != PFM_CTX_UNLOADED) { - DPRINT(("cannot load to [%d], invalid ctx_state=%d\n", - req->load_pid, - ctx->ctx_state)); - return -EBUSY; - } - - DPRINT(("load_pid [%d] using_dbreg=%d\n", req->load_pid, ctx->ctx_fl_using_dbreg)); - - if (CTX_OVFL_NOBLOCK(ctx) == 0 && req->load_pid == current->pid) { - DPRINT(("cannot use blocking mode on self\n")); - return -EINVAL; - } - - ret = pfm_get_task(ctx, req->load_pid, &task); - if (ret) { - DPRINT(("load_pid [%d] get_task=%d\n", req->load_pid, ret)); - return ret; - } - - ret = -EINVAL; - - /* - * system wide is self monitoring only - */ - if (is_system && task != current) { - DPRINT(("system wide is self monitoring only load_pid=%d\n", - req->load_pid)); - goto error; - } - - thread = &task->thread; - - ret = 0; - /* - * cannot load a context which is using range restrictions, - * into a task that is being debugged. 
- */ - if (ctx->ctx_fl_using_dbreg) { - if (thread->flags & IA64_THREAD_DBG_VALID) { - ret = -EBUSY; - DPRINT(("load_pid [%d] task is debugged, cannot load range restrictions\n", req->load_pid)); - goto error; - } - LOCK_PFS(flags); - - if (is_system) { - if (pfm_sessions.pfs_ptrace_use_dbregs) { - DPRINT(("cannot load [%d] dbregs in use\n", task->pid)); - ret = -EBUSY; - } else { - pfm_sessions.pfs_sys_use_dbregs++; - DPRINT(("load [%d] increased sys_use_dbreg=%u\n", task->pid, pfm_sessions.pfs_sys_use_dbregs)); - set_dbregs = 1; - } - } - - UNLOCK_PFS(flags); - - if (ret) goto error; - } - - /* - * SMP system-wide monitoring implies self-monitoring. - * - * The programming model expects the task to - * be pinned on a CPU throughout the session. - * Here we take note of the current CPU at the - * time the context is loaded. No call from - * another CPU will be allowed. - * - * The pinning via shed_setaffinity() - * must be done by the calling task prior - * to this call. - * - * systemwide: keep track of CPU this session is supposed to run on - */ - the_cpu = ctx->ctx_cpu = smp_processor_id(); - - ret = -EBUSY; - /* - * now reserve the session - */ - ret = pfm_reserve_session(current, is_system, the_cpu); - if (ret) goto error; - - /* - * task is necessarily stopped at this point. - * - * If the previous context was zombie, then it got removed in - * pfm_save_regs(). Therefore we should not see it here. - * If we see a context, then this is an active context - * - * XXX: needs to be atomic - */ - DPRINT(("before cmpxchg() old_ctx=%p new_ctx=%p\n", - thread->pfm_context, ctx)); - - ret = -EBUSY; - old = ia64_cmpxchg(acq, &thread->pfm_context, NULL, ctx, sizeof(pfm_context_t *)); - if (old != NULL) { - DPRINT(("load_pid [%d] already has a context\n", req->load_pid)); - goto error_unres; - } - - pfm_reset_msgq(ctx); - - ctx->ctx_state = PFM_CTX_LOADED; - - /* - * link context to task - */ - ctx->ctx_task = task; - - if (is_system) { - /* - * we load as stopped - */ - PFM_CPUINFO_SET(PFM_CPUINFO_SYST_WIDE); - PFM_CPUINFO_CLEAR(PFM_CPUINFO_DCR_PP); - - if (ctx->ctx_fl_excl_idle) PFM_CPUINFO_SET(PFM_CPUINFO_EXCL_IDLE); - } else { - thread->flags |= IA64_THREAD_PM_VALID; - } - - /* - * propagate into thread-state - */ - pfm_copy_pmds(task, ctx); - pfm_copy_pmcs(task, ctx); - - pmcs_source = ctx->th_pmcs; - pmds_source = ctx->th_pmds; - - /* - * always the case for system-wide - */ - if (task == current) { - - if (is_system == 0) { - - /* allow user level control */ - ia64_psr(regs)->sp = 0; - DPRINT(("clearing psr.sp for [%d]\n", task->pid)); - - SET_LAST_CPU(ctx, smp_processor_id()); - INC_ACTIVATION(); - SET_ACTIVATION(ctx); -#ifndef CONFIG_SMP - /* - * push the other task out, if any - */ - owner_task = GET_PMU_OWNER(); - if (owner_task) pfm_lazy_save_regs(owner_task); -#endif - } - /* - * load all PMD from ctx to PMU (as opposed to thread state) - * restore all PMC from ctx to PMU - */ - pfm_restore_pmds(pmds_source, ctx->ctx_all_pmds[0]); - pfm_restore_pmcs(pmcs_source, ctx->ctx_all_pmcs[0]); - - ctx->ctx_reload_pmcs[0] = 0UL; - ctx->ctx_reload_pmds[0] = 0UL; - - /* - * guaranteed safe by earlier check against DBG_VALID - */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - /* - * set new ownership - */ - SET_PMU_OWNER(task, ctx); - - DPRINT(("context loaded on PMU for [%d]\n", task->pid)); - } else { - /* - * when not current, task MUST be stopped, so this is safe - */ - regs = 
task_pt_regs(task); - - /* force a full reload */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* initial saved psr (stopped) */ - ctx->ctx_saved_psr_up = 0UL; - ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; - } - - ret = 0; - -error_unres: - if (ret) pfm_unreserve_session(ctx, ctx->ctx_fl_system, the_cpu); -error: - /* - * we must undo the dbregs setting (for system-wide) - */ - if (ret && set_dbregs) { - LOCK_PFS(flags); - pfm_sessions.pfs_sys_use_dbregs--; - UNLOCK_PFS(flags); - } - /* - * release task, there is now a link with the context - */ - if (is_system == 0 && task != current) { - pfm_put_task(task); - - if (ret == 0) { - ret = pfm_check_task_exist(ctx); - if (ret) { - ctx->ctx_state = PFM_CTX_UNLOADED; - ctx->ctx_task = NULL; - } - } - } - return ret; -} - -/* - * in this function, we do not need to increase the use count - * for the task via get_task_struct(), because we hold the - * context lock. If the task were to disappear while having - * a context attached, it would go through pfm_exit_thread() - * which also grabs the context lock and would therefore be blocked - * until we are here. - */ -static void pfm_flush_pmds(struct task_struct *, pfm_context_t *ctx); - -static int -pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs) -{ - struct task_struct *task = PFM_CTX_TASK(ctx); - struct pt_regs *tregs; - int prev_state, is_system; - int ret; - - DPRINT(("ctx_state=%d task [%d]\n", ctx->ctx_state, task ? task->pid : -1)); - - prev_state = ctx->ctx_state; - is_system = ctx->ctx_fl_system; - - /* - * unload only when necessary - */ - if (prev_state == PFM_CTX_UNLOADED) { - DPRINT(("ctx_state=%d, nothing to do\n", prev_state)); - return 0; - } - - /* - * clear psr and dcr bits - */ - ret = pfm_stop(ctx, NULL, 0, regs); - if (ret) return ret; - - ctx->ctx_state = PFM_CTX_UNLOADED; - - /* - * in system mode, we need to update the PMU directly - * and the user level state of the caller, which may not - * necessarily be the creator of the context. - */ - if (is_system) { - - /* - * Update cpuinfo - * - * local PMU is taken care of in pfm_stop() - */ - PFM_CPUINFO_CLEAR(PFM_CPUINFO_SYST_WIDE); - PFM_CPUINFO_CLEAR(PFM_CPUINFO_EXCL_IDLE); - - /* - * save PMDs in context - * release ownership - */ - pfm_flush_pmds(current, ctx); - - /* - * at this point we are done with the PMU - * so we can unreserve the resource. - */ - if (prev_state != PFM_CTX_ZOMBIE) - pfm_unreserve_session(ctx, 1 , ctx->ctx_cpu); - - /* - * disconnect context from task - */ - task->thread.pfm_context = NULL; - /* - * disconnect task from context - */ - ctx->ctx_task = NULL; - - /* - * There is nothing more to cleanup here. - */ - return 0; - } - - /* - * per-task mode - */ - tregs = task == current ? regs : task_pt_regs(task); - - if (task == current) { - /* - * cancel user level control - */ - ia64_psr(regs)->sp = 1; - - DPRINT(("setting psr.sp for [%d]\n", task->pid)); - } - /* - * save PMDs to context - * release ownership - */ - pfm_flush_pmds(task, ctx); - - /* - * at this point we are done with the PMU - * so we can unreserve the resource. - * - * when state was ZOMBIE, we have already unreserved. 
- */ - if (prev_state != PFM_CTX_ZOMBIE) - pfm_unreserve_session(ctx, 0 , ctx->ctx_cpu); - - /* - * reset activation counter and psr - */ - ctx->ctx_last_activation = PFM_INVALID_ACTIVATION; - SET_LAST_CPU(ctx, -1); - - /* - * PMU state will not be restored - */ - task->thread.flags &= ~IA64_THREAD_PM_VALID; - - /* - * break links between context and task - */ - task->thread.pfm_context = NULL; - ctx->ctx_task = NULL; - - PFM_SET_WORK_PENDING(task, 0); - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - ctx->ctx_fl_can_restart = 0; - ctx->ctx_fl_going_zombie = 0; - - DPRINT(("disconnected [%d] from context\n", task->pid)); - - return 0; -} - - -/* - * called only from exit_thread(): task == current - * we come here only if current has a context attached (loaded or masked) - */ -void -pfm_exit_thread(struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - struct pt_regs *regs = task_pt_regs(task); - int ret, state; - int free_ok = 0; - - ctx = PFM_GET_CTX(task); - - PROTECT_CTX(ctx, flags); - - DPRINT(("state=%d task [%d]\n", ctx->ctx_state, task->pid)); - - state = ctx->ctx_state; - switch(state) { - case PFM_CTX_UNLOADED: - /* - * only comes to this function if pfm_context is not NULL, i.e., cannot - * be in unloaded state - */ - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] ctx unloaded\n", task->pid); - break; - case PFM_CTX_LOADED: - case PFM_CTX_MASKED: - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); - } - DPRINT(("ctx unloaded for current state was %d\n", state)); - - pfm_end_notify_user(ctx); - break; - case PFM_CTX_ZOMBIE: - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] state=%d unload failed %d\n", task->pid, state, ret); - } - free_ok = 1; - break; - default: - printk(KERN_ERR "perfmon: pfm_exit_thread [%d] unexpected state=%d\n", task->pid, state); - break; - } - UNPROTECT_CTX(ctx, flags); - - { u64 psr = pfm_get_psr(); - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(GET_PMU_OWNER()); - BUG_ON(ia64_psr(regs)->up); - BUG_ON(ia64_psr(regs)->pp); - } - - /* - * All memory free operations (especially for vmalloc'ed memory) - * MUST be done with interrupts ENABLED. 
- */ - if (free_ok) pfm_context_free(ctx); -} - -/* - * functions MUST be listed in the increasing order of their index (see perfmon.h) - */ -#define PFM_CMD(name, flags, arg_count, arg_type, getsz) { name, #name, flags, arg_count, sizeof(arg_type), getsz } -#define PFM_CMD_S(name, flags) { name, #name, flags, 0, 0, NULL } -#define PFM_CMD_PCLRWS (PFM_CMD_FD|PFM_CMD_ARG_RW|PFM_CMD_STOP) -#define PFM_CMD_PCLRW (PFM_CMD_FD|PFM_CMD_ARG_RW) -#define PFM_CMD_NONE { NULL, "no-cmd", 0, 0, 0, NULL} - -static pfm_cmd_desc_t pfm_cmd_tab[]={ -/* 0 */PFM_CMD_NONE, -/* 1 */PFM_CMD(pfm_write_pmcs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 2 */PFM_CMD(pfm_write_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 3 */PFM_CMD(pfm_read_pmds, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 4 */PFM_CMD_S(pfm_stop, PFM_CMD_PCLRWS), -/* 5 */PFM_CMD_S(pfm_start, PFM_CMD_PCLRWS), -/* 6 */PFM_CMD_NONE, -/* 7 */PFM_CMD_NONE, -/* 8 */PFM_CMD(pfm_context_create, PFM_CMD_ARG_RW, 1, pfarg_context_t, pfm_ctx_getsize), -/* 9 */PFM_CMD_NONE, -/* 10 */PFM_CMD_S(pfm_restart, PFM_CMD_PCLRW), -/* 11 */PFM_CMD_NONE, -/* 12 */PFM_CMD(pfm_get_features, PFM_CMD_ARG_RW, 1, pfarg_features_t, NULL), -/* 13 */PFM_CMD(pfm_debug, 0, 1, unsigned int, NULL), -/* 14 */PFM_CMD_NONE, -/* 15 */PFM_CMD(pfm_get_pmc_reset, PFM_CMD_ARG_RW, PFM_CMD_ARG_MANY, pfarg_reg_t, NULL), -/* 16 */PFM_CMD(pfm_context_load, PFM_CMD_PCLRWS, 1, pfarg_load_t, NULL), -/* 17 */PFM_CMD_S(pfm_context_unload, PFM_CMD_PCLRWS), -/* 18 */PFM_CMD_NONE, -/* 19 */PFM_CMD_NONE, -/* 20 */PFM_CMD_NONE, -/* 21 */PFM_CMD_NONE, -/* 22 */PFM_CMD_NONE, -/* 23 */PFM_CMD_NONE, -/* 24 */PFM_CMD_NONE, -/* 25 */PFM_CMD_NONE, -/* 26 */PFM_CMD_NONE, -/* 27 */PFM_CMD_NONE, -/* 28 */PFM_CMD_NONE, -/* 29 */PFM_CMD_NONE, -/* 30 */PFM_CMD_NONE, -/* 31 */PFM_CMD_NONE, -/* 32 */PFM_CMD(pfm_write_ibrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL), -/* 33 */PFM_CMD(pfm_write_dbrs, PFM_CMD_PCLRWS, PFM_CMD_ARG_MANY, pfarg_dbreg_t, NULL) -}; -#define PFM_CMD_COUNT (sizeof(pfm_cmd_tab)/sizeof(pfm_cmd_desc_t)) - -static int -pfm_check_task_state(pfm_context_t *ctx, int cmd, unsigned long flags) -{ - struct task_struct *task; - int state, old_state; - -recheck: - state = ctx->ctx_state; - task = ctx->ctx_task; - - if (task == NULL) { - DPRINT(("context %d no task, state=%d\n", ctx->ctx_fd, state)); - return 0; - } - - DPRINT(("context %d state=%d [%d] task_state=%ld must_stop=%d\n", - ctx->ctx_fd, - state, - task->pid, - task->state, PFM_CMD_STOPPED(cmd))); - - /* - * self-monitoring always ok. - * - * for system-wide the caller can either be the creator of the - * context (the one to which the context is attached) OR - * a task running on the same CPU as the session. - */ - if (task == current || ctx->ctx_fl_system) return 0; - - /* - * we are monitoring another thread - */ - switch(state) { - case PFM_CTX_UNLOADED: - /* - * if context is UNLOADED we are safe to go - */ - return 0; - case PFM_CTX_ZOMBIE: - /* - * no command can operate on a zombie context - */ - DPRINT(("cmd %d state zombie cannot operate on context\n", cmd)); - return -EINVAL; - case PFM_CTX_MASKED: - /* - * PMU state has been saved to software even though - * the thread may still be running. - */ - if (cmd != PFM_UNLOAD_CONTEXT) return 0; - } - - /* - * context is LOADED or MASKED. Some commands may need to have - * the task stopped.
- * - * We could lift this restriction for UP but it would mean that - * the user has no guarantee the task would not run between - * two successive calls to perfmonctl(). That's probably OK. - * If this user wants to ensure the task does not run, then - * the task must be stopped. - */ - if (PFM_CMD_STOPPED(cmd)) { - if ((task->state != TASK_STOPPED) && (task->state != TASK_TRACED)) { - DPRINT(("[%d] task not in stopped state\n", task->pid)); - return -EBUSY; - } - /* - * task is now stopped, wait for ctxsw out - * - * This is an interesting point in the code. - * We need to unprotect the context because - * the pfm_save_regs() routines needs to grab - * the same lock. There are danger in doing - * this because it leaves a window open for - * another task to get access to the context - * and possibly change its state. The one thing - * that is not possible is for the context to disappear - * because we are protected by the VFS layer, i.e., - * get_fd()/put_fd(). - */ - old_state = state; - - UNPROTECT_CTX(ctx, flags); - - wait_task_inactive(task); - - PROTECT_CTX(ctx, flags); - - /* - * we must recheck to verify if state has changed - */ - if (ctx->ctx_state != old_state) { - DPRINT(("old_state=%d new_state=%d\n", old_state, ctx->ctx_state)); - goto recheck; - } - } - return 0; -} - -/* - * system-call entry point (must return long) - */ -asmlinkage long -sys_perfmonctl (int fd, int cmd, void __user *arg, int count) -{ - struct file *file = NULL; - pfm_context_t *ctx = NULL; - unsigned long flags = 0UL; - void *args_k = NULL; - long ret; /* will expand int return types */ - size_t base_sz, sz, xtra_sz = 0; - int narg, completed_args = 0, call_made = 0, cmd_flags; - int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); - int (*getsize)(void *arg, size_t *sz); -#define PFM_MAX_ARGSIZE 4096 - - /* - * reject any call if perfmon was disabled at initialization - */ - if (unlikely(pmu_conf == NULL)) return -ENOSYS; - - if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT)) { - DPRINT(("invalid cmd=%d\n", cmd)); - return -EINVAL; - } - - func = pfm_cmd_tab[cmd].cmd_func; - narg = pfm_cmd_tab[cmd].cmd_narg; - base_sz = pfm_cmd_tab[cmd].cmd_argsize; - getsize = pfm_cmd_tab[cmd].cmd_getsize; - cmd_flags = pfm_cmd_tab[cmd].cmd_flags; - - if (unlikely(func == NULL)) { - DPRINT(("invalid cmd=%d\n", cmd)); - return -EINVAL; - } - - DPRINT(("cmd=%s idx=%d narg=0x%x argsz=%lu count=%d\n", - PFM_CMD_NAME(cmd), - cmd, - narg, - base_sz, - count)); - - /* - * check if number of arguments matches what the command expects - */ - if (unlikely((narg == PFM_CMD_ARG_MANY && count <= 0) || (narg > 0 && narg != count))) - return -EINVAL; - -restart_args: - sz = xtra_sz + base_sz*count; - /* - * limit abuse to min page size - */ - if (unlikely(sz > PFM_MAX_ARGSIZE)) { - printk(KERN_ERR "perfmon: [%d] argument too big %lu\n", current->pid, sz); - return -E2BIG; - } - - /* - * allocate default-sized argument buffer - */ - if (likely(count && args_k == NULL)) { - args_k = kmalloc(PFM_MAX_ARGSIZE, GFP_KERNEL); - if (args_k == NULL) return -ENOMEM; - } - - ret = -EFAULT; - - /* - * copy arguments - * - * assume sz = 0 for command without parameters - */ - if (sz && copy_from_user(args_k, arg, sz)) { - DPRINT(("cannot copy_from_user %lu bytes @%p\n", sz, arg)); - goto error_args; - } - - /* - * check if command supports extra parameters - */ - if (completed_args == 0 && getsize) { - /* - * get extra parameters size (based on main argument) - */ - ret = (*getsize)(args_k, &xtra_sz); - if (ret) goto 
error_args; - - completed_args = 1; - - DPRINT(("restart_args sz=%lu xtra_sz=%lu\n", sz, xtra_sz)); - - /* retry if necessary */ - if (likely(xtra_sz)) goto restart_args; - } - - if (unlikely((cmd_flags & PFM_CMD_FD) == 0)) goto skip_fd; - - ret = -EBADF; - - file = fget(fd); - if (unlikely(file == NULL)) { - DPRINT(("invalid fd %d\n", fd)); - goto error_args; - } - if (unlikely(PFM_IS_FILE(file) == 0)) { - DPRINT(("fd %d not related to perfmon\n", fd)); - goto error_args; - } - - ctx = (pfm_context_t *)file->private_data; - if (unlikely(ctx == NULL)) { - DPRINT(("no context for fd %d\n", fd)); - goto error_args; - } - prefetch(&ctx->ctx_state); - - PROTECT_CTX(ctx, flags); - - /* - * check task is stopped - */ - ret = pfm_check_task_state(ctx, cmd, flags); - if (unlikely(ret)) goto abort_locked; - -skip_fd: - ret = (*func)(ctx, args_k, count, task_pt_regs(current)); - - call_made = 1; - -abort_locked: - if (likely(ctx)) { - DPRINT(("context unlocked\n")); - UNPROTECT_CTX(ctx, flags); - } - - /* copy argument back to user, if needed */ - if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT; - -error_args: - if (file) - fput(file); - - kfree(args_k); - - DPRINT(("cmd=%s ret=%ld\n", PFM_CMD_NAME(cmd), ret)); - - return ret; -} - -static void -pfm_resume_after_ovfl(pfm_context_t *ctx, unsigned long ovfl_regs, struct pt_regs *regs) -{ - pfm_buffer_fmt_t *fmt = ctx->ctx_buf_fmt; - pfm_ovfl_ctrl_t rst_ctrl; - int state; - int ret = 0; - - state = ctx->ctx_state; - /* - * Unlock sampling buffer and reset index atomically - * XXX: not really needed when blocking - */ - if (CTX_HAS_SMPL(ctx)) { - - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 0; - - if (state == PFM_CTX_LOADED) - ret = pfm_buf_fmt_restart_active(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - else - ret = pfm_buf_fmt_restart(fmt, current, &rst_ctrl, ctx->ctx_smpl_hdr, regs); - } else { - rst_ctrl.bits.mask_monitoring = 0; - rst_ctrl.bits.reset_ovfl_pmds = 1; - } - - if (ret == 0) { - if (rst_ctrl.bits.reset_ovfl_pmds) { - pfm_reset_regs(ctx, &ovfl_regs, PFM_PMD_LONG_RESET); - } - if (rst_ctrl.bits.mask_monitoring == 0) { - DPRINT(("resuming monitoring\n")); - if (ctx->ctx_state == PFM_CTX_MASKED) pfm_restore_monitoring(current); - } else { - DPRINT(("stopping monitoring\n")); - //pfm_stop_monitoring(current, regs); - } - ctx->ctx_state = PFM_CTX_LOADED; - } -} - -/* - * context MUST BE LOCKED when calling - * can only be called for current - */ -static void -pfm_context_force_terminate(pfm_context_t *ctx, struct pt_regs *regs) -{ - int ret; - - DPRINT(("entering for [%d]\n", current->pid)); - - ret = pfm_context_unload(ctx, NULL, 0, regs); - if (ret) { - printk(KERN_ERR "pfm_context_force_terminate: [%d] unloaded failed with %d\n", current->pid, ret); - } - - /* - * and wakeup controlling task, indicating we are now disconnected - */ - wake_up_interruptible(&ctx->ctx_zombieq); - - /* - * given that context is still locked, the controlling - * task will only get access when we return from - * pfm_handle_work(). - */ -} - -static int pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds); - /* - * pfm_handle_work() can be called with interrupts enabled - * (TIF_NEED_RESCHED) or disabled. The down_interruptible - * call may sleep, therefore we must re-enable interrupts - * to avoid deadlocks. 
It is safe to do so because this function - * is called ONLY when returning to user level (PUStk=1), in which case - * there is no risk of kernel stack overflow due to deep - * interrupt nesting. - */ -void -pfm_handle_work(void) -{ - pfm_context_t *ctx; - struct pt_regs *regs; - unsigned long flags, dummy_flags; - unsigned long ovfl_regs; - unsigned int reason; - int ret; - - ctx = PFM_GET_CTX(current); - if (ctx == NULL) { - printk(KERN_ERR "perfmon: [%d] has no PFM context\n", current->pid); - return; - } - - PROTECT_CTX(ctx, flags); - - PFM_SET_WORK_PENDING(current, 0); - - pfm_clear_task_notify(); - - regs = task_pt_regs(current); - - /* - * extract reason for being here and clear - */ - reason = ctx->ctx_fl_trap_reason; - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_NONE; - ovfl_regs = ctx->ctx_ovfl_regs[0]; - - DPRINT(("reason=%d state=%d\n", reason, ctx->ctx_state)); - - /* - * must be done before we check for simple-reset mode - */ - if (ctx->ctx_fl_going_zombie || ctx->ctx_state == PFM_CTX_ZOMBIE) goto do_zombie; - - - //if (CTX_OVFL_NOBLOCK(ctx)) goto skip_blocking; - if (reason == PFM_TRAP_REASON_RESET) goto skip_blocking; - - /* - * restore interrupt mask to what it was on entry. - * Could be enabled/diasbled. - */ - UNPROTECT_CTX(ctx, flags); - - /* - * force interrupt enable because of down_interruptible() - */ - local_irq_enable(); - - DPRINT(("before block sleeping\n")); - - /* - * may go through without blocking on SMP systems - * if restart has been received already by the time we call down() - */ - ret = wait_for_completion_interruptible(&ctx->ctx_restart_done); - - DPRINT(("after block sleeping ret=%d\n", ret)); - - /* - * lock context and mask interrupts again - * We save flags into a dummy because we may have - * altered interrupts mask compared to entry in this - * function. - */ - PROTECT_CTX(ctx, dummy_flags); - - /* - * we need to read the ovfl_regs only after wake-up - * because we may have had pfm_write_pmds() in between - * and that can changed PMD values and therefore - * ovfl_regs is reset for these new PMD values. 
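
pfm_handle_work() above is the monitored thread's side of the blocking hand-shake: it sleeps on ctx_restart_done until the controlling tool replies with PFM_RESTART (command 10 in the table). The tool side, sketched below, just reads pfm_msg_t messages from the context file descriptor; the message fields match what pfm_ovfl_notify_user() and pfm_end_notify_user() further down fill in, and the perfmonctl() wrapper is the same assumption as in the earlier sketch.

    /*
     * Hedged sketch of the controlling tool's notification loop.  The
     * pfm_msg_t layout is assumed to come from the perfmon user headers;
     * ctx_fd is a context created as in the previous example.
     */
    #include <unistd.h>
    #include <perfmon/perfmon.h>

    void notification_loop(int ctx_fd)
    {
        pfm_msg_t msg;

        for (;;) {
            if (read(ctx_fd, &msg, sizeof(msg)) != sizeof(msg))
                break;                          /* fd closed or error          */

            switch (msg.pfm_ovfl_msg.msg_type) {
            case PFM_MSG_OVFL:
                /* process the sampling buffer here, then let the blocked
                 * thread continue (this wakes ctx_restart_done above) */
                perfmonctl(ctx_fd, PFM_RESTART, NULL, 0);
                break;
            case PFM_MSG_END:
                /* monitored thread exited: collect final counts and stop */
                return;
            }
        }
    }
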
- */ - ovfl_regs = ctx->ctx_ovfl_regs[0]; - - if (ctx->ctx_fl_going_zombie) { -do_zombie: - DPRINT(("context is zombie, bailing out\n")); - pfm_context_force_terminate(ctx, regs); - goto nothing_to_do; - } - /* - * in case of interruption of down() we don't restart anything - */ - if (ret < 0) goto nothing_to_do; - -skip_blocking: - pfm_resume_after_ovfl(ctx, ovfl_regs, regs); - ctx->ctx_ovfl_regs[0] = 0UL; - -nothing_to_do: - /* - * restore flags as they were upon entry - */ - UNPROTECT_CTX(ctx, flags); -} - -static int -pfm_notify_user(pfm_context_t *ctx, pfm_msg_t *msg) -{ - if (ctx->ctx_state == PFM_CTX_ZOMBIE) { - DPRINT(("ignoring overflow notification, owner is zombie\n")); - return 0; - } - - DPRINT(("waking up somebody\n")); - - if (msg) wake_up_interruptible(&ctx->ctx_msgq_wait); - - /* - * safe, we are not in intr handler, nor in ctxsw when - * we come here - */ - kill_fasync (&ctx->ctx_async_queue, SIGIO, POLL_IN); - - return 0; -} - -static int -pfm_ovfl_notify_user(pfm_context_t *ctx, unsigned long ovfl_pmds) -{ - pfm_msg_t *msg = NULL; - - if (ctx->ctx_fl_no_msg == 0) { - msg = pfm_get_new_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_ovfl_notify_user no more notification msgs\n"); - return -1; - } - - msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; - msg->pfm_ovfl_msg.msg_ctx_fd = ctx->ctx_fd; - msg->pfm_ovfl_msg.msg_active_set = 0; - msg->pfm_ovfl_msg.msg_ovfl_pmds[0] = ovfl_pmds; - msg->pfm_ovfl_msg.msg_ovfl_pmds[1] = 0UL; - msg->pfm_ovfl_msg.msg_ovfl_pmds[2] = 0UL; - msg->pfm_ovfl_msg.msg_ovfl_pmds[3] = 0UL; - msg->pfm_ovfl_msg.msg_tstamp = 0UL; - } - - DPRINT(("ovfl msg: msg=%p no_msg=%d fd=%d ovfl_pmds=0x%lx\n", - msg, - ctx->ctx_fl_no_msg, - ctx->ctx_fd, - ovfl_pmds)); - - return pfm_notify_user(ctx, msg); -} - -static int -pfm_end_notify_user(pfm_context_t *ctx) -{ - pfm_msg_t *msg; - - msg = pfm_get_new_msg(ctx); - if (msg == NULL) { - printk(KERN_ERR "perfmon: pfm_end_notify_user no more notification msgs\n"); - return -1; - } - /* no leak */ - memset(msg, 0, sizeof(*msg)); - - msg->pfm_end_msg.msg_type = PFM_MSG_END; - msg->pfm_end_msg.msg_ctx_fd = ctx->ctx_fd; - msg->pfm_ovfl_msg.msg_tstamp = 0UL; - - DPRINT(("end msg: msg=%p no_msg=%d ctx_fd=%d\n", - msg, - ctx->ctx_fl_no_msg, - ctx->ctx_fd)); - - return pfm_notify_user(ctx, msg); -} - -/* - * main overflow processing routine. - * it can be called from the interrupt path or explicitly during the context switch code - */ -static void -pfm_overflow_handler(struct task_struct *task, pfm_context_t *ctx, u64 pmc0, struct pt_regs *regs) -{ - pfm_ovfl_arg_t *ovfl_arg; - unsigned long mask; - unsigned long old_val, ovfl_val, new_val; - unsigned long ovfl_notify = 0UL, ovfl_pmds = 0UL, smpl_pmds = 0UL, reset_pmds; - unsigned long tstamp; - pfm_ovfl_ctrl_t ovfl_ctrl; - unsigned int i, has_smpl; - int must_notify = 0; - - if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) goto stop_monitoring; - - /* - * sanity test. Should never happen - */ - if (unlikely((pmc0 & 0x1) == 0)) goto sanity_check; - - tstamp = ia64_get_itc(); - mask = pmc0 >> PMU_FIRST_COUNTER; - ovfl_val = pmu_conf->ovfl_val; - has_smpl = CTX_HAS_SMPL(ctx); - - DPRINT_ovfl(("pmc0=0x%lx pid=%d iip=0x%lx, %s " - "used_pmds=0x%lx\n", - pmc0, - task ? task->pid: -1, - (regs ? regs->cr_iip : 0), - CTX_OVFL_NOBLOCK(ctx) ? 
"nonblocking" : "blocking", - ctx->ctx_used_pmds[0])); - - - /* - * first we update the virtual counters - * assume there was a prior ia64_srlz_d() issued - */ - for (i = PMU_FIRST_COUNTER; mask ; i++, mask >>= 1) { - - /* skip pmd which did not overflow */ - if ((mask & 0x1) == 0) continue; - - /* - * Note that the pmd is not necessarily 0 at this point as qualified events - * may have happened before the PMU was frozen. The residual count is not - * taken into consideration here but will be with any read of the pmd via - * pfm_read_pmds(). - */ - old_val = new_val = ctx->ctx_pmds[i].val; - new_val += 1 + ovfl_val; - ctx->ctx_pmds[i].val = new_val; - - /* - * check for overflow condition - */ - if (likely(old_val > new_val)) { - ovfl_pmds |= 1UL << i; - if (PMC_OVFL_NOTIFY(ctx, i)) ovfl_notify |= 1UL << i; - } - - DPRINT_ovfl(("ctx_pmd[%d].val=0x%lx old_val=0x%lx pmd=0x%lx ovfl_pmds=0x%lx ovfl_notify=0x%lx\n", - i, - new_val, - old_val, - ia64_get_pmd(i) & ovfl_val, - ovfl_pmds, - ovfl_notify)); - } - - /* - * there was no 64-bit overflow, nothing else to do - */ - if (ovfl_pmds == 0UL) return; - - /* - * reset all control bits - */ - ovfl_ctrl.val = 0; - reset_pmds = 0UL; - - /* - * if a sampling format module exists, then we "cache" the overflow by - * calling the module's handler() routine. - */ - if (has_smpl) { - unsigned long start_cycles, end_cycles; - unsigned long pmd_mask; - int j, k, ret = 0; - int this_cpu = smp_processor_id(); - - pmd_mask = ovfl_pmds >> PMU_FIRST_COUNTER; - ovfl_arg = &ctx->ctx_ovfl_arg; - - prefetch(ctx->ctx_smpl_hdr); - - for(i=PMU_FIRST_COUNTER; pmd_mask && ret == 0; i++, pmd_mask >>=1) { - - mask = 1UL << i; - - if ((pmd_mask & 0x1) == 0) continue; - - ovfl_arg->ovfl_pmd = (unsigned char )i; - ovfl_arg->ovfl_notify = ovfl_notify & mask ? 1 : 0; - ovfl_arg->active_set = 0; - ovfl_arg->ovfl_ctrl.val = 0; /* module must fill in all fields */ - ovfl_arg->smpl_pmds[0] = smpl_pmds = ctx->ctx_pmds[i].smpl_pmds[0]; - - ovfl_arg->pmd_value = ctx->ctx_pmds[i].val; - ovfl_arg->pmd_last_reset = ctx->ctx_pmds[i].lval; - ovfl_arg->pmd_eventid = ctx->ctx_pmds[i].eventid; - - /* - * copy values of pmds of interest. Sampling format may copy them - * into sampling buffer. - */ - if (smpl_pmds) { - for(j=0, k=0; smpl_pmds; j++, smpl_pmds >>=1) { - if ((smpl_pmds & 0x1) == 0) continue; - ovfl_arg->smpl_pmds_values[k++] = PMD_IS_COUNTING(j) ? pfm_read_soft_counter(ctx, j) : ia64_get_pmd(j); - DPRINT_ovfl(("smpl_pmd[%d]=pmd%u=0x%lx\n", k-1, j, ovfl_arg->smpl_pmds_values[k-1])); - } - } - - pfm_stats[this_cpu].pfm_smpl_handler_calls++; - - start_cycles = ia64_get_itc(); - - /* - * call custom buffer format record (handler) routine - */ - ret = (*ctx->ctx_buf_fmt->fmt_handler)(task, ctx->ctx_smpl_hdr, ovfl_arg, regs, tstamp); - - end_cycles = ia64_get_itc(); - - /* - * For those controls, we take the union because they have - * an all or nothing behavior. 
- */ - ovfl_ctrl.bits.notify_user |= ovfl_arg->ovfl_ctrl.bits.notify_user; - ovfl_ctrl.bits.block_task |= ovfl_arg->ovfl_ctrl.bits.block_task; - ovfl_ctrl.bits.mask_monitoring |= ovfl_arg->ovfl_ctrl.bits.mask_monitoring; - /* - * build the bitmask of pmds to reset now - */ - if (ovfl_arg->ovfl_ctrl.bits.reset_ovfl_pmds) reset_pmds |= mask; - - pfm_stats[this_cpu].pfm_smpl_handler_cycles += end_cycles - start_cycles; - } - /* - * when the module cannot handle the rest of the overflows, we abort right here - */ - if (ret && pmd_mask) { - DPRINT(("handler aborts leftover ovfl_pmds=0x%lx\n", - pmd_mask<ctx_ovfl_regs[0] = ovfl_pmds; - - /* - * check for blocking context - */ - if (CTX_OVFL_NOBLOCK(ctx) == 0 && ovfl_ctrl.bits.block_task) { - - ctx->ctx_fl_trap_reason = PFM_TRAP_REASON_BLOCK; - - /* - * set the perfmon specific checking pending work for the task - */ - PFM_SET_WORK_PENDING(task, 1); - - /* - * when coming from ctxsw, current still points to the - * previous task, therefore we must work with task and not current. - */ - pfm_set_task_notify(task); - } - /* - * defer until state is changed (shorten spin window). the context is locked - * anyway, so the signal receiver would come spin for nothing. - */ - must_notify = 1; - } - - DPRINT_ovfl(("owner [%d] pending=%ld reason=%u ovfl_pmds=0x%lx ovfl_notify=0x%lx masked=%d\n", - GET_PMU_OWNER() ? GET_PMU_OWNER()->pid : -1, - PFM_GET_WORK_PENDING(task), - ctx->ctx_fl_trap_reason, - ovfl_pmds, - ovfl_notify, - ovfl_ctrl.bits.mask_monitoring ? 1 : 0)); - /* - * in case monitoring must be stopped, we toggle the psr bits - */ - if (ovfl_ctrl.bits.mask_monitoring) { - pfm_mask_monitoring(task); - ctx->ctx_state = PFM_CTX_MASKED; - ctx->ctx_fl_can_restart = 1; - } - - /* - * send notification now - */ - if (must_notify) pfm_ovfl_notify_user(ctx, ovfl_notify); - - return; - -sanity_check: - printk(KERN_ERR "perfmon: CPU%d overflow handler [%d] pmc0=0x%lx\n", - smp_processor_id(), - task ? task->pid : -1, - pmc0); - return; - -stop_monitoring: - /* - * in SMP, zombie context is never restored but reclaimed in pfm_load_regs(). - * Moreover, zombies are also reclaimed in pfm_save_regs(). Therefore we can - * come here as zombie only if the task is the current task. In which case, we - * can access the PMU hardware directly. - * - * Note that zombies do have PM_VALID set. So here we do the minimal. - * - * In case the context was zombified it could not be reclaimed at the time - * the monitoring program exited. At this point, the PMU reservation has been - * returned, the sampiing buffer has been freed. We must convert this call - * into a spurious interrupt. However, we must also avoid infinite overflows - * by stopping monitoring for this task. We can only come here for a per-task - * context. All we need to do is to stop monitoring using the psr bits which - * are always task private. By re-enabling secure montioring, we ensure that - * the monitored task will not be able to re-activate monitoring. - * The task will eventually be context switched out, at which point the context - * will be reclaimed (that includes releasing ownership of the PMU). - * - * So there might be a window of time where the number of per-task session is zero - * yet one PMU might have a owner and get at most one overflow interrupt for a zombie - * context. This is safe because if a per-task session comes in, it will push this one - * out and by the virtue on pfm_save_regs(), this one will disappear. 
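
The increment by 1 + ovfl_val in the update loop above is how a narrow hardware counter is promoted to a 64-bit virtual one: only the implemented low bits live in the PMD register, everything above them accumulates in ctx_pmds[].val, and a 64-bit wrap of the combined value is what raises the overflow notification. The stand-alone model below arms a sampling period and detects that wrap; the 47-bit width and the period are made-up values, the kernel takes the real mask from pmu_conf->ovfl_val.

    /* Minimal stand-alone model of the 64-bit "virtual counter" arithmetic. */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t ovfl_val = (1ULL << 47) - 1;   /* mask of HW-implemented bits */
        const uint64_t period   = 100000;             /* desired sampling period     */

        /* arm the counter: the full 64-bit value wraps after `period` events;
         * only the low bits are written to the hardware PMD */
        uint64_t val    = 0 - period;
        uint64_t hw_pmd = val &  ovfl_val;            /* goes into the PMD register  */
        uint64_t soft   = val & ~ovfl_val;            /* kept in ctx_pmds[].val      */

        /* a hardware overflow interrupt arrives: fold it into the soft copy,
         * exactly like "new_val += 1 + ovfl_val" in the handler above */
        uint64_t old_val = soft, new_val = soft + 1 + ovfl_val;

        printf("hw_pmd=0x%llx 64-bit overflow=%s\n",
               (unsigned long long)hw_pmd,
               old_val > new_val ? "yes (notify)" : "no");
        return 0;
    }
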
If a system wide - * session is force on that CPU, given that we use task pinning, pfm_save_regs() will - * also push our zombie context out. - * - * Overall pretty hairy stuff.... - */ - DPRINT(("ctx is zombie for [%d], converted to spurious\n", task ? task->pid: -1)); - pfm_clear_psr_up(); - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; - return; -} - -static int -pfm_do_interrupt_handler(int irq, void *arg, struct pt_regs *regs) -{ - struct task_struct *task; - pfm_context_t *ctx; - unsigned long flags; - u64 pmc0; - int this_cpu = smp_processor_id(); - int retval = 0; - - pfm_stats[this_cpu].pfm_ovfl_intr_count++; - - /* - * srlz.d done before arriving here - */ - pmc0 = ia64_get_pmc(0); - - task = GET_PMU_OWNER(); - ctx = GET_PMU_CTX(); - - /* - * if we have some pending bits set - * assumes : if any PMC0.bit[63-1] is set, then PMC0.fr = 1 - */ - if (PMC0_HAS_OVFL(pmc0) && task) { - /* - * we assume that pmc0.fr is always set here - */ - - /* sanity check */ - if (!ctx) goto report_spurious1; - - if (ctx->ctx_fl_system == 0 && (task->thread.flags & IA64_THREAD_PM_VALID) == 0) - goto report_spurious2; - - PROTECT_CTX_NOPRINT(ctx, flags); - - pfm_overflow_handler(task, ctx, pmc0, regs); - - UNPROTECT_CTX_NOPRINT(ctx, flags); - - } else { - pfm_stats[this_cpu].pfm_spurious_ovfl_intr_count++; - retval = -1; - } - /* - * keep it unfrozen at all times - */ - pfm_unfreeze_pmu(); - - return retval; - -report_spurious1: - printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d has no PFM context\n", - this_cpu, task->pid); - pfm_unfreeze_pmu(); - return -1; -report_spurious2: - printk(KERN_INFO "perfmon: spurious overflow interrupt on CPU%d: process %d, invalid flag\n", - this_cpu, - task->pid); - pfm_unfreeze_pmu(); - return -1; -} - -static irqreturn_t -pfm_interrupt_handler(int irq, void *arg) -{ - unsigned long start_cycles, total_cycles; - unsigned long min, max; - int this_cpu; - int ret; - struct pt_regs *regs = get_irq_regs(); - - this_cpu = get_cpu(); - if (likely(!pfm_alt_intr_handler)) { - min = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min; - max = pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max; - - start_cycles = ia64_get_itc(); - - ret = pfm_do_interrupt_handler(irq, arg, regs); - - total_cycles = ia64_get_itc(); - - /* - * don't measure spurious interrupts - */ - if (likely(ret == 0)) { - total_cycles -= start_cycles; - - if (total_cycles < min) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_min = total_cycles; - if (total_cycles > max) pfm_stats[this_cpu].pfm_ovfl_intr_cycles_max = total_cycles; - - pfm_stats[this_cpu].pfm_ovfl_intr_cycles += total_cycles; - } - } - else { - (*pfm_alt_intr_handler->handler)(irq, arg, regs); - } - - put_cpu_no_resched(); - return IRQ_HANDLED; -} - -/* - * /proc/perfmon interface, for debug only - */ - -#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1) - -static void * -pfm_proc_start(struct seq_file *m, loff_t *pos) -{ - if (*pos == 0) { - return PFM_PROC_SHOW_HEADER; - } - - while (*pos <= NR_CPUS) { - if (cpu_online(*pos - 1)) { - return (void *)*pos; - } - ++*pos; - } - return NULL; -} - -static void * -pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) -{ - ++*pos; - return pfm_proc_start(m, pos); -} - -static void -pfm_proc_stop(struct seq_file *m, void *v) -{ -} - -static void -pfm_proc_show_header(struct seq_file *m) -{ - struct list_head * pos; - pfm_buffer_fmt_t * entry; - unsigned long flags; - - seq_printf(m, - "perfmon version : %u.%u\n" - "model : %s\n" - "fastctxsw : %s\n" - "expert mode : %s\n" - "ovfl_mask : 
0x%lx\n" - "PMU flags : 0x%x\n", - PFM_VERSION_MAJ, PFM_VERSION_MIN, - pmu_conf->pmu_name, - pfm_sysctl.fastctxsw > 0 ? "Yes": "No", - pfm_sysctl.expert_mode > 0 ? "Yes": "No", - pmu_conf->ovfl_val, - pmu_conf->flags); - - LOCK_PFS(flags); - - seq_printf(m, - "proc_sessions : %u\n" - "sys_sessions : %u\n" - "sys_use_dbregs : %u\n" - "ptrace_use_dbregs : %u\n", - pfm_sessions.pfs_task_sessions, - pfm_sessions.pfs_sys_sessions, - pfm_sessions.pfs_sys_use_dbregs, - pfm_sessions.pfs_ptrace_use_dbregs); - - UNLOCK_PFS(flags); - - spin_lock(&pfm_buffer_fmt_lock); - - list_for_each(pos, &pfm_buffer_fmt_list) { - entry = list_entry(pos, pfm_buffer_fmt_t, fmt_list); - seq_printf(m, "format : %02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x-%02x %s\n", - entry->fmt_uuid[0], - entry->fmt_uuid[1], - entry->fmt_uuid[2], - entry->fmt_uuid[3], - entry->fmt_uuid[4], - entry->fmt_uuid[5], - entry->fmt_uuid[6], - entry->fmt_uuid[7], - entry->fmt_uuid[8], - entry->fmt_uuid[9], - entry->fmt_uuid[10], - entry->fmt_uuid[11], - entry->fmt_uuid[12], - entry->fmt_uuid[13], - entry->fmt_uuid[14], - entry->fmt_uuid[15], - entry->fmt_name); - } - spin_unlock(&pfm_buffer_fmt_lock); - -} - -static int -pfm_proc_show(struct seq_file *m, void *v) -{ - unsigned long psr; - unsigned int i; - int cpu; - - if (v == PFM_PROC_SHOW_HEADER) { - pfm_proc_show_header(m); - return 0; - } - - /* show info for CPU (v - 1) */ - - cpu = (long)v - 1; - seq_printf(m, - "CPU%-2d overflow intrs : %lu\n" - "CPU%-2d overflow cycles : %lu\n" - "CPU%-2d overflow min : %lu\n" - "CPU%-2d overflow max : %lu\n" - "CPU%-2d smpl handler calls : %lu\n" - "CPU%-2d smpl handler cycles : %lu\n" - "CPU%-2d spurious intrs : %lu\n" - "CPU%-2d replay intrs : %lu\n" - "CPU%-2d syst_wide : %d\n" - "CPU%-2d dcr_pp : %d\n" - "CPU%-2d exclude idle : %d\n" - "CPU%-2d owner : %d\n" - "CPU%-2d context : %p\n" - "CPU%-2d activations : %lu\n", - cpu, pfm_stats[cpu].pfm_ovfl_intr_count, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_min, - cpu, pfm_stats[cpu].pfm_ovfl_intr_cycles_max, - cpu, pfm_stats[cpu].pfm_smpl_handler_calls, - cpu, pfm_stats[cpu].pfm_smpl_handler_cycles, - cpu, pfm_stats[cpu].pfm_spurious_ovfl_intr_count, - cpu, pfm_stats[cpu].pfm_replay_ovfl_intr_count, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_SYST_WIDE ? 1 : 0, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_DCR_PP ? 1 : 0, - cpu, pfm_get_cpu_data(pfm_syst_info, cpu) & PFM_CPUINFO_EXCL_IDLE ? 1 : 0, - cpu, pfm_get_cpu_data(pmu_owner, cpu) ? pfm_get_cpu_data(pmu_owner, cpu)->pid: -1, - cpu, pfm_get_cpu_data(pmu_ctx, cpu), - cpu, pfm_get_cpu_data(pmu_activation_number, cpu)); - - if (num_online_cpus() == 1 && pfm_sysctl.debug > 0) { - - psr = pfm_get_psr(); - - ia64_srlz_d(); - - seq_printf(m, - "CPU%-2d psr : 0x%lx\n" - "CPU%-2d pmc0 : 0x%lx\n", - cpu, psr, - cpu, ia64_get_pmc(0)); - - for (i=0; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_COUNTING(i) == 0) continue; - seq_printf(m, - "CPU%-2d pmc%u : 0x%lx\n" - "CPU%-2d pmd%u : 0x%lx\n", - cpu, i, ia64_get_pmc(i), - cpu, i, ia64_get_pmd(i)); - } - } - return 0; -} - -struct seq_operations pfm_seq_ops = { - .start = pfm_proc_start, - .next = pfm_proc_next, - .stop = pfm_proc_stop, - .show = pfm_proc_show -}; - -static int -pfm_proc_open(struct inode *inode, struct file *file) -{ - return seq_open(file, &pfm_seq_ops); -} - - -/* - * we come here as soon as local_cpu_data->pfm_syst_wide is set. this happens - * during pfm_enable() hence before pfm_start(). 
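
All of the DPRINT()/DPRINT_ovfl() output in this file is gated by pfm_sysctl.debug and pfm_sysctl.debug_ovfl, the same knobs /proc/perfmon reports above; pfm_init() further down exposes them through register_sysctl_table(). Assuming they surface under /proc/sys/kernel/perfmon/ as on ia64 kernels of this era, a tool can flip them like this:

    /* Turn on perfmon debug logging; the sysctl path is an assumption,
     * check the kernel's sysctl table if it differs. */
    #include <stdio.h>

    static int set_sysctl(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");
        if (!f)
            return -1;
        fputs(val, f);
        return fclose(f);
    }

    int main(void)
    {
        set_sysctl("/proc/sys/kernel/perfmon/debug", "1");
        set_sysctl("/proc/sys/kernel/perfmon/debug_ovfl", "1");
        return 0;
    }
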
We cannot assume monitoring - * is active or inactive based on mode. We must rely on the value in - * local_cpu_data->pfm_syst_info - */ -void -pfm_syst_wide_update_task(struct task_struct *task, unsigned long info, int is_ctxswin) -{ - struct pt_regs *regs; - unsigned long dcr; - unsigned long dcr_pp; - - dcr_pp = info & PFM_CPUINFO_DCR_PP ? 1 : 0; - - /* - * pid 0 is guaranteed to be the idle task. There is one such task with pid 0 - * on every CPU, so we can rely on the pid to identify the idle task. - */ - if ((info & PFM_CPUINFO_EXCL_IDLE) == 0 || task->pid) { - regs = task_pt_regs(task); - ia64_psr(regs)->pp = is_ctxswin ? dcr_pp : 0; - return; - } - /* - * if monitoring has started - */ - if (dcr_pp) { - dcr = ia64_getreg(_IA64_REG_CR_DCR); - /* - * context switching in? - */ - if (is_ctxswin) { - /* mask monitoring for the idle task */ - ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); - pfm_clear_psr_pp(); - ia64_srlz_i(); - return; - } - /* - * context switching out - * restore monitoring for next task - * - * Due to inlining this odd if-then-else construction generates - * better code. - */ - ia64_setreg(_IA64_REG_CR_DCR, dcr |IA64_DCR_PP); - pfm_set_psr_pp(); - ia64_srlz_i(); - } -} - -#ifdef CONFIG_SMP - -static void -pfm_force_cleanup(pfm_context_t *ctx, struct pt_regs *regs) -{ - struct task_struct *task = ctx->ctx_task; - - ia64_psr(regs)->up = 0; - ia64_psr(regs)->sp = 1; - - if (GET_PMU_OWNER() == task) { - DPRINT(("cleared ownership for [%d]\n", ctx->ctx_task->pid)); - SET_PMU_OWNER(NULL, NULL); - } - - /* - * disconnect the task from the context and vice-versa - */ - PFM_SET_WORK_PENDING(task, 0); - - task->thread.pfm_context = NULL; - task->thread.flags &= ~IA64_THREAD_PM_VALID; - - DPRINT(("force cleanup for [%d]\n", task->pid)); -} - - -/* - * in 2.6, interrupts are masked when we come here and the runqueue lock is held - */ -void -pfm_save_regs(struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - u64 psr; - - - ctx = PFM_GET_CTX(task); - if (ctx == NULL) return; - - /* - * we always come here with interrupts ALREADY disabled by - * the scheduler. So we simply need to protect against concurrent - * access, not CPU concurrency. - */ - flags = pfm_protect_ctx_ctxsw(ctx); - - if (ctx->ctx_state == PFM_CTX_ZOMBIE) { - struct pt_regs *regs = task_pt_regs(task); - - pfm_clear_psr_up(); - - pfm_force_cleanup(ctx, regs); - - BUG_ON(ctx->ctx_smpl_hdr); - - pfm_unprotect_ctx_ctxsw(ctx, flags); - - pfm_context_free(ctx); - return; - } - - /* - * save current PSR: needed because we modify it - */ - ia64_srlz_d(); - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_I)); - - /* - * stop monitoring: - * This is the last instruction which may generate an overflow - * - * We do not need to set psr.sp because, it is irrelevant in kernel. - * It will be restored from ipsr when going back to user level - */ - pfm_clear_psr_up(); - - /* - * keep a copy of psr.up (for reload) - */ - ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; - - /* - * release ownership of this PMU. - * PM interrupts are masked, so nothing - * can happen. - */ - SET_PMU_OWNER(NULL, NULL); - - /* - * we systematically save the PMD as we have no - * guarantee we will be schedule at that same - * CPU again. - */ - pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); - - /* - * save pmc0 ia64_srlz_d() done in pfm_save_pmds() - * we will need it on the restore path to check - * for pending overflow. 
- */ - ctx->th_pmcs[0] = ia64_get_pmc(0); - - /* - * unfreeze PMU if had pending overflows - */ - if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); - - /* - * finally, allow context access. - * interrupts will still be masked after this call. - */ - pfm_unprotect_ctx_ctxsw(ctx, flags); -} - -#else /* !CONFIG_SMP */ -void -pfm_save_regs(struct task_struct *task) -{ - pfm_context_t *ctx; - u64 psr; - - ctx = PFM_GET_CTX(task); - if (ctx == NULL) return; - - /* - * save current PSR: needed because we modify it - */ - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_I)); - - /* - * stop monitoring: - * This is the last instruction which may generate an overflow - * - * We do not need to set psr.sp because, it is irrelevant in kernel. - * It will be restored from ipsr when going back to user level - */ - pfm_clear_psr_up(); - - /* - * keep a copy of psr.up (for reload) - */ - ctx->ctx_saved_psr_up = psr & IA64_PSR_UP; -} - -static void -pfm_lazy_save_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long flags; - - { u64 psr = pfm_get_psr(); - BUG_ON(psr & IA64_PSR_UP); - } - - ctx = PFM_GET_CTX(task); - - /* - * we need to mask PMU overflow here to - * make sure that we maintain pmc0 until - * we save it. overflow interrupts are - * treated as spurious if there is no - * owner. - * - * XXX: I don't think this is necessary - */ - PROTECT_CTX(ctx,flags); - - /* - * release ownership of this PMU. - * must be done before we save the registers. - * - * after this call any PMU interrupt is treated - * as spurious. - */ - SET_PMU_OWNER(NULL, NULL); - - /* - * save all the pmds we use - */ - pfm_save_pmds(ctx->th_pmds, ctx->ctx_used_pmds[0]); - - /* - * save pmc0 ia64_srlz_d() done in pfm_save_pmds() - * it is needed to check for pended overflow - * on the restore path - */ - ctx->th_pmcs[0] = ia64_get_pmc(0); - - /* - * unfreeze PMU if had pending overflows - */ - if (ctx->th_pmcs[0] & ~0x1UL) pfm_unfreeze_pmu(); - - /* - * now get can unmask PMU interrupts, they will - * be treated as purely spurious and we will not - * lose any information - */ - UNPROTECT_CTX(ctx,flags); -} -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_SMP -/* - * in 2.6, interrupts are masked when we come here and the runqueue lock is held - */ -void -pfm_load_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - unsigned long pmc_mask = 0UL, pmd_mask = 0UL; - unsigned long flags; - u64 psr, psr_up; - int need_irq_resend; - - ctx = PFM_GET_CTX(task); - if (unlikely(ctx == NULL)) return; - - BUG_ON(GET_PMU_OWNER()); - - /* - * possible on unload - */ - if (unlikely((task->thread.flags & IA64_THREAD_PM_VALID) == 0)) return; - - /* - * we always come here with interrupts ALREADY disabled by - * the scheduler. So we simply need to protect against concurrent - * access, not CPU concurrency. - */ - flags = pfm_protect_ctx_ctxsw(ctx); - psr = pfm_get_psr(); - - need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; - - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(psr & IA64_PSR_I); - - if (unlikely(ctx->ctx_state == PFM_CTX_ZOMBIE)) { - struct pt_regs *regs = task_pt_regs(task); - - BUG_ON(ctx->ctx_smpl_hdr); - - pfm_force_cleanup(ctx, regs); - - pfm_unprotect_ctx_ctxsw(ctx, flags); - - /* - * this one (kmalloc'ed) is fine with interrupts disabled - */ - pfm_context_free(ctx); - - return; - } - - /* - * we restore ALL the debug registers to avoid picking up - * stale state. 
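
On UP kernels the spill is lazy: pfm_save_regs() above only clears psr.up and the PMD values stay live in the hardware until some other context needs the PMU, at which point pfm_lazy_save_regs() pushes the previous owner out. The toy model below restates that ownership discipline with made-up types standing in for pfm_context_t and the register file; the short path it takes when the owner is unchanged is the same one the UP pfm_load_regs() further down takes.

    /* Toy model of lazy PMU ownership (UP case); types are illustrative only. */
    #include <stdio.h>
    #include <string.h>

    #define NREGS 8

    struct toy_ctx {
        unsigned long saved[NREGS];   /* software copy, like ctx->th_pmds[]   */
        int valid;                    /* do we hold a spilled copy?           */
    };

    static unsigned long pmu_regs[NREGS];   /* the one shared hardware PMU    */
    static struct toy_ctx *pmu_owner;       /* like GET_PMU_OWNER()           */

    static void lazy_save(struct toy_ctx *old)
    {
        memcpy(old->saved, pmu_regs, sizeof(pmu_regs));   /* spill only now   */
        old->valid = 1;
        pmu_owner = NULL;                                 /* release ownership */
    }

    static void load_regs(struct toy_ctx *ctx)
    {
        if (pmu_owner == ctx)
            return;                      /* short path: state is still live   */
        if (pmu_owner)
            lazy_save(pmu_owner);        /* push the previous owner out       */
        if (ctx->valid)
            memcpy(pmu_regs, ctx->saved, sizeof(pmu_regs));
        pmu_owner = ctx;                 /* like SET_PMU_OWNER()              */
    }

    int main(void)
    {
        struct toy_ctx a = { {0}, 0 }, b = { {0}, 0 };

        load_regs(&a);   /* A owns the PMU                              */
        load_regs(&a);   /* fast path: nothing to reload                */
        load_regs(&b);   /* A is lazily spilled only now, then B loads  */
        printf("owner is %s\n", pmu_owner == &b ? "b" : "a");
        return 0;
    }
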
- */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - /* - * retrieve saved psr.up - */ - psr_up = ctx->ctx_saved_psr_up; - - /* - * if we were the last user of the PMU on that CPU, - * then nothing to do except restore psr - */ - if (GET_LAST_CPU(ctx) == smp_processor_id() && ctx->ctx_last_activation == GET_ACTIVATION()) { - - /* - * retrieve partial reload masks (due to user modifications) - */ - pmc_mask = ctx->ctx_reload_pmcs[0]; - pmd_mask = ctx->ctx_reload_pmds[0]; - - } else { - /* - * To avoid leaking information to the user level when psr.sp=0, - * we must reload ALL implemented pmds (even the ones we don't use). - * In the kernel we only allow PFM_READ_PMDS on registers which - * we initialized or requested (sampling) so there is no risk there. - */ - pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; - - /* - * ALL accessible PMCs are systematically reloaded, unused registers - * get their default (from pfm_reset_pmu_state()) values to avoid picking - * up stale configuration. - * - * PMC0 is never in the mask. It is always restored separately. - */ - pmc_mask = ctx->ctx_all_pmcs[0]; - } - /* - * when context is MASKED, we will restore PMC with plm=0 - * and PMD with stale information, but that's ok, nothing - * will be captured. - * - * XXX: optimize here - */ - if (pmd_mask) pfm_restore_pmds(ctx->th_pmds, pmd_mask); - if (pmc_mask) pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); - - /* - * check for pending overflow at the time the state - * was saved. - */ - if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { - /* - * reload pmc0 with the overflow information - * On McKinley PMU, this will trigger a PMU interrupt - */ - ia64_set_pmc(0, ctx->th_pmcs[0]); - ia64_srlz_d(); - ctx->th_pmcs[0] = 0UL; - - /* - * will replay the PMU interrupt - */ - if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); - - pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; - } - - /* - * we just did a reload, so we reset the partial reload fields - */ - ctx->ctx_reload_pmcs[0] = 0UL; - ctx->ctx_reload_pmds[0] = 0UL; - - SET_LAST_CPU(ctx, smp_processor_id()); - - /* - * dump activation value for this PMU - */ - INC_ACTIVATION(); - /* - * record current activation for this context - */ - SET_ACTIVATION(ctx); - - /* - * establish new ownership. - */ - SET_PMU_OWNER(task, ctx); - - /* - * restore the psr.up bit. measurement - * is active again. - * no PMU interrupt can happen at this point - * because we still have interrupts disabled. - */ - if (likely(psr_up)) pfm_set_psr_up(); - - /* - * allow concurrent access to context - */ - pfm_unprotect_ctx_ctxsw(ctx, flags); -} -#else /* !CONFIG_SMP */ -/* - * reload PMU state for UP kernels - * in 2.5 we come here with interrupts disabled - */ -void -pfm_load_regs (struct task_struct *task) -{ - pfm_context_t *ctx; - struct task_struct *owner; - unsigned long pmd_mask, pmc_mask; - u64 psr, psr_up; - int need_irq_resend; - - owner = GET_PMU_OWNER(); - ctx = PFM_GET_CTX(task); - psr = pfm_get_psr(); - - BUG_ON(psr & (IA64_PSR_UP|IA64_PSR_PP)); - BUG_ON(psr & IA64_PSR_I); - - /* - * we restore ALL the debug registers to avoid picking up - * stale state. - * - * This must be done even when the task is still the owner - * as the registers may have been modified via ptrace() - * (not perfmon) by the previous task. 
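
On SMP the equivalent short path is keyed on an activation number: each CPU counts PMU activations, and a context remembers the CPU and the activation at which it last owned the hardware (SET_LAST_CPU()/SET_ACTIVATION() above). If both still match on switch-in, only the registers modified in between, tracked by the reload masks, have to be rewritten. A compressed model of that test, with hypothetical names:

    /* Toy model of the GET_LAST_CPU()/GET_ACTIVATION() short path. */
    #include <stdio.h>

    struct toy_ctx {
        int last_cpu;                 /* like SET_LAST_CPU(ctx, ...)          */
        unsigned long last_act;       /* like ctx->ctx_last_activation        */
    };

    static unsigned long cpu_activation[2];   /* per-CPU activation counter   */

    static int needs_full_reload(struct toy_ctx *ctx, int cpu)
    {
        /* state survived iff nobody else activated the PMU on this CPU
         * since we last ran here */
        return !(ctx->last_cpu == cpu &&
                 ctx->last_act == cpu_activation[cpu]);
    }

    static void load(struct toy_ctx *ctx, int cpu)
    {
        printf("%s reload\n", needs_full_reload(ctx, cpu) ? "full" : "partial");
        cpu_activation[cpu]++;                /* like INC_ACTIVATION()        */
        ctx->last_cpu = cpu;
        ctx->last_act = cpu_activation[cpu];  /* like SET_ACTIVATION(ctx)     */
    }

    int main(void)
    {
        struct toy_ctx a = { -1, 0 }, b = { -1, 0 };

        load(&a, 0);   /* full: first time on CPU 0             */
        load(&a, 0);   /* partial: still the last activation    */
        load(&b, 0);   /* full: b never ran here                */
        load(&a, 0);   /* full: b bumped the activation count   */
        return 0;
    }
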
- */ - if (ctx->ctx_fl_using_dbreg) { - pfm_restore_ibrs(ctx->ctx_ibrs, pmu_conf->num_ibrs); - pfm_restore_dbrs(ctx->ctx_dbrs, pmu_conf->num_dbrs); - } - - /* - * retrieved saved psr.up - */ - psr_up = ctx->ctx_saved_psr_up; - need_irq_resend = pmu_conf->flags & PFM_PMU_IRQ_RESEND; - - /* - * short path, our state is still there, just - * need to restore psr and we go - * - * we do not touch either PMC nor PMD. the psr is not touched - * by the overflow_handler. So we are safe w.r.t. to interrupt - * concurrency even without interrupt masking. - */ - if (likely(owner == task)) { - if (likely(psr_up)) pfm_set_psr_up(); - return; - } - - /* - * someone else is still using the PMU, first push it out and - * then we'll be able to install our stuff ! - * - * Upon return, there will be no owner for the current PMU - */ - if (owner) pfm_lazy_save_regs(owner); - - /* - * To avoid leaking information to the user level when psr.sp=0, - * we must reload ALL implemented pmds (even the ones we don't use). - * In the kernel we only allow PFM_READ_PMDS on registers which - * we initialized or requested (sampling) so there is no risk there. - */ - pmd_mask = pfm_sysctl.fastctxsw ? ctx->ctx_used_pmds[0] : ctx->ctx_all_pmds[0]; - - /* - * ALL accessible PMCs are systematically reloaded, unused registers - * get their default (from pfm_reset_pmu_state()) values to avoid picking - * up stale configuration. - * - * PMC0 is never in the mask. It is always restored separately - */ - pmc_mask = ctx->ctx_all_pmcs[0]; - - pfm_restore_pmds(ctx->th_pmds, pmd_mask); - pfm_restore_pmcs(ctx->th_pmcs, pmc_mask); - - /* - * check for pending overflow at the time the state - * was saved. - */ - if (unlikely(PMC0_HAS_OVFL(ctx->th_pmcs[0]))) { - /* - * reload pmc0 with the overflow information - * On McKinley PMU, this will trigger a PMU interrupt - */ - ia64_set_pmc(0, ctx->th_pmcs[0]); - ia64_srlz_d(); - - ctx->th_pmcs[0] = 0UL; - - /* - * will replay the PMU interrupt - */ - if (need_irq_resend) ia64_resend_irq(IA64_PERFMON_VECTOR); - - pfm_stats[smp_processor_id()].pfm_replay_ovfl_intr_count++; - } - - /* - * establish new ownership. - */ - SET_PMU_OWNER(task, ctx); - - /* - * restore the psr.up bit. measurement - * is active again. - * no PMU interrupt can happen at this point - * because we still have interrupts disabled. - */ - if (likely(psr_up)) pfm_set_psr_up(); -} -#endif /* CONFIG_SMP */ - -/* - * this function assumes monitoring is stopped - */ -static void -pfm_flush_pmds(struct task_struct *task, pfm_context_t *ctx) -{ - u64 pmc0; - unsigned long mask2, val, pmd_val, ovfl_val; - int i, can_access_pmu = 0; - int is_self; - - /* - * is the caller the task being monitored (or which initiated the - * session for system wide measurements) - */ - is_self = ctx->ctx_task == task ? 1 : 0; - - /* - * can access PMU is task is the owner of the PMU state on the current CPU - * or if we are running on the CPU bound to the context in system-wide mode - * (that is not necessarily the task the context is attached to in this mode). - * In system-wide we always have can_access_pmu true because a task running on an - * invalid processor is flagged earlier in the call stack (see pfm_stop). 
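
pfm_flush_pmds() below folds the live hardware bits back into the software copy when a context is detached, and the same formula is what PFM_READ_PMDS ultimately reports: the soft upper bits plus the masked hardware value, plus one extra wrap if pmc0 says the counter overflowed before it could be read. A short worked example, reusing the made-up 47-bit width from earlier:

    /* Reconstructing the 64-bit counter value on read-out (47-bit HW width assumed). */
    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const uint64_t ovfl_val = (1ULL << 47) - 1;   /* pmu_conf->ovfl_val          */
        uint64_t soft    = 3 * (ovfl_val + 1);        /* upper bits, ctx_pmds[].val  */
        uint64_t hw_pmd  = 0x1234;                    /* live low bits in the PMD    */
        int pending_ovfl = 1;                         /* pmc0 bit for this counter   */

        uint64_t val = soft + (hw_pmd & ovfl_val);    /* fold hardware into soft     */
        if (pending_ovfl)
            val += 1 + ovfl_val;                      /* overflow taken "inline"     */

        printf("counter = 0x%llx\n", (unsigned long long)val);
        return 0;
    }
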
- */ - can_access_pmu = (GET_PMU_OWNER() == task) || (ctx->ctx_fl_system && ctx->ctx_cpu == smp_processor_id()); - if (can_access_pmu) { - /* - * Mark the PMU as not owned - * This will cause the interrupt handler to do nothing in case an overflow - * interrupt was in-flight - * This also guarantees that pmc0 will contain the final state - * It virtually gives us full control on overflow processing from that point - * on. - */ - SET_PMU_OWNER(NULL, NULL); - DPRINT(("releasing ownership\n")); - - /* - * read current overflow status: - * - * we are guaranteed to read the final stable state - */ - ia64_srlz_d(); - pmc0 = ia64_get_pmc(0); /* slow */ - - /* - * reset freeze bit, overflow status information destroyed - */ - pfm_unfreeze_pmu(); - } else { - pmc0 = ctx->th_pmcs[0]; - /* - * clear whatever overflow status bits there were - */ - ctx->th_pmcs[0] = 0; - } - ovfl_val = pmu_conf->ovfl_val; - /* - * we save all the used pmds - * we take care of overflows for counting PMDs - * - * XXX: sampling situation is not taken into account here - */ - mask2 = ctx->ctx_used_pmds[0]; - - DPRINT(("is_self=%d ovfl_val=0x%lx mask2=0x%lx\n", is_self, ovfl_val, mask2)); - - for (i = 0; mask2; i++, mask2>>=1) { - - /* skip non used pmds */ - if ((mask2 & 0x1) == 0) continue; - - /* - * can access PMU always true in system wide mode - */ - val = pmd_val = can_access_pmu ? ia64_get_pmd(i) : ctx->th_pmds[i]; - - if (PMD_IS_COUNTING(i)) { - DPRINT(("[%d] pmd[%d] ctx_pmd=0x%lx hw_pmd=0x%lx\n", - task->pid, - i, - ctx->ctx_pmds[i].val, - val & ovfl_val)); - - /* - * we rebuild the full 64 bit value of the counter - */ - val = ctx->ctx_pmds[i].val + (val & ovfl_val); - - /* - * now everything is in ctx_pmds[] and we need - * to clear the saved context from save_regs() such that - * pfm_read_pmds() gets the correct value - */ - pmd_val = 0UL; - - /* - * take care of overflow inline - */ - if (pmc0 & (1UL << i)) { - val += 1 + ovfl_val; - DPRINT(("[%d] pmd[%d] overflowed\n", task->pid, i)); - } - } - - DPRINT(("[%d] ctx_pmd[%d]=0x%lx pmd_val=0x%lx\n", task->pid, i, val, pmd_val)); - - if (is_self) ctx->th_pmds[i] = pmd_val; - - ctx->ctx_pmds[i].val = val; - } -} - -static struct irqaction perfmon_irqaction = { - .handler = pfm_interrupt_handler, - .flags = IRQF_DISABLED, - .name = "perfmon" -}; - -static void -pfm_alt_save_pmu_state(void *data) -{ - struct pt_regs *regs; - - regs = task_pt_regs(current); - - DPRINT(("called\n")); - - /* - * should not be necessary but - * let's take not risk - */ - pfm_clear_psr_up(); - pfm_clear_psr_pp(); - ia64_psr(regs)->pp = 0; - - /* - * This call is required - * May cause a spurious interrupt on some processors - */ - pfm_freeze_pmu(); - - ia64_srlz_d(); -} - -void -pfm_alt_restore_pmu_state(void *data) -{ - struct pt_regs *regs; - - regs = task_pt_regs(current); - - DPRINT(("called\n")); - - /* - * put PMU back in state expected - * by perfmon - */ - pfm_clear_psr_up(); - pfm_clear_psr_pp(); - ia64_psr(regs)->pp = 0; - - /* - * perfmon runs with PMU unfrozen at all times - */ - pfm_unfreeze_pmu(); - - ia64_srlz_d(); -} - -int -pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) -{ - int ret, i; - int reserve_cpu; - - /* some sanity checks */ - if (hdl == NULL || hdl->handler == NULL) return -EINVAL; - - /* do the easy test first */ - if (pfm_alt_intr_handler) return -EBUSY; - - /* one at a time in the install or remove, just fail the others */ - if (!spin_trylock(&pfm_alt_install_check)) { - return -EBUSY; - } - - /* reserve our session */ - 
for_each_online_cpu(reserve_cpu) { - ret = pfm_reserve_session(NULL, 1, reserve_cpu); - if (ret) goto cleanup_reserve; - } - - /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - goto cleanup_reserve; - } - - /* officially change to the alternate interrupt handler */ - pfm_alt_intr_handler = hdl; - - spin_unlock(&pfm_alt_install_check); - - return 0; - -cleanup_reserve: - for_each_online_cpu(i) { - /* don't unreserve more than we reserved */ - if (i >= reserve_cpu) break; - - pfm_unreserve_session(NULL, 1, i); - } - - spin_unlock(&pfm_alt_install_check); - - return ret; -} -EXPORT_SYMBOL_GPL(pfm_install_alt_pmu_interrupt); - -int -pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) -{ - int i; - int ret; - - if (hdl == NULL) return -EINVAL; - - /* cannot remove someone else's handler! */ - if (pfm_alt_intr_handler != hdl) return -EINVAL; - - /* one at a time in the install or remove, just fail the others */ - if (!spin_trylock(&pfm_alt_install_check)) { - return -EBUSY; - } - - pfm_alt_intr_handler = NULL; - - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); - if (ret) { - DPRINT(("on_each_cpu() failed: %d\n", ret)); - } - - for_each_online_cpu(i) { - pfm_unreserve_session(NULL, 1, i); - } - - spin_unlock(&pfm_alt_install_check); - - return 0; -} -EXPORT_SYMBOL_GPL(pfm_remove_alt_pmu_interrupt); - -/* - * perfmon initialization routine, called from the initcall() table - */ -static int init_pfm_fs(void); - -static int __init -pfm_probe_pmu(void) -{ - pmu_config_t **p; - int family; - - family = local_cpu_data->family; - p = pmu_confs; - - while(*p) { - if ((*p)->probe) { - if ((*p)->probe() == 0) goto found; - } else if ((*p)->pmu_family == family || (*p)->pmu_family == 0xff) { - goto found; - } - p++; - } - return -1; -found: - pmu_conf = *p; - return 0; -} - -static const struct file_operations pfm_proc_fops = { - .open = pfm_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -int __init -pfm_init(void) -{ - unsigned int n, n_counters, i; - - printk("perfmon: version %u.%u IRQ %u\n", - PFM_VERSION_MAJ, - PFM_VERSION_MIN, - IA64_PERFMON_VECTOR); - - if (pfm_probe_pmu()) { - printk(KERN_INFO "perfmon: disabled, there is no support for processor family %d\n", - local_cpu_data->family); - return -ENODEV; - } - - /* - * compute the number of implemented PMD/PMC from the - * description tables - */ - n = 0; - for (i=0; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - pmu_conf->impl_pmcs[i>>6] |= 1UL << (i&63); - n++; - } - pmu_conf->num_pmcs = n; - - n = 0; n_counters = 0; - for (i=0; PMD_IS_LAST(i) == 0; i++) { - if (PMD_IS_IMPL(i) == 0) continue; - pmu_conf->impl_pmds[i>>6] |= 1UL << (i&63); - n++; - if (PMD_IS_COUNTING(i)) n_counters++; - } - pmu_conf->num_pmds = n; - pmu_conf->num_counters = n_counters; - - /* - * sanity checks on the number of debug registers - */ - if (pmu_conf->use_rr_dbregs) { - if (pmu_conf->num_ibrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of code debug registers (%u)\n", pmu_conf->num_ibrs); - pmu_conf = NULL; - return -1; - } - if (pmu_conf->num_dbrs > IA64_NUM_DBG_REGS) { - printk(KERN_INFO "perfmon: unsupported number of data debug registers (%u)\n", pmu_conf->num_ibrs); - pmu_conf = NULL; - return -1; - } - } - - printk("perfmon: %s PMU detected, %u PMCs, %u PMDs, %u counters (%lu bits)\n", - pmu_conf->pmu_name, - pmu_conf->num_pmcs, - 
pmu_conf->num_pmds, - pmu_conf->num_counters, - ffz(pmu_conf->ovfl_val)); - - /* sanity check */ - if (pmu_conf->num_pmds >= PFM_NUM_PMD_REGS || pmu_conf->num_pmcs >= PFM_NUM_PMC_REGS) { - printk(KERN_ERR "perfmon: not enough pmc/pmd, perfmon disabled\n"); - pmu_conf = NULL; - return -1; - } - - /* - * create /proc/perfmon (mostly for debugging purposes) - */ - perfmon_dir = create_proc_entry("perfmon", S_IRUGO, NULL); - if (perfmon_dir == NULL) { - printk(KERN_ERR "perfmon: cannot create /proc entry, perfmon disabled\n"); - pmu_conf = NULL; - return -1; - } - /* - * install customized file operations for /proc/perfmon entry - */ - perfmon_dir->proc_fops = &pfm_proc_fops; - - /* - * create /proc/sys/kernel/perfmon (for debugging purposes) - */ - pfm_sysctl_header = register_sysctl_table(pfm_sysctl_root); - - /* - * initialize all our spinlocks - */ - spin_lock_init(&pfm_sessions.pfs_lock); - spin_lock_init(&pfm_buffer_fmt_lock); - - init_pfm_fs(); - - for(i=0; i < NR_CPUS; i++) pfm_stats[i].pfm_ovfl_intr_cycles_min = ~0UL; - - return 0; -} - -__initcall(pfm_init); - -/* - * this function is called before pfm_init() - */ -void -pfm_init_percpu (void) -{ - static int first_time=1; - /* - * make sure no measurement is active - * (may inherit programmed PMCs from EFI). - */ - pfm_clear_psr_pp(); - pfm_clear_psr_up(); - - /* - * we run with the PMU not frozen at all times - */ - pfm_unfreeze_pmu(); - - if (first_time) { - register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); - first_time=0; - } - - ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); - ia64_srlz_d(); -} - -/* - * used for debug purposes only - */ -void -dump_pmu_state(const char *from) -{ - struct task_struct *task; - struct pt_regs *regs; - pfm_context_t *ctx; - unsigned long psr, dcr, info, flags; - int i, this_cpu; - - local_irq_save(flags); - - this_cpu = smp_processor_id(); - regs = task_pt_regs(current); - info = PFM_CPUINFO_GET(); - dcr = ia64_getreg(_IA64_REG_CR_DCR); - - if (info == 0 && ia64_psr(regs)->pp == 0 && (dcr & IA64_DCR_PP) == 0) { - local_irq_restore(flags); - return; - } - - printk("CPU%d from %s() current [%d] iip=0x%lx %s\n", - this_cpu, - from, - current->pid, - regs->cr_iip, - current->comm); - - task = GET_PMU_OWNER(); - ctx = GET_PMU_CTX(); - - printk("->CPU%d owner [%d] ctx=%p\n", this_cpu, task ? task->pid : -1, ctx); - - psr = pfm_get_psr(); - - printk("->CPU%d pmc0=0x%lx psr.pp=%d psr.up=%d dcr.pp=%d syst_info=0x%lx user_psr.up=%d user_psr.pp=%d\n", - this_cpu, - ia64_get_pmc(0), - psr & IA64_PSR_PP ? 1 : 0, - psr & IA64_PSR_UP ? 1 : 0, - dcr & IA64_DCR_PP ? 1 : 0, - info, - ia64_psr(regs)->up, - ia64_psr(regs)->pp); - - ia64_psr(regs)->up = 0; - ia64_psr(regs)->pp = 0; - - for (i=1; PMC_IS_LAST(i) == 0; i++) { - if (PMC_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmc[%d]=0x%lx thread_pmc[%d]=0x%lx\n", this_cpu, i, ia64_get_pmc(i), i, ctx->th_pmcs[i]); - } - - for (i=1; PMD_IS_LAST(i) == 0; i++) { - if (PMD_IS_IMPL(i) == 0) continue; - printk("->CPU%d pmd[%d]=0x%lx thread_pmd[%d]=0x%lx\n", this_cpu, i, ia64_get_pmd(i), i, ctx->th_pmds[i]); - } - - if (ctx) { - printk("->CPU%d ctx_state=%d vaddr=%p addr=%p fd=%d ctx_task=[%d] saved_psr_up=0x%lx\n", - this_cpu, - ctx->ctx_state, - ctx->ctx_smpl_vaddr, - ctx->ctx_smpl_hdr, - ctx->ctx_msgq_head, - ctx->ctx_msgq_tail, - ctx->ctx_saved_psr_up); - } - local_irq_restore(flags); -} - -/* - * called from process.c:copy_thread(). task is new child. 
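
pfm_init() above derives num_pmcs/num_pmds by walking the description tables and setting bits in the impl_pmcs/impl_pmds arrays, and the counter width it prints is simply the index of the first zero bit of ovfl_val. The fragment below restates that bit arithmetic in isolation, using the same i>>6 / i&63 word split; it assumes a 64-bit target, as ia64 is.

    /* The impl-register bitmap arithmetic from pfm_init(), in isolation. */
    #include <stdio.h>

    #define MAX_REGS 256

    static unsigned long impl[MAX_REGS / 64];

    static void set_impl(unsigned int i) { impl[i >> 6] |= 1UL << (i & 63); }
    static int  is_impl(unsigned int i)  { return (impl[i >> 6] >> (i & 63)) & 0x1; }

    /* index of the first zero bit, i.e. the counter width when applied to an
     * all-ones overflow mask such as 2^47 - 1 (what ffz(ovfl_val) computes) */
    static unsigned int first_zero_bit(unsigned long x)
    {
        unsigned int n = 0;
        while (x & 1UL) { x >>= 1; n++; }
        return n;
    }

    int main(void)
    {
        set_impl(4);
        set_impl(70);
        printf("pmc4 impl=%d pmc5 impl=%d pmc70 impl=%d\n",
               is_impl(4), is_impl(5), is_impl(70));
        printf("counter width = %u bits\n", first_zero_bit((1UL << 47) - 1));
        return 0;
    }
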
- */ -void -pfm_inherit(struct task_struct *task, struct pt_regs *regs) -{ - struct thread_struct *thread; - - DPRINT(("perfmon: pfm_inherit clearing state for [%d]\n", task->pid)); - - thread = &task->thread; - - /* - * cut links inherited from parent (current) - */ - thread->pfm_context = NULL; - - PFM_SET_WORK_PENDING(task, 0); - - /* - * the psr bits are already set properly in copy_threads() - */ -} -#else /* !CONFIG_PERFMON */ -asmlinkage long -sys_perfmonctl (int fd, int cmd, void *arg, int count) -{ - return -ENOSYS; -} -#endif /* CONFIG_PERFMON */ Index: linux-2.6/arch/ia64/kernel/perfmon_default_smpl.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_default_smpl.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (C) 2002-2003 Hewlett-Packard Co - * Stephane Eranian - * - * This file implements the default sampling buffer format - * for the Linux/ia64 perfmon-2 subsystem. - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -MODULE_AUTHOR("Stephane Eranian "); -MODULE_DESCRIPTION("perfmon default sampling format"); -MODULE_LICENSE("GPL"); - -#define DEFAULT_DEBUG 1 - -#ifdef DEFAULT_DEBUG -#define DPRINT(a) \ - do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#define DPRINT_ovfl(a) \ - do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __FUNCTION__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#else -#define DPRINT(a) -#define DPRINT_ovfl(a) -#endif - -static int -default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; - int ret = 0; - - if (data == NULL) { - DPRINT(("[%d] no argument passed\n", task->pid)); - return -EINVAL; - } - - DPRINT(("[%d] validate flags=0x%x CPU%d\n", task->pid, flags, cpu)); - - /* - * must hold at least the buffer header + one minimally sized entry - */ - if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; - - DPRINT(("buf_size=%lu\n", arg->buf_size)); - - return ret; -} - -static int -default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - /* - * size has been validated in default_validate - */ - *size = arg->buf_size; - - return 0; -} - -static int -default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; - hdr->hdr_buf_size = arg->buf_size; - hdr->hdr_cur_offs = sizeof(*hdr); - hdr->hdr_overflows = 0UL; - hdr->hdr_count = 0UL; - - DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", - task->pid, - buf, - hdr->hdr_buf_size, - sizeof(*hdr), - hdr->hdr_version, - hdr->hdr_cur_offs)); - - return 0; -} - -static int -default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_entry_t *ent; - void *cur, *last; - unsigned long *e, entry_size; - unsigned int npmds, i; - unsigned char ovfl_pmd; - unsigned char ovfl_notify; - - if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) { - DPRINT(("[%d] 
invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); - return -EINVAL; - } - - hdr = (pfm_default_smpl_hdr_t *)buf; - cur = buf+hdr->hdr_cur_offs; - last = buf+hdr->hdr_buf_size; - ovfl_pmd = arg->ovfl_pmd; - ovfl_notify = arg->ovfl_notify; - - /* - * precheck for sanity - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - npmds = hweight64(arg->smpl_pmds[0]); - - ent = (pfm_default_smpl_entry_t *)cur; - - prefetch(arg->smpl_pmds_values); - - entry_size = sizeof(*ent) + (npmds << 3); - - /* position for first pmd */ - e = (unsigned long *)(ent+1); - - hdr->hdr_count++; - - DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", - task->pid, - hdr->hdr_count, - cur, last, - last-cur, - ovfl_pmd, - ovfl_notify, npmds)); - - /* - * current = task running at the time of the overflow. - * - * per-task mode: - * - this is ususally the task being monitored. - * Under certain conditions, it might be a different task - * - * system-wide: - * - this is not necessarily the task controlling the session - */ - ent->pid = current->pid; - ent->ovfl_pmd = ovfl_pmd; - ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val; - - /* - * where did the fault happen (includes slot number) - */ - ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); - - ent->tstamp = stamp; - ent->cpu = smp_processor_id(); - ent->set = arg->active_set; - ent->tgid = current->tgid; - - /* - * selectively store PMDs in increasing index number - */ - if (npmds) { - unsigned long *val = arg->smpl_pmds_values; - for(i=0; i < npmds; i++) { - *e++ = *val++; - } - } - - /* - * update position for next entry - */ - hdr->hdr_cur_offs += entry_size; - cur += entry_size; - - /* - * post check to avoid losing the last sample - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - /* - * keep same ovfl_pmds, ovfl_notify - */ - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 0; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ - - return 0; -full: - DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); - - /* - * increment number of buffer overflow. - * important to detect duplicate set of samples. 
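
Each call to default_handler() above appends one pfm_default_smpl_entry_t followed by the recorded PMD values (npmds * 8 bytes) and advances hdr_cur_offs, so a consumer has to walk the buffer with the same stride. The sketch below does that from user space; the structure and field names come from the default-format user header, the include path is an assumption, and npmds must match the number of bits the tool set in smpl_pmds[0] when it programmed the counter (the buffer mapping itself is obtained at context creation and is not shown).

    /*
     * Hedged sketch: walking a default-format sampling buffer in user space.
     * The header path and exact field types are assumptions taken from the
     * old default-sampling user header.
     */
    #include <stdio.h>
    #include <perfmon/perfmon_default_smpl.h>

    void walk_buffer(void *buf, unsigned int npmds)
    {
        pfm_default_smpl_hdr_t *hdr = buf;
        unsigned char          *pos = (unsigned char *)(hdr + 1);
        unsigned long i, j;

        printf("%lu samples, %lu buffer overflows\n",
               (unsigned long)hdr->hdr_count,
               (unsigned long)hdr->hdr_overflows);

        for (i = 0; i < hdr->hdr_count; i++) {
            pfm_default_smpl_entry_t *ent  = (pfm_default_smpl_entry_t *)pos;
            unsigned long            *pmds = (unsigned long *)(ent + 1);

            printf("pid=%d cpu=%u ip=0x%lx",
                   (int)ent->pid, (unsigned)ent->cpu, (unsigned long)ent->ip);
            for (j = 0; j < npmds; j++)
                printf(" pmd[%lu]=0x%lx", j, pmds[j]);
            printf("\n");

            pos += sizeof(*ent) + (npmds << 3);   /* same stride as the handler */
        }
    }
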
- */ - hdr->hdr_overflows++; - - /* - * if no notification requested, then we saturate the buffer - */ - if (ovfl_notify == 0) { - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; - } else { - arg->ovfl_ctrl.bits.notify_user = 1; - arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ - } - return -1; /* we are full, sorry */ -} - -static int -default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - pfm_default_smpl_hdr_t *hdr; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_count = 0UL; - hdr->hdr_cur_offs = sizeof(*hdr); - - ctrl->bits.mask_monitoring = 0; - ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ - - return 0; -} - -static int -default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) -{ - DPRINT(("[%d] exit(%p)\n", task->pid, buf)); - return 0; -} - -static pfm_buffer_fmt_t default_fmt={ - .fmt_name = "default_format", - .fmt_uuid = PFM_DEFAULT_SMPL_UUID, - .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), - .fmt_validate = default_validate, - .fmt_getsize = default_get_size, - .fmt_init = default_init, - .fmt_handler = default_handler, - .fmt_restart = default_restart, - .fmt_restart_active = default_restart, - .fmt_exit = default_exit, -}; - -static int __init -pfm_default_smpl_init_module(void) -{ - int ret; - - ret = pfm_register_buffer_fmt(&default_fmt); - if (ret == 0) { - printk("perfmon_default_smpl: %s v%u.%u registered\n", - default_fmt.fmt_name, - PFM_DEFAULT_SMPL_VERSION_MAJ, - PFM_DEFAULT_SMPL_VERSION_MIN); - } else { - printk("perfmon_default_smpl: %s cannot register ret=%d\n", - default_fmt.fmt_name, - ret); - } - - return ret; -} - -static void __exit -pfm_default_smpl_cleanup_module(void) -{ - int ret; - ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); - - printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); -} - -module_init(pfm_default_smpl_init_module); -module_exit(pfm_default_smpl_cleanup_module); - Index: linux-2.6/arch/ia64/kernel/perfmon_generic.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_generic.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file contains the generic PMU register description tables - * and pmc checker used by perfmon.c. 
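
A context is bound to this format at creation time: the tool copies the format UUID into the pfarg_context_t and appends the format-specific pfm_default_smpl_arg_t, which is what default_validate() and default_get_size() above inspect, directly behind it. That appended argument is exactly the variable-size case sys_perfmonctl() handles with its getsize/restart_args pass. The field and header names below follow the old user headers and should be read as assumptions rather than a definitive API:

    /*
     * Hedged sketch: creating a context bound to the default sampling format.
     * Layout assumption: the pfm_default_smpl_arg_t immediately follows the
     * pfarg_context_t in the buffer handed to PFM_CREATE_CONTEXT.
     */
    #include <string.h>
    #include <perfmon/perfmon.h>
    #include <perfmon/perfmon_default_smpl.h>

    static struct {
        pfarg_context_t        ctx;
        pfm_default_smpl_arg_t smpl;
    } req;

    int create_sampling_context(void)
    {
        pfm_uuid_t uuid = PFM_DEFAULT_SMPL_UUID;

        memset(&req, 0, sizeof(req));
        memcpy(req.ctx.ctx_smpl_buf_id, uuid, sizeof(uuid));  /* pick the format */
        req.smpl.buf_size = 64 * 1024;                        /* sampling buffer */

        if (perfmonctl(0, PFM_CREATE_CONTEXT, &req, 1) < 0)
            return -1;
        return req.ctx.ctx_fd;        /* fd of the new context */
    }
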
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ - -static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_gen={ - .pmu_name = "Generic", - .pmu_family = 0xff, /* any */ - .ovfl_val = (1UL << 32) - 1, - .num_ibrs = 0, /* does not use */ - .num_dbrs = 0, /* does not use */ - .pmd_desc = pfm_gen_pmd_desc, - .pmc_desc = pfm_gen_pmc_desc -}; - Index: linux-2.6/arch/ia64/kernel/perfmon_itanium.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_itanium.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * This file contains the Itanium PMU register description tables - * and pmc checker used by perfmon.c. 
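In these register description tables the two trailing arrays record the dependencies between control and data registers: for a PMC, which PMDs it gates, and for a PMD, which PMC drives it. RDEP(n) is simply bit n of such a mask; in the old perfmon.c it is defined as (1UL << (n)), so in the generic table pmc4 pairs with pmd4, pmc5 with pmd5, and so on. A small sketch of building and reading one of these masks follows, with that macro definition taken as an assumption rather than quoted from a header.

#include <stdint.h>
#include <stdio.h>

#define RDEP(x) (1UL << (x))    /* assumed to match the old perfmon.c definition */

int main(void)
{
        /* e.g. a monitor PMC that gates the event-address PMDs 8..16 */
        uint64_t dep_pmd = RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|
                           RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16);
        unsigned int i;

        for (i = 0; i < 64; i++)
                if ((dep_pmd >> i) & 1)
                        printf("pmd%u is gated by this pmc\n", i);
        return 0;
}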
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, 
{RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static int -pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret; - int is_loaded; - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the (instruction) debug registers if pmc13.ta bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs); - if (ret) return ret; - } - - /* - * we must clear the (data) debug registers if pmc11.pt bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs); - if (ret) return ret; - } - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_ita={ - .pmu_name = "Itanium", - .pmu_family = 0x7, - .ovfl_val = (1UL << 32) - 1, - .pmd_desc = pfm_ita_pmd_desc, - .pmc_desc = pfm_ita_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1, /* debug register are use for range retrictions */ -}; - - Index: linux-2.6/arch/ia64/kernel/perfmon_mckinley.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_mckinley.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * This file contains the McKinley PMU register description tables - * and pmc checker used by perfmon.c. 
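The Itanium checker deleted above only inspects two single-bit fields: pmc13.ta in bit 0 and pmc11.pt in bit 28. When either bit is written as 0 on a loaded context that has not yet claimed the debug registers, the corresponding instruction or data break-point registers are cleared first so no stale state can leak into the range restrictions. A tiny stand-alone sketch of those two bit tests:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* on Itanium (Merced): pmc13.ta is bit 0, pmc11.pt is bit 28 */
static bool pmc13_requires_ibr_clear(uint64_t val) { return (val & 0x1) == 0; }
static bool pmc11_requires_dbr_clear(uint64_t val) { return ((val >> 28) & 0x1) == 0; }

int main(void)
{
        uint64_t pmc11 = 0x0000000010000000UL;  /* pt set: data break-points untouched */
        uint64_t pmc13 = 0x0003ffff00000000UL;  /* ta cleared: ibrs must be cleared */

        printf("clear ibrs=%d clear dbrs=%d\n",
               pmc13_requires_ibr_clear(pmc13), pmc11_requires_dbr_clear(pmc11));
        return 0;
}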
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 
0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0, check_case1 = 0; - unsigned long val8 = 0, val14 = 0, val13 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mck_reserved(cnum, val, regs); - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc13 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC13 is "active" if: - * one of the pmc13.cfg_dbrpXX field is different from 0x3 - * AND - * at the corresponding pmc13.ena_dbrpXX is set. 
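pfm_mck_reserved() above is the usual reserved-field merge: the caller's value survives only in the writable positions given by PMC_RSVD_MASK(), and every other bit is forced back to the power-up default from PMC_DFL_VAL(). A self-contained sketch of the same merge, with the mask and default passed in directly:

#include <stdint.h>
#include <stdio.h>

/* keep user bits where writable, restore the power-up default in reserved positions */
static uint64_t merge_reserved(uint64_t user_val, uint64_t writable_mask, uint64_t dfl_val)
{
        return (user_val & writable_mask) | (dfl_val & ~writable_mask);
}

int main(void)
{
        /* pmc15 in the McKinley table: default 0xfffffff0, only the low 4 bits writable */
        uint64_t val = merge_reserved(0x7, 0xfUL, 0x00000000fffffff0UL);

        printf("written value: 0x%llx\n", (unsigned long long)val);  /* 0xfffffff7 */
        return 0;
}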
- */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 13 && is_loaded - && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled - * before they are (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - - switch(cnum) { - case 4: *val |= 1UL << 23; /* force power enable bit */ - break; - case 8: val8 = *val; - val13 = ctx->ctx_pmcs[13]; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 13: val8 = ctx->ctx_pmcs[8]; - val13 = *val; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 14: val8 = ctx->ctx_pmcs[8]; - val13 = ctx->ctx_pmcs[13]; - val14 = *val; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = ((val13 >> 45) & 0xf) == 0 - && ((val8 & 0x1) == 0) - && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) - ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); - - if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); - } - - return ret ? -EINVAL : 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_mck={ - .pmu_name = "Itanium 2", - .pmu_family = 0x1f, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mck_pmd_desc, - .pmc_desc = pfm_mck_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range restrictions */ -}; - - Index: linux-2.6/arch/ia64/kernel/perfmon_montecito.h =================================================================== --- linux-2.6.orig/arch/ia64/kernel/perfmon_montecito.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * This file contains the Montecito PMU register description tables - * and pmc checker used by perfmon.c. - * - * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
- * Contributed by Stephane Eranian - */ -static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\ - RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63)) -#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36)) -#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35)) - -static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}}, -/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}}, -/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}}, -/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}}, -/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}}, -/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}}, -/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}}, -/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}}, -/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}}, -/* pmc16 */ { PFM_REG_NOTIMPL, }, -/* pmc17 */ { PFM_REG_NOTIMPL, }, -/* pmc18 */ { PFM_REG_NOTIMPL, }, -/* pmc19 */ { PFM_REG_NOTIMPL, }, -/* pmc20 */ { PFM_REG_NOTIMPL, }, -/* pmc21 */ { PFM_REG_NOTIMPL, }, -/* pmc22 */ { PFM_REG_NOTIMPL, }, -/* pmc23 */ { PFM_REG_NOTIMPL, }, -/* pmc24 */ { PFM_REG_NOTIMPL, }, -/* pmc25 */ { PFM_REG_NOTIMPL, }, -/* pmc26 */ { PFM_REG_NOTIMPL, }, -/* pmc27 */ { PFM_REG_NOTIMPL, }, -/* pmc28 */ { PFM_REG_NOTIMPL, }, -/* pmc29 */ { PFM_REG_NOTIMPL, }, -/* pmc30 */ { PFM_REG_NOTIMPL, }, -/* pmc31 */ { PFM_REG_NOTIMPL, }, -/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}}, -/* pmc38 */ { 
PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, -/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}}, -/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL, }, -/* pmd1 */ { PFM_REG_NOTIMPL, }, -/* pmd2 */ { PFM_REG_NOTIMPL, }, -/* pmd3 */ { PFM_REG_NOTIMPL, }, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}}, -/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, -/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}}, -/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}}, -/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}}, -/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}}, -/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}}, -/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}}, -/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}}, -/* pmd16 */ { PFM_REG_NOTIMPL, }, -/* pmd17 */ { PFM_REG_NOTIMPL, }, -/* pmd18 */ { PFM_REG_NOTIMPL, }, -/* pmd19 */ { PFM_REG_NOTIMPL, }, -/* pmd20 */ { PFM_REG_NOTIMPL, }, -/* pmd21 */ { PFM_REG_NOTIMPL, }, -/* pmd22 */ { PFM_REG_NOTIMPL, }, -/* pmd23 */ { PFM_REG_NOTIMPL, }, -/* pmd24 */ { PFM_REG_NOTIMPL, }, -/* pmd25 */ { PFM_REG_NOTIMPL, }, -/* pmd26 */ { PFM_REG_NOTIMPL, }, -/* pmd27 */ { PFM_REG_NOTIMPL, }, -/* pmd28 */ { PFM_REG_NOTIMPL, }, -/* pmd29 */ { PFM_REG_NOTIMPL, }, -/* pmd30 */ { PFM_REG_NOTIMPL, }, -/* pmd31 */ { PFM_REG_NOTIMPL, }, -/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd37 */ { PFM_REG_NOTIMPL, }, -/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd40 */ { PFM_REG_NOTIMPL, }, -/* pmd41 */ { PFM_REG_NOTIMPL, }, -/* pmd42 */ { PFM_REG_NOTIMPL, }, -/* pmd43 */ { PFM_REG_NOTIMPL, }, -/* pmd44 */ { PFM_REG_NOTIMPL, }, -/* pmd45 */ { PFM_REG_NOTIMPL, }, -/* pmd46 */ { PFM_REG_NOTIMPL, }, -/* pmd47 */ { PFM_REG_NOTIMPL, }, -/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, 
{RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0; - unsigned long val32 = 0, val38 = 0, val41 = 0; - unsigned long tmpval; - int check_case1 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mont_reserved(cnum, val, regs); - - tmpval = *val; - - /* sanity check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc41 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC41 is "active" if: - * one of the pmc41.cfg_dtagXX field is different from 0x3 - * AND - * at the corresponding pmc41.en_dbrpXX is set. 
- * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 41 && is_loaded - && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers if: - * AND - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if: - * pmc38.ig_ibrpX is 0 (enabled) - * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - switch(cnum) { - case 32: val32 = *val; - val38 = ctx->ctx_pmcs[38]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 38: val38 = *val; - val32 = ctx->ctx_pmcs[32]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 41: val41 = *val; - val32 = ctx->ctx_pmcs[32]; - val38 = ctx->ctx_pmcs[38]; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) - && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) - || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); - if (ret) { - DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32)); - return -EINVAL; - } - } - *val = tmpval; - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
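The instruction-side test above for Montecito looks at pmc38: the ig_ibrpX bits sit at positions 1, 4, 7 and 10 (mask 0x492), and an ig bit written as 0 means the corresponding instruction range constraint is enabled, so the instruction break-point registers must be cleared first unless the context already owns them. A small sketch of that test:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* pmc38.ig_ibrpX bits at positions 1, 4, 7, 10; all set means ibr constraints ignored */
#define MONT_IG_IBRP_MASK 0x492UL

static bool pmc38_requires_ibr_clear(uint64_t pmc38)
{
        return (pmc38 & MONT_IG_IBRP_MASK) != MONT_IG_IBRP_MASK;
}

int main(void)
{
        printf("%d\n", pmc38_requires_ibr_clear(0xdb6));  /* table default: no clear needed */
        printf("%d\n", pmc38_requires_ibr_clear(0xdb4));  /* ig_ibrp0 cleared: clear ibrs */
        return 0;
}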
- */ -static pmu_config_t pmu_conf_mont={ - .pmu_name = "Montecito", - .pmu_family = 0x20, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mont_pmd_desc, - .pmc_desc = pfm_mont_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range retrictions */ -}; Index: linux-2.6/arch/ia64/kernel/process.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/process.c +++ linux-2.6/arch/ia64/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -44,10 +45,6 @@ #include "entry.h" -#ifdef CONFIG_PERFMON -# include -#endif - #include "sigframe.h" void (*ia64_mark_idle)(int); @@ -164,32 +161,23 @@ do_notify_resume_user (sigset_t *unused, return; } -#ifdef CONFIG_PERFMON - if (current->thread.pfm_needs_checking) - pfm_handle_work(); -#endif + if (test_thread_flag(TIF_PERFMON_WORK)) + pfm_handle_work(task_pt_regs(current)); /* deal with pending signal delivery */ if (test_thread_flag(TIF_SIGPENDING)||test_thread_flag(TIF_RESTORE_SIGMASK)) ia64_do_signal(scr, in_syscall); } -static int pal_halt = 1; static int can_do_pal_halt = 1; static int __init nohalt_setup(char * str) { - pal_halt = can_do_pal_halt = 0; + can_do_pal_halt = 0; return 1; } __setup("nohalt", nohalt_setup); -void -update_pal_halt_status(int status) -{ - can_do_pal_halt = pal_halt && status; -} - /* * We use this if we don't have any better idle routine.. */ @@ -198,10 +186,34 @@ default_idle (void) { local_irq_enable(); while (!need_resched()) { +#ifdef CONFIG_PERFMON + u64 psr = 0; + /* + * If requested, we stop the PMU to avoid + * measuring across the core idle loop. + * + * dcr.pp is not modified on purpose + * it is used when coming out of + * safe_halt() via interrupt + */ + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + psr = ia64_getreg(_IA64_REG_PSR); + if (psr & IA64_PSR_PP) + ia64_rsm(IA64_PSR_PP); + } +#endif + if (can_do_pal_halt) safe_halt(); else cpu_relax(); +#ifdef CONFIG_PERFMON + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + if (psr & IA64_PSR_PP) + ia64_ssm(IA64_PSR_PP); + } +#endif + } } @@ -315,22 +327,9 @@ cpu_idle (void) void ia64_save_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_save_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_save_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 0); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_save_state(task); @@ -340,22 +339,9 @@ ia64_save_extra (struct task_struct *tas void ia64_load_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_load_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_load_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 1); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_load_state(task); @@ -481,8 +467,7 @@ copy_thread (int nr, unsigned long clone * call behavior where scratch registers are preserved across * system calls (unless used by the system call itself). 
*/ -# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ - | IA64_THREAD_PM_VALID) +# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID) # define THREAD_FLAGS_TO_SET 0 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | THREAD_FLAGS_TO_SET); @@ -503,10 +488,8 @@ copy_thread (int nr, unsigned long clone } #endif -#ifdef CONFIG_PERFMON - if (current->thread.pfm_context) - pfm_inherit(p, child_ptregs); -#endif + pfm_copy_thread(p); + return retval; } @@ -743,15 +726,13 @@ exit_thread (void) { ia64_drop_fpu(current); -#ifdef CONFIG_PERFMON - /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); - - /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); -#endif + + /* if needed, stop monitoring and flush state to perfmon context */ + pfm_exit_thread(current); + + /* free debug register resources */ + pfm_release_dbregs(current); + if (IS_IA32_PROCESS(task_pt_regs(current))) ia32_drop_partial_page_list(current); } Index: linux-2.6/arch/ia64/kernel/ptrace.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/ptrace.c +++ linux-2.6/arch/ia64/kernel/ptrace.c @@ -25,9 +25,6 @@ #include #include #include -#ifdef CONFIG_PERFMON -#include -#endif #include "entry.h" Index: linux-2.6/arch/ia64/kernel/setup.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/setup.c +++ linux-2.6/arch/ia64/kernel/setup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -988,6 +989,8 @@ cpu_init (void) } platform_cpu_init(); pm_idle = default_idle; + + pfm_init_percpu(); } /* Index: linux-2.6/arch/ia64/kernel/smpboot.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/smpboot.c +++ linux-2.6/arch/ia64/kernel/smpboot.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -378,10 +379,6 @@ smp_callin (void) extern void ia64_init_itm(void); extern volatile int time_keeper_id; -#ifdef CONFIG_PERFMON - extern void pfm_init_percpu(void); -#endif - cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); itc_master = time_keeper_id; @@ -403,10 +400,6 @@ smp_callin (void) ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif - local_irq_enable(); if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { @@ -738,6 +731,7 @@ int __cpu_disable(void) fixup_irqs(); local_flush_tlb_all(); cpu_clear(cpu, cpu_callin_map); + pfm_cpu_disable(); return 0; } Index: linux-2.6/arch/ia64/kernel/sys_ia64.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/sys_ia64.c +++ linux-2.6/arch/ia64/kernel/sys_ia64.c @@ -284,3 +284,10 @@ sys_pciconfig_write (unsigned long bus, } #endif /* CONFIG_PCI */ + +#ifndef CONFIG_PERFMON +asmlinkage long sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + return -ENOSYS; +} +#endif Index: linux-2.6/arch/ia64/kernel/time.c =================================================================== --- linux-2.6.orig/arch/ia64/kernel/time.c +++ linux-2.6/arch/ia64/kernel/time.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,8 @@ timer_interrupt (int irq, void *dev_id) profile_tick(CPU_PROFILING); + pfm_handle_switch_timeout(); 
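Note how the CONFIG_PERFMON conditionals disappear from exit_thread() and copy_thread() above: pfm_exit_thread(), pfm_release_dbregs() and pfm_copy_thread() are now called unconditionally, which presumes the perfmon header supplies empty inline stubs when the option is disabled. Below is a sketch of that conventional header pattern; the stub names follow the calls in the patch, but their exact placement in the headers is an assumption here.

/* header-style sketch: the hooks compile away when CONFIG_PERFMON is not set */
struct task_struct;                     /* opaque for this sketch */

#ifdef CONFIG_PERFMON
void pfm_exit_thread(struct task_struct *task);
void pfm_copy_thread(struct task_struct *task);
void pfm_release_dbregs(struct task_struct *task);
#else
static inline void pfm_exit_thread(struct task_struct *task)     { (void)task; }
static inline void pfm_copy_thread(struct task_struct *task)     { (void)task; }
static inline void pfm_release_dbregs(struct task_struct *task)  { (void)task; }
#endif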
+ while (1) { update_process_times(user_mode(get_irq_regs())); Index: linux-2.6/arch/ia64/lib/Makefile =================================================================== --- linux-2.6.orig/arch/ia64/lib/Makefile +++ linux-2.6/arch/ia64/lib/Makefile @@ -13,7 +13,6 @@ lib-y := __divsi3.o __udivsi3.o __modsi3 lib-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o lib-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o -lib-$(CONFIG_PERFMON) += carta_random.o AFLAGS___divdi3.o = AFLAGS___udivdi3.o = -DUNSIGNED Index: linux-2.6/arch/ia64/oprofile/init.c =================================================================== --- linux-2.6.orig/arch/ia64/oprofile/init.c +++ linux-2.6/arch/ia64/oprofile/init.c @@ -12,8 +12,8 @@ #include #include -extern int perfmon_init(struct oprofile_operations * ops); -extern void perfmon_exit(void); +extern int op_perfmon_init(struct oprofile_operations * ops); +extern void op_perfmon_exit(void); extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth); int __init oprofile_arch_init(struct oprofile_operations * ops) @@ -22,7 +22,7 @@ int __init oprofile_arch_init(struct opr #ifdef CONFIG_PERFMON /* perfmon_init() can fail, but we have no way to report it */ - ret = perfmon_init(ops); + ret = op_perfmon_init(ops); #endif ops->backtrace = ia64_backtrace; @@ -33,6 +33,6 @@ int __init oprofile_arch_init(struct opr void oprofile_arch_exit(void) { #ifdef CONFIG_PERFMON - perfmon_exit(); + op_perfmon_exit(); #endif } Index: linux-2.6/arch/ia64/oprofile/perfmon.c =================================================================== --- linux-2.6.orig/arch/ia64/oprofile/perfmon.c +++ linux-2.6/arch/ia64/oprofile/perfmon.c @@ -10,19 +10,21 @@ #include #include #include -#include +#include +#include #include #include static int allow_ints; static int -perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, - struct pt_regs *regs, unsigned long stamp) +perfmon_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data) { + struct pt_regs *regs = data; int event = arg->pmd_eventid; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; /* the owner of the oprofile event buffer may have exited * without perfmon being shutdown (e.g. 
SIGSEGV) @@ -45,17 +47,13 @@ static void perfmon_stop(void) allow_ints = 0; } - -#define OPROFILE_FMT_UUID { \ - 0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c } - -static pfm_buffer_fmt_t oprofile_fmt = { - .fmt_name = "oprofile_format", - .fmt_uuid = OPROFILE_FMT_UUID, - .fmt_handler = perfmon_handler, +static struct pfm_smpl_fmt oprofile_fmt = { + .fmt_name = "OProfile", + .fmt_handler = perfmon_handler, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE }; - static char * get_cpu_type(void) { __u8 family = local_cpu_data->family; @@ -75,25 +73,26 @@ static char * get_cpu_type(void) static int using_perfmon; -int perfmon_init(struct oprofile_operations * ops) +int __init op_perfmon_init(struct oprofile_operations * ops) { - int ret = pfm_register_buffer_fmt(&oprofile_fmt); + int ret = pfm_fmt_register(&oprofile_fmt); if (ret) return -ENODEV; ops->cpu_type = get_cpu_type(); ops->start = perfmon_start; ops->stop = perfmon_stop; + ops->implementation = "perfmon2"; using_perfmon = 1; printk(KERN_INFO "oprofile: using perfmon.\n"); return 0; } -void perfmon_exit(void) +void __exit op_perfmon_exit(void) { if (!using_perfmon) return; - pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid); + pfm_fmt_unregister(&oprofile_fmt); } Index: linux-2.6/arch/ia64/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/Kconfig @@ -0,0 +1,58 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config IA64_PERFMON_COMPAT + bool "Enable old perfmon-2 compatbility mode" + default n + depends PERFMON + help + Enable this option to allow performance tools which used the old + perfmon-2 interface to continue to work. Old tools are those using + the obsolete commands and arguments. Check your programs and look + in include/asm-ia64/perfmon_compat.h for more information. + +config IA64_PERFMON_GENERIC + tristate "Generic IA-64 PMU support" + depends PERFMON + default n + help + Enables generic IA-64 PMU support. + The generic PMU is defined by the IA-64 architecture document. + This option should only be necessary when running with a PMU that + is not yet explicitely supported. Even then, there is no guarantee + that this support will work. + +config IA64_PERFMON_ITANIUM + tristate "Itanium (Merced) Performance Monitoring support" + depends PERFMON + default n + help + Enables Itanium (Merced) PMU support. + +config IA64_PERFMON_MCKINLEY + tristate "Itanium 2 (McKinley) Performance Monitoring support" + depends PERFMON + default n + help + Enables Itanium 2 (McKinley, Madison, Deerfield) PMU support. + +config IA64_PERFMON_MONTECITO + tristate "Itanium 2 9000 (Montecito) Performance Monitoring support" + depends PERFMON + default n + help + Enables support for Itanium 2 9000 (Montecito) PMU. +endmenu Index: linux-2.6/arch/ia64/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_IA64_PERFMON_COMPAT) += perfmon_default_smpl.o \ + perfmon_compat.o +obj-$(CONFIG_IA64_PERFMON_GENERIC) += perfmon_generic.o +obj-$(CONFIG_IA64_PERFMON_ITANIUM) += perfmon_itanium.o +obj-$(CONFIG_IA64_PERFMON_MCKINLEY) += perfmon_mckinley.o +obj-$(CONFIG_IA64_PERFMON_MONTECITO) += perfmon_montecito.o Index: linux-2.6/arch/ia64/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon.c @@ -0,0 +1,951 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +struct pfm_arch_session { + u32 pfs_sys_use_dbr; /* syswide session uses dbr */ + u32 pfs_ptrace_use_dbr; /* a thread uses dbr via ptrace()*/ +}; + +static struct pfm_arch_session pfm_arch_sessions; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_arch_sessions_lock); + +static inline void pfm_clear_psr_pp(void) +{ + ia64_rsm(IA64_PSR_PP); +} + +static inline void pfm_set_psr_pp(void) +{ + ia64_ssm(IA64_PSR_PP); +} + +static inline void pfm_clear_psr_up(void) +{ + ia64_rsm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_up(void) +{ + ia64_ssm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_l(u64 val) +{ + ia64_setreg(_IA64_REG_PSR_L, val); +} + +static inline void pfm_restore_ibrs(u64 *ibrs, unsigned int nibrs) +{ + unsigned int i; + + for (i = 0; i < nibrs; i++) { + ia64_set_ibr(i, ibrs[i]); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); +} + +static inline void pfm_restore_dbrs(u64 *dbrs, unsigned int ndbrs) +{ + unsigned int i; + + for (i = 0; i < ndbrs; i++) { + ia64_set_dbr(i, dbrs[i]); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); +} + +irqreturn_t pmu_interrupt_handler(int irq, void *arg) +{ + struct pt_regs *regs; + regs = get_irq_regs(); + irq_enter(); + pfm_interrupt_handler(instruction_pointer(regs), regs); + irq_exit(); + return IRQ_HANDLED; +} +static struct irqaction perfmon_irqaction = { + .handler = pmu_interrupt_handler, + .flags = IRQF_DISABLED, /* means keep interrupts masked */ + .name = "perfmon" +}; + +void pfm_arch_quiesce_pmu_percpu(void) +{ + u64 dcr; + /* + * make sure no measurement is active + * (may inherit programmed PMCs from EFI). 
+ */ + pfm_clear_psr_pp(); + pfm_clear_psr_up(); + + /* + * ensure dcr.pp is cleared + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + /* + * we run with the PMU not frozen at all times + */ + ia64_set_pmc(0, 0); + ia64_srlz_d(); +} + +void pfm_arch_init_percpu(void) +{ + pfm_arch_quiesce_pmu_percpu(); + /* + * program PMU interrupt vector + */ + ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); + ia64_srlz_d(); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->flags.use_dbr = 0; + ctx_arch->flags.insecure = (ctx_flags & PFM_ITA_FL_INSECURE) ? 1: 0; + + PFM_DBG("insecure=%d", ctx_arch->flags.insecure); + + return 0; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 psr, tmp; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * save current PSR: needed because we modify it + */ + ia64_srlz_d(); + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * stop monitoring: + * This is the last instruction which may generate an overflow + * + * we do not clear ipsr.up + */ + pfm_clear_psr_up(); + ia64_srlz_d(); + + /* + * extract overflow status bits + */ + tmp = ia64_get_pmc(0) & ~0xf; + + /* + * keep a copy of psr.up (for reload) + */ + ctx_arch->ctx_saved_psr_up = psr & IA64_PSR_UP; + + /* + * save overflow status bits + */ + set->povfl_pmds[0] = tmp; + + /* + * record how many pending overflows + * XXX: assume identity mapping for counters + */ + set->npend_ovfls = ia64_popcnt(tmp); + + /* + * make sure the PMU is unfrozen for the next task + */ + if (set->npend_ovfls) { + ia64_set_pmc(0, 0); + ia64_srlz_d(); + } + return 1; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * Caller has already restored all PMD and PMC registers. 
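On switch-out the code above first stops counting through psr.up, then reads PMC0, masks off the four low non-counter bits, records the result as the pending-overflow bitmask for the set and counts the set bits with ia64_popcnt(). A portable sketch of that bookkeeping step:

#include <stdint.h>
#include <stdio.h>

struct saved_set {
        uint64_t povfl_pmds;            /* counters that overflowed while running */
        unsigned int npend_ovfls;       /* how many of them */
};

/* pmc0 carries one overflow bit per counter; bits 0-3 are not counter bits */
static void record_pending_overflows(struct saved_set *set, uint64_t pmc0)
{
        uint64_t ovfl = pmc0 & ~0xfUL;

        set->povfl_pmds = ovfl;
        set->npend_ovfls = __builtin_popcountll(ovfl);  /* ia64_popcnt() equivalent */
}

int main(void)
{
        struct saved_set set;

        record_pending_overflows(&set, (1UL << 4) | (1UL << 7));  /* pmd4 and pmd7 overflowed */
        printf("pending=%u mask=0x%llx\n", set.npend_ovfls,
               (unsigned long long)set.povfl_pmds);
        return 0;
}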
+ * + * must reactivate monitoring + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * when monitoring is not explicitly started + * then psr_up = 0, in which case we do not + * need to restore + */ + if (likely(ctx_arch->ctx_saved_psr_up)) { + pfm_set_psr_up(); + ia64_srlz_d(); + } +} + +int pfm_arch_reserve_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) { + PFM_DBG("syswide context uses dbregs"); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + } + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +void pfm_arch_release_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) { + pfm_arch_sessions.pfs_sys_use_dbr--; + } + spin_unlock(&pfm_arch_sessions_lock); +} + +/* + * function called from pfm_load_context_*(). Task is not guaranteed to be + * current task. If not then other task is guaranteed stopped and off any CPU. + * context is locked and interrupts are masked. + * + * On PFM_LOAD_CONTEXT, the interface guarantees monitoring is stopped. + * + * For system-wide task is NULL + */ +int pfm_arch_load_context(struct pfm_context *ctx, struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * cannot load a context which is using range restrictions, + * into a thread that is being debugged. + * + * if one set out of several is using the debug registers, then + * we assume the context as whole is using them. + */ + if (ctx_arch->flags.use_dbr) { + if (ctx->flags.system) { + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + PFM_DBG("pfs_sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); + } + spin_unlock(&pfm_arch_sessions_lock); + + } else if (task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("load_pid [%d] thread is debugged, cannot " + "use range restrictions", task->pid); + ret = -EBUSY; + } + if (ret) + return ret; + } + + /* + * We need to intervene on context switch to toggle the + * psr.pp bit in system-wide. 
As such, we set the TIF + * flag so that pfm_arch_ctxswout_sys() and the + * pfm_arch_ctxswin_sys() functions get called + * from pfm_ctxsw_sys(); + */ + if (ctx->flags.system) { + set_thread_flag(TIF_PERFMON_CTXSW); + PFM_DBG("[%d] set TIF", current->pid); + return 0; + } + + regs = task_pt_regs(task); + + /* + * self-monitoring systematically allows user level control + */ + if (task != current) { + /* + * when not current, task is stopped, so this is safe + */ + ctx_arch->ctx_saved_psr_up = 0; + ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; + } else + ctx_arch->flags.insecure = 1; + + /* + * allow user level control (start/stop/read pmd) if: + * - self-monitoring + * - requested at context creation (PFM_IA64_FL_INSECURE) + * + * There is not security hole with PFM_IA64_FL_INSECURE because + * when not self-monitored, the caller must have permissions to + * attached to the task. + */ + if (ctx_arch->flags.insecure) { + ia64_psr(regs)->sp = 0; + PFM_DBG("clearing psr.sp for [%d]", task->pid); + } + return 0; +} + +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) +#define PFM_ITA_SETFL_BOTH_INTR (PFM_ITA_SETFL_INTR_ONLY|\ + PFM_ITA_SETFL_EXCL_INTR) + +/* exclude return value field */ +#define PFM_SETFL_ALL_MASK ( PFM_ITA_SETFL_BOTH_INTR \ + | PFM_SETFL_BOTH_SWITCH \ + | PFM_ITA_SETFL_IDLE_EXCL) + + if ((flags & ~PFM_SETFL_ALL_MASK)) { + PFM_DBG("invalid flags=0x%x", flags); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_BOTH_INTR) == PFM_ITA_SETFL_BOTH_INTR) { + PFM_DBG("both excl intr and ontr only are set"); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_IDLE_EXCL) && !ctx->flags.system) { + PFM_DBG("idle exclude flag only for system-wide context"); + return -EINVAL; + } + return 0; +} + +/* + * function called from pfm_unload_context_*(). Context is locked. + * interrupts are masked. task is not guaranteed to be current task. + * Access to PMU is not guaranteed. + * + * function must do whatever arch-specific action is required on unload + * of a context. + * + * called for both system-wide and per-thread. task is NULL for ssytem-wide + */ +int pfm_arch_unload_context(struct pfm_context *ctx, struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx->flags.system) { + /* + * disable context switch hook + */ + clear_thread_flag(TIF_PERFMON_CTXSW); + + if (ctx_arch->flags.use_dbr) { + spin_lock(&pfm_arch_sessions_lock); + pfm_arch_sessions.pfs_sys_use_dbr--; + PFM_DBG("sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); + spin_unlock(&pfm_arch_sessions_lock); + } + return 0; + } + + regs = task_pt_regs(task); + + /* + * cancel user level control for per-task context + */ + ia64_psr(regs)->sp = 1; + PFM_DBG("setting psr.sp for [%d]", task->pid); + return 0; +} + +/* + * mask monitoring by setting the privilege level to 0 + * we cannot use psr.pp/psr.up for this, it is controlled by + * the user + */ +void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + unsigned long mask; + unsigned int i; + + /* + * as an optimization we look at the first 64 PMC + * registers only starting at PMC4. 
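pfm_arch_setfl_sane() above accepts only the known flag bits, refuses the combination of interrupt-only and exclude-interrupt for the same event set, and restricts the idle-exclude flag to system-wide contexts. A stand-alone sketch of that style of validation; the bit values below are stand-ins, the real PFM_SETFL_* and PFM_ITA_SETFL_* constants come from the perfmon headers.

#include <stdio.h>

/* stand-in bit values; the real constants come from the perfmon headers */
#define SETFL_OVFL_SWITCH   0x01
#define SETFL_TIME_SWITCH   0x02
#define SETFL_INTR_ONLY     0x04
#define SETFL_EXCL_INTR     0x08
#define SETFL_IDLE_EXCL     0x10

#define SETFL_BOTH_INTR     (SETFL_INTR_ONLY | SETFL_EXCL_INTR)
#define SETFL_ALL_MASK      (SETFL_BOTH_INTR | SETFL_OVFL_SWITCH | \
                             SETFL_TIME_SWITCH | SETFL_IDLE_EXCL)

static int setfl_sane(unsigned int flags, int system_wide)
{
        if (flags & ~SETFL_ALL_MASK)
                return -1;              /* unknown bits */
        if ((flags & SETFL_BOTH_INTR) == SETFL_BOTH_INTR)
                return -1;              /* intr-only and excl-intr conflict */
        if ((flags & SETFL_IDLE_EXCL) && !system_wide)
                return -1;              /* idle-exclude is system-wide only */
        return 0;
}

int main(void)
{
        printf("%d\n", setfl_sane(SETFL_INTR_ONLY | SETFL_EXCL_INTR, 1));  /* -1 */
        printf("%d\n", setfl_sane(SETFL_TIME_SWITCH, 0));                  /*  0 */
        return 0;
}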
+ */ + mask = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + for(i = PFM_ITA_FCNTR; mask; i++, mask >>=1) { + if (likely(mask & 0x1)) + ia64_set_pmc(i, set->pmcs[i] & ~0xfUL); + } + /* + * make changes visisble + */ + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + unsigned long *mask; + u16 i, num; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.insecure) { + num = pfm_pmu_conf->regs.num_rw_pmd; + mask = pfm_pmu_conf->regs.rw_pmds; + } else { + num = set->nused_pmds; + mask = set->used_pmds; + } + PFM_DBG("num=%u mask=0x%lx", num, mask[0]); + /* + * must restore all implemented read-write PMDS to avoid leaking + * information especially when PFM_IA64_FL_INSECURE is set. + * + * XXX: should check PFM_IA64_FL_INSECURE==0 and use used_pmd instead + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, mask))) { + pfm_arch_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set if needed + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 mask2 = 0, val, plm; + unsigned long impl_mask, mask_pmcs; + unsigned int i; + + /* + * as an optimization we only look at the first 64 + * PMC registers. In fact, we should never scan the + * entire impl_pmcs because ibr/dbr are implemented + * separately. + * + * always skip PMC0-PMC3. PMC0 taken care of when saving + * state. PMC1-PMC3 not used until we get counters in + * the 60 and above index range. + */ + impl_mask = pfm_pmu_conf->regs.pmcs[0] >> PFM_ITA_FCNTR; + mask_pmcs = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + plm = ctx->state == PFM_CTX_MASKED ? ~0xf : ~0x0; + + for (i = PFM_ITA_FCNTR; + impl_mask; + i++, impl_mask >>=1, mask_pmcs >>=1) { + if (likely(impl_mask & 0x1)) { + mask2 = mask_pmcs & 0x1 ? plm : ~0; + val = set->pmcs[i] & mask2; + ia64_set_pmc(i, val); + PFM_DBG_ovfl("pmc%u=0x%lx", i, val); + } + } + /* + * restore DBR/IBR + */ + if (set->priv_flags & PFM_ITA_SETFL_USE_DBR) { + pfm_restore_ibrs(set->pmcs+256, 8); + pfm_restore_dbrs(set->pmcs+264, 8); + } + ia64_srlz_d(); +} + +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 psr; + int is_system; + + is_system = ctx->flags.system; + + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * monitoring is masked via the PMC.plm + * + * As we restore their value, we do not want each counter to + * restart right away. We stop monitoring using the PSR, + * restore the PMC (and PMD) and then re-establish the psr + * as it was. Note that there can be no pending overflow at + * this point, because monitoring is still MASKED. + * + * Because interrupts are masked we can avoid changing + * DCR.pp. 
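While the context sits in the MASKED state, pfm_arch_restore_pmcs() above strips the low four privilege-level (plm) bits from every PMC that arch_info->mask_pmcs marks as maskable, so the registers are reloaded without letting the counters run until monitoring is unmasked again. A sketch of the value that actually gets written back:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* value written to a PMC on restore: plm bits (0-3) are cleared while masked */
static uint64_t pmc_restore_value(uint64_t saved, bool ctx_masked, bool maskable)
{
        uint64_t plm_mask = ctx_masked ? ~0xfUL : ~0x0UL;
        uint64_t mask = maskable ? plm_mask : ~0x0UL;

        return saved & mask;
}

int main(void)
{
        uint64_t pmc4 = 0x2000000UL | 0x5UL;   /* event selection plus plm bits in the low nibble */

        printf("running: 0x%llx  masked: 0x%llx\n",
               (unsigned long long)pmc_restore_value(pmc4, false, true),
               (unsigned long long)pmc_restore_value(pmc4, true, true));
        return 0;
}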
+ */ + if (is_system) + pfm_clear_psr_pp(); + else + pfm_clear_psr_up(); + + ia64_srlz_d(); + + pfm_arch_restore_pmcs(ctx, set); + + /* + * restore psr + * + * monitoring may start right now but interrupts + * are still masked + */ + pfm_set_psr_l(psr); + ia64_srlz_d(); +} + +/* + * Called from pfm_stop() + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, psr; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + + if (!ctx->flags.system) { + /* + * in ZOMBIE state we always have task == current due to + * pfm_exit_thread() + */ + ia64_psr(regs)->up = 0; + ctx_arch->ctx_saved_psr_up = 0; + + /* + * in case of ZOMBIE state, there is no unload to clear + * insecure monitoring, so we do it in stop instead. + */ + if (ctx->state == PFM_CTX_ZOMBIE) + ia64_psr(regs)->sp = 1; + + if (task == current) { + pfm_clear_psr_up(); + ia64_srlz_d(); + } + } else if (ctx->flags.started) { /* do not stop twice */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + psr = ia64_getreg(_IA64_REG_PSR); + + ia64_psr(regs)->pp = 0; + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + pfm_clear_psr_pp(); + ia64_srlz_d(); + + if (set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("disabling idle exclude"); + __get_cpu_var(pfm_syst_info) &= ~PFM_ITA_CPUINFO_IDLE_EXCL; + } + } +} + +/* + * called from pfm_start() + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
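+ *
+ * [Editor's summary of the system-wide cases handled below:]
+ *
+ *	set flag                     dcr.pp  psr.pp  counting happens ...
+ *	PFM_ITA_SETFL_INTR_ONLY        1       0     only while handling interruptions
+ *	PFM_ITA_SETFL_EXCL_INTR        0       1     everywhere except interruption handlers
+ *	(neither flag)                 1       1     everywhere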
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, dcr_pp, psr_pp; + u32 flags; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + flags = set->flags; + + /* + * take care of per-thread mode + */ + if (!ctx->flags.system) { + + ia64_psr(regs)->up = 1; + + if (task == current) { + pfm_set_psr_up(); + ia64_srlz_d(); + } else { + /* + * start monitoring at the kernel level the next + * time the task is scheduled + */ + ctx_arch->ctx_saved_psr_up = IA64_PSR_UP; + } + return; + } + + /* + * take care of system-wide mode + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + if (flags & PFM_ITA_SETFL_INTR_ONLY) { + dcr_pp = 1; + psr_pp = 0; + } else if (flags & PFM_ITA_SETFL_EXCL_INTR) { + dcr_pp = 0; + psr_pp = 1; + } else { + dcr_pp = psr_pp = 1; + } + PFM_DBG("dcr_pp=%lu psr_pp=%lu", dcr_pp, psr_pp); + + /* + * update dcr_pp and psr_pp + */ + if (dcr_pp) + ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP); + else + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + if (psr_pp) { + pfm_set_psr_pp(); + ia64_psr(regs)->pp = 1; + } else { + pfm_clear_psr_pp(); + ia64_psr(regs)->pp = 0; + } + ia64_srlz_d(); + + if (set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("enable idle exclude"); + __get_cpu_var(pfm_syst_info) |= PFM_ITA_CPUINFO_IDLE_EXCL; + } +} + +/* + * Only call this function when a process is trying to + * write the debug registers (reading is always allowed) + * called from arch/ia64/kernel/ptrace.c:access_uarea() + */ +int __pfm_use_dbregs(struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_context *ctx; + unsigned long flags; + int ret = 0; + + PFM_DBG("called for [%d]", task->pid); + + ctx = task->pfm_context; + + /* + * do it only once + */ + if (task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("IA64_THREAD_DBG_VALID already set"); + return 0; + } + if (ctx) { + spin_lock_irqsave(&ctx->lock, flags); + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr == 1) { + PFM_DBG("PMU using dbregs already, no ptrace access"); + ret = -1; + } + spin_unlock_irqrestore(&ctx->lock, flags); + if (ret) + return ret; + } + + spin_lock(&pfm_arch_sessions_lock); + + /* + * We cannot allow setting breakpoints when system wide monitoring + * sessions are using the debug registers. + */ + if (!pfm_arch_sessions.pfs_sys_use_dbr) + pfm_arch_sessions.pfs_ptrace_use_dbr++; + else + ret = -1; + + PFM_DBG("ptrace_use_dbr=%u sys_use_dbr=%u by [%d] ret = %d", + pfm_arch_sessions.pfs_ptrace_use_dbr, + pfm_arch_sessions.pfs_sys_use_dbr, + task->pid, ret); + + spin_unlock(&pfm_arch_sessions_lock); + if (ret) + return ret; +#ifndef CONFIG_SMP + /* + * in UP, we need to check whether the current + * owner of the PMU is not using the debug registers + * for monitoring. Because we are using a lazy + * save on ctxswout, we must force a save in this + * case because the debug registers are being + * modified by another task. We save the current + * PMD registers, and clear ownership. In ctxswin, + * full state will be reloaded. + * + * Note: we overwrite task. + */ + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + if (task == NULL) + return 0; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr) + pfm_save_pmds_release(ctx); +#endif + return 0; +} + +/* + * This function is called for every task that exits with the + * IA64_THREAD_DBG_VALID set. 
This indicates a task which was + * able to use the debug registers for debugging purposes via + * ptrace(). Therefore we know it was not using them for + * perfmormance monitoring, so we only decrement the number + * of "ptraced" debug register users to keep the count up to date + */ +int __pfm_release_dbregs(struct task_struct *task) +{ + int ret; + + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr == 0) { + PFM_ERR("invalid release for [%d] ptrace_use_dbr=0", task->pid); + ret = -1; + } else { + pfm_arch_sessions.pfs_ptrace_use_dbr--; + ret = 0; + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct task_struct *task; + struct thread_struct *thread; + int ret = 0, state; + int i, can_access_pmu = 0; + int is_loaded, is_system; + + ctx_arch = pfm_ctx_arch(ctx); + state = ctx->state; + task = ctx->task; + is_loaded = state == PFM_CTX_LOADED || state == PFM_CTX_MASKED; + is_system = ctx->flags.system; + can_access_pmu = __get_cpu_var(pmu_owner) == task || is_system; + + if (is_loaded == 0) + goto done; + + if (is_system == 0) { + thread = &(task->thread); + + /* + * cannot use debug registers for montioring if they are + * already used for debugging + */ + if (thread->flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("debug registers already in use for [%d]", + task->pid); + return -EBUSY; + } + } + + /* + * check for debug registers in system wide mode + */ + spin_lock(&pfm_arch_sessions_lock); + + if (is_system) { + if (pfm_arch_sessions.pfs_ptrace_use_dbr) + ret = -EBUSY; + else + pfm_arch_sessions.pfs_sys_use_dbr++; + } + + spin_unlock(&pfm_arch_sessions_lock); + + if (ret != 0) + return ret; + + /* + * clear hardware registers to make sure we don't + * pick up stale state. + */ + if (can_access_pmu) { + PFM_DBG("clearing ibrs, dbrs"); + for (i = 0; i < 8; i++) { + ia64_set_ibr(i, 0); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); + for (i = 0; i < 8; i++) { + ia64_set_dbr(i, 0); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); + } +done: + /* + * debug registers are now in use + */ + ctx_arch->flags.use_dbr = 1; + set->priv_flags |= PFM_ITA_SETFL_USE_DBR; + PFM_DBG("set%u use_dbr=1", set->id); + return 0; +} +EXPORT_SYMBOL(pfm_ia64_mark_dbregs_used); + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(local_cpu_data->family) { + case 0x07: + return "perfmon_itanium"; + case 0x1f: + return "perfmon_mckinley"; + case 0x20: + return "perfmon_montecito"; + default: + return "perfmon_generic"; + } + return NULL; +} + +/* + * global arch-specific intialization, called only once + */ +int __init pfm_arch_init(void) +{ + int ret; + + spin_lock_init(&pfm_arch_sessions_lock); + +#ifdef CONFIG_IA64_PERFMON_COMPAT + ret = pfm_ia64_compat_init(); + if (ret) + return ret; +#endif + register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + + + return 0; +} Index: linux-2.6/arch/ia64/perfmon/perfmon_compat.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_compat.c @@ -0,0 +1,1166 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *st); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ld); + +/* + * function providing some help for backward compatiblity with old IA-64 + * applications. In the old model, certain attributes of a counter were + * passed via the PMC, now they are passed via the PMD. + */ +static int pfm_compat_update_pmd(struct pfm_context *ctx, u16 set_id, u16 cnum, + u32 rflags, + unsigned long *smpl_pmds, + unsigned long *reset_pmds, + u64 eventid) +{ + struct pfm_event_set *set; + int is_counting; + unsigned long *impl_pmds; + u32 flags = 0; + u16 max_pmd; + + impl_pmds = pfm_pmu_conf->regs.pmds; + max_pmd = pfm_pmu_conf->regs.max_pmd; + + /* + * given that we do not maintain PMC ->PMD dependencies + * we cannot figure out what to do in case PMCxx != PMDxx + */ + if (cnum > max_pmd) + return 0; + + /* + * assumes PMCxx controls PMDxx which is always true for counters + * on Itanium PMUs. 
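+ *
+ * [Editor's illustration -- not part of this patch; field and command
+ * names follow the legacy IA-64 perfmon interface.] With the old model
+ * the sampling attributes rode on the PMC write, e.g.:
+ *
+ *	struct pfarg_reg pc = {
+ *		.reg_num           = 4,		// PMC4 controls counter PMD4
+ *		.reg_value         = ...,	// event encoding
+ *		.reg_flags         = PFM_REGFL_OVFL_NOTIFY,
+ *		.reg_reset_pmds[0] = 1UL << 4,
+ *	};
+ *	perfmonctl(fd, PFM_WRITE_PMCS, &pc, 1);
+ *
+ * perfmon2 attaches those attributes to the PMD instead, so this helper
+ * copies them over to set->pmds[4] (flags, reset_pmds, smpl_pmds, eventid).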
+ */ + is_counting = pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64; + set = pfm_find_set(ctx, set_id, 0); + + if (is_counting) { + if (rflags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + if (rflags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + /* + * verify validity of smpl_pmds + */ + if (unlikely(bitmap_subset(smpl_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", + (unsigned long long)smpl_pmds[0], cnum); + return -EINVAL; + } + /* + * verify validity of reset_pmds + */ + if (unlikely(bitmap_subset(reset_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid reset_pmds=0x%lx for pmd%u", + reset_pmds[0], cnum); + return -EINVAL; + } + /* + * ensures that a PFM_READ_PMDS succeeds with a + * corresponding PFM_WRITE_PMDS + */ + __set_bit(cnum, set->used_pmds); + + } else if (rflags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { + PFM_DBG("cannot set ovfl_notify or random on pmd%u", cnum); + return -EINVAL; + } + + set->pmds[cnum].flags = flags; + + if (is_counting) { + bitmap_copy(set->pmds[cnum].reset_pmds, + reset_pmds, + max_pmd); + + bitmap_copy(set->pmds[cnum].smpl_pmds, + smpl_pmds, + max_pmd); + + set->pmds[cnum].eventid = eventid; + + /* + * update ovfl_notify + */ + if (rflags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, set->ovfl_notify); + else + __clear_bit(cnum, set->ovfl_notify); + + } + PFM_DBG("pmd%u flags=0x%x eventid=0x%lx r_pmds=0x%lx s_pmds=0x%lx", + cnum, flags, + eventid, + reset_pmds[0], + smpl_pmds[0]); + + return 0; +} + + +int __pfm_write_ibrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 256+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= ~PFM_REG_RETFL_MASK; + req->dbreg_flags |= pmc.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_ibrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_ibrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_dbrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 264+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= ~PFM_REG_RETFL_MASK; + 
req->dbreg_flags |= pmc.reg_flags; + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_dbrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_dbrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmcs_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmc req; + unsigned int i; + int ret, error_code; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_flags = 0; + req.reg_value = req_old->reg_value; + + ret = __pfm_write_pmcs(ctx, (void *)&req, 1); + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + + ret = pfm_compat_update_pmd(ctx, req_old->reg_set, + req_old->reg_num, + (u32)req_old->reg_flags, + req_old->reg_smpl_pmds, + req_old->reg_reset_pmds, + req_old->reg_smpl_eventid); + + error_code = ret ? 
PFM_REG_RETFL_EINVAL : 0; + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= error_code; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_pmcs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_value = req_old->reg_value; + req.reg_flags = req_old->reg_flags; + + req.reg_long_reset = req_old->reg_long_reset; + req.reg_short_reset = req_old->reg_short_reset; + req.reg_random_mask = req_old->reg_random_mask; + /* + * reg_random_seed is ignored since v2.3 + */ + + /* + * skip last_reset_val not used for writing + * skip smpl_pmds, reset_pmds, eventid, ovfl_swtch_cnt + * as set in pfm_write_pmcs_old. 
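+ *
+ * [Editor's summary:] one legacy pfarg_reg carries both PMC and PMD
+ * state. pfm_write_pmcs_old() forwards reg_num/reg_set/reg_value to
+ * __pfm_write_pmcs() and pushes the per-counter sampling attributes
+ * (ovfl_notify, random, smpl_pmds, reset_pmds, eventid) onto the PMD via
+ * pfm_compat_update_pmd(), so this function only forwards the remaining
+ * per-PMD values (reset values, random mask, overflow switch count)
+ * through the new pfarg_pmd.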
+ */ + req.reg_ovfl_switch_cnt = req_old->reg_ovfl_switch_cnt; + + ret = __pfm_write_pmds(ctx, (void *)&req, 1, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_write_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_read_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + + /* skip value not used for reading */ + req.reg_flags = req_old->reg_flags; + + /* skip short/long_reset not used for reading */ + /* skip last_reset_val not used for reading */ + /* skip ovfl_switch_cnt not used for reading */ + + ret = __pfm_read_pmds(ctx, (void *)&req, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + if (ret) + return ret; + + /* update fields */ + req_old->reg_value = req.reg_value; + + req_old->reg_last_reset_val = req.reg_last_reset_val; + req_old->reg_ovfl_switch_cnt = req.reg_ovfl_switch_cnt; + } + return 0; +} + +static long pfm_read_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (ret == 0) + ret = __pfm_read_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +/* + * OBSOLETE: use /proc/perfmon_map instead + */ +static long pfm_get_default_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfarg_reg *req = NULL; + void *fptr; + size_t sz; + int ret, i; + unsigned int cnum; + + if (count < 1) + return -EINVAL; + + /* + * ensure the pfm_pmu_conf does not disappear while 
+ * we use it + */ + ret = pfm_pmu_conf_get(1); + if (ret) + return ret; + + sz = count*sizeof(*ureq); + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + + for (i = 0; i < count; i++, req++) { + cnum = req->reg_num; + + if (i >= PFM_MAX_PMCS || + (pfm_pmu_conf->pmc_desc[cnum].type & PFM_REG_I) == 0) { + req->reg_flags = PFM_REG_RETFL_EINVAL; + break; + } + req->reg_value = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + req->reg_flags = 0; + + PFM_DBG("pmc[%u]=0x%lx", cnum, req->reg_value); + } + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + pfm_pmu_conf_put(); + + return ret; +} + +/* + * allocate a sampling buffer and remaps it into the user address space of + * the task. This is only in compatibility mode + * + * function called ONLY on current task + */ +int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, size_t rsize, + struct file *filp) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct pfm_arch_context *ctx_arch; + size_t size; + int ret; + extern struct vm_operations_struct pfm_buf_map_vm_ops; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * allocate buffer + map desc + */ + ret = pfm_smpl_buffer_alloc(ctx, rsize); + if (ret) + return ret; + + size = ctx->smpl_size; + + + /* allocate vma */ + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + PFM_DBG("Cannot allocate vma"); + goto error_kmem; + } + memset(vma, 0, sizeof(*vma)); + + /* + * partially initialize the vma for the sampling buffer + */ + vma->vm_mm = mm; + vma->vm_flags = VM_READ| VM_MAYREAD |VM_RESERVED; + vma->vm_page_prot = PAGE_READONLY; + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_file = filp; + vma->vm_private_data = ctx; + vma->vm_pgoff = 0; + + /* + * simulate effect of mmap() + */ + get_file(filp); + + /* + * Let's do the difficult operations next. + * + * now we atomically find some area in the address space and + * remap the buffer into it. + */ + down_write(¤t->mm->mmap_sem); + + /* find some free area in address space, must have mmap sem held */ + vma->vm_start = get_unmapped_area(NULL, 0, size, 0, + MAP_PRIVATE|MAP_ANONYMOUS); + if (vma->vm_start == 0) { + PFM_DBG("cannot find unmapped area of size %zu", size); + up_write(¤t->mm->mmap_sem); + goto error; + } + vma->vm_end = vma->vm_start + size; + + PFM_DBG("aligned_size=%zu mapped @0x%lx", size, vma->vm_start); + /* + * now insert the vma in the vm list for the process, must be + * done with mmap lock held + */ + insert_vm_struct(mm, vma); + + mm->total_vm += size >> PAGE_SHIFT; + + up_write(¤t->mm->mmap_sem); + + /* + * IMPORTANT: we do not issue the fput() + * because we want to increase the ref count + * on the descriptor to simulate what mmap() + * would do + */ + + /* + * used to propagate vaddr to syscall stub + */ + ctx_arch->ctx_smpl_vaddr = (void *)vma->vm_start; + + return 0; +error: + kmem_cache_free(vm_area_cachep, vma); +error_kmem: + pfm_release_buf_space(ctx, ctx->smpl_size); + vfree(ctx->smpl_addr); + return -ENOMEM; +} + +#define PFM_DEFAULT_SMPL_UUID { \ + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + +static pfm_uuid_t old_default_uuid = PFM_DEFAULT_SMPL_UUID; +static pfm_uuid_t null_uuid; + +/* + * function invoked in case, pfm_context_create fails + * at the last operation, copy_to_user. 
It needs to + * undo memory allocations and free the file descriptor + */ +static void pfm_undo_create_context_fd(int fd, struct pfm_context *ctx) +{ + struct files_struct *files = current->files; + struct file *file; + int fput_needed; + + file = fget_light(fd, &fput_needed); + /* + * there is no fd_uninstall(), so we do it + * here. put_unused_fd() does not remove the + * effect of fd_install(). + */ + + spin_lock(&files->file_lock); + files->fd_array[fd] = NULL; + spin_unlock(&files->file_lock); + + fput_light(file, fput_needed); + + /* + * decrement ref count and kill file + */ + put_filp(file); + + put_unused_fd(fd); + + pfm_context_free(ctx); +} + +static int pfm_get_smpl_arg_old(pfm_uuid_t uuid, void __user *fmt_uarg, + size_t usize, void **arg, + struct pfm_smpl_fmt **fmt) +{ + struct pfm_smpl_fmt *f; + void *addr = NULL; + size_t sz; + int ret; + + if (!memcmp(uuid, null_uuid, sizeof(pfm_uuid_t))) + return 0; + + if (memcmp(uuid, old_default_uuid, sizeof(pfm_uuid_t))) { + PFM_DBG("compatibility mode supports only default sampling format"); + return -EINVAL; + } + /* + * find fmt and increase refcount + */ + f = pfm_smpl_fmt_get("default-old"); + if (f == NULL) { + PFM_DBG("default-old buffer format not found"); + return -EINVAL; + } + + /* + * expected format argument size + */ + sz = f->fmt_arg_size; + + /* + * check user size matches expected size + * usize = -1 is for IA-64 backward compatibility + */ + ret = -EINVAL; + if (sz != usize && usize != -1) { + PFM_DBG("invalid arg size %zu, format expects %zu", + usize, sz); + goto error; + } + + ret = -ENOMEM; + addr = kmalloc(sz, GFP_KERNEL); + if (addr == NULL) + goto error; + + ret = -EFAULT; + if (copy_from_user(addr, fmt_uarg, sz)) + goto error; + + *arg = addr; + *fmt = f; + return 0; + +error: + kfree(addr); + pfm_smpl_fmt_put(f); + return ret; +} + +static long pfm_create_context_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *new_ctx; + struct pfm_arch_context *ctx_arch; + struct pfm_smpl_fmt *fmt = NULL; + struct pfarg_context req_old; + void __user *usmpl_arg; + void *smpl_arg = NULL; + struct pfarg_ctx req; + int ret; + + if (count != 1) + return -EINVAL; + + if (copy_from_user(&req_old, ureq, sizeof(req_old))) + return -EFAULT; + + memset(&req, 0, sizeof(req)); + + /* + * sampling format args are following pfarg_context + */ + usmpl_arg = ureq+sizeof(req_old); + + ret = pfm_get_smpl_arg_old(req_old.ctx_smpl_buf_id, usmpl_arg, -1, + &smpl_arg, &fmt); + if (ret) + return ret; + + req.ctx_flags = req_old.ctx_flags; + + /* + * returns file descriptor if >=0, or error code */ + ret = __pfm_create_context(&req, fmt, smpl_arg, PFM_COMPAT, &new_ctx); + if (ret >= 0) { + ctx_arch = pfm_ctx_arch(new_ctx); + req_old.ctx_fd = ret; + req_old.ctx_smpl_vaddr = ctx_arch->ctx_smpl_vaddr; + } + + if (copy_to_user(ureq, &req_old, sizeof(req_old))) { + pfm_undo_create_context_fd(req_old.ctx_fd, new_ctx); + ret = -EFAULT; + } + + kfree(smpl_arg); + + return ret; +} + +/* + * obsolete call: use /proc/perfmon + */ +static long pfm_get_features_old(int fd, void __user *arg, int count) +{ + struct pfarg_features req; + int ret = 0; + + if (count != 1) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + + req.ft_version = PFM_VERSION; + + if (copy_to_user(arg, &req, sizeof(req))) + ret = -EFAULT; + + return ret; +} + +static long pfm_debug_old(int fd, void __user *arg, int count) +{ + int m; + + if (count != 1) + return -EINVAL; + + if (get_user(m, (int __user*)arg)) + return -EFAULT; + + + pfm_controls.debug = m == 0 ? 
0 : 1; + + PFM_INFO("debugging %s (timing reset)", + pfm_controls.debug ? "on" : "off"); + + if (m == 0) + for_each_online_cpu(m) { + memset(&per_cpu(pfm_stats,m), 0, + sizeof(struct pfm_stats)); + } + return 0; +} + +static long pfm_unload_context_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_unload_context(fd); +} + +static long pfm_restart_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_restart(fd); +} + +static long pfm_stop_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_stop(fd); +} + +static long pfm_start_old(int fd, void __user *arg, int count) +{ + if (count > 1) + return -EINVAL; + + return sys_pfm_start(fd, arg); +} + +static long pfm_load_context_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + unsigned long flags; + struct pfarg_load req; + int ret, fput_needed; + + if (count != 1) + return -EINVAL; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + task = NULL; + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * in per-thread mode (not self-monitoring), get a reference + * on task to monitor. This must be done with interrupts enabled + * Upon succesful return, refcount on task is increased. + * + * fget_light() is protecting the context. + */ + if (!ctx->flags.system) { + if (req.load_pid != current->pid) { + ret = pfm_get_task(ctx, req.load_pid, &task); + if (ret) + goto error; + } else + task = current; + } + /* + * irqsave is required to avoid race in case context is already + * loaded or with switch timeout in the case of self-monitoring + */ + spin_lock_irqsave(&ctx->lock, flags); + + /* + * the new interface requires the desired CPU to be explicitely set + * in this field. 
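+ *
+ * [Editor's illustration -- not part of this patch; command names as in
+ * the legacy perfmon header.] A self-monitoring thread using the old
+ * interface attaches and starts with something like:
+ *
+ *	struct pfarg_load ld = { .load_pid = getpid() };
+ *	perfmonctl(fd, PFM_LOAD_CONTEXT, &ld, 1);
+ *	perfmonctl(fd, PFM_START, NULL, 0);
+ *
+ * For a system-wide session, load_pid is simply overwritten below with
+ * the CPU the caller happens to run on, which is what the new interface
+ * expects.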
the kernel then checks you are on the right CPU + */ + if (ctx->flags.system) + req.load_pid = smp_processor_id(); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_load_context(ctx, &req, task); + + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * in per-thread mode (not self-monitoring), we need + * to decrease refcount on task to monitor: + * - load successful: we have a reference to the task in ctx->task + * - load failed : undo the effect of pfm_get_task() + */ + if (task && task != current) + put_task_struct(task); + +error: + fput_light(filp, fput_needed); + return ret; +} + +/* + * perfmon command descriptions + */ +struct pfm_cmd_desc { + long (*cmd_func)(int fd, void __user *arg, int count); +}; + +/* + * functions MUST be listed in the increasing order of + * their index (see permfon.h) + */ +#define PFM_CMD(name) \ + { .cmd_func = name, \ + } +#define PFM_CMD_NONE \ + { .cmd_func = NULL \ + } + +static struct pfm_cmd_desc pfm_cmd_tab[]={ +/* 0 */PFM_CMD_NONE, +/* 1 */PFM_CMD(pfm_write_pmcs_old), +/* 2 */PFM_CMD(pfm_write_pmds_old), +/* 3 */PFM_CMD(pfm_read_pmds_old), +/* 4 */PFM_CMD(pfm_stop_old), +/* 5 */PFM_CMD(pfm_start_old), +/* 6 */PFM_CMD_NONE, +/* 7 */PFM_CMD_NONE, +/* 8 */PFM_CMD(pfm_create_context_old), +/* 9 */PFM_CMD_NONE, +/* 10 */PFM_CMD(pfm_restart_old), +/* 11 */PFM_CMD_NONE, +/* 12 */PFM_CMD(pfm_get_features_old), +/* 13 */PFM_CMD(pfm_debug_old), +/* 14 */PFM_CMD_NONE, +/* 15 */PFM_CMD(pfm_get_default_pmcs_old), +/* 16 */PFM_CMD(pfm_load_context_old), +/* 17 */PFM_CMD(pfm_unload_context_old), +/* 18 */PFM_CMD_NONE, +/* 19 */PFM_CMD_NONE, +/* 20 */PFM_CMD_NONE, +/* 21 */PFM_CMD_NONE, +/* 22 */PFM_CMD_NONE, +/* 23 */PFM_CMD_NONE, +/* 24 */PFM_CMD_NONE, +/* 25 */PFM_CMD_NONE, +/* 26 */PFM_CMD_NONE, +/* 27 */PFM_CMD_NONE, +/* 28 */PFM_CMD_NONE, +/* 29 */PFM_CMD_NONE, +/* 30 */PFM_CMD_NONE, +/* 31 */PFM_CMD_NONE, +/* 32 */PFM_CMD(pfm_write_ibrs_old), +/* 33 */PFM_CMD(pfm_write_dbrs_old), +}; +#define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab) + +/* + * system-call entry point (must return long) + */ +asmlinkage long sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + if (perfmon_disabled) + return -ENOSYS; + + if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT + || pfm_cmd_tab[cmd].cmd_func == NULL)) { + PFM_DBG("invalid cmd=%d", cmd); + return -EINVAL; + } + return (long)pfm_cmd_tab[cmd].cmd_func(fd, arg, count); +} + +/* + * legacy /proc/perfmon simplified interface (we only maintain the + * global information (no more per-cpu stats, use + * /sys/devices/system/cpu/cpuXX/perfmon + */ +static struct proc_dir_entry *perfmon_proc; + +static void *pfm_proc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return (void *)1; + + return NULL; +} + +static void *pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return pfm_proc_start(m, pos); +} + +static void pfm_proc_stop(struct seq_file *m, void *v) +{ +} + +/* + * this is a simplified version of the legacy /proc/perfmon. 
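+ *
+ * [Editor's note:] with this simplified version, reading /proc/perfmon
+ * produces just two lines of the form
+ *
+ *	perfmon version : <major>.<minor>
+ *	model : <session summary from pfm_sysfs_session_show()>
+ *
+ * (see pfm_proc_show_header() below); the per-CPU statistics moved to
+ * /sys/devices/system/cpu/cpuXX/perfmon.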
+ * We have retained ONLY the key information that tools are actually + * using + */ +static void pfm_proc_show_header(struct seq_file *m) +{ + char buf[128]; + + pfm_sysfs_session_show(buf, sizeof(buf), 3); + + seq_printf(m, "perfmon version : %u.%u\n", + PFM_VERSION_MAJ, PFM_VERSION_MIN); + + seq_printf(m, "model : %s", buf); +} + +static int pfm_proc_show(struct seq_file *m, void *v) +{ + pfm_proc_show_header(m); + return 0; +} + +struct seq_operations pfm_proc_seq_ops = { + .start = pfm_proc_start, + .next = pfm_proc_next, + .stop = pfm_proc_stop, + .show = pfm_proc_show +}; + +static int pfm_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pfm_proc_seq_ops); +} + + +static struct file_operations pfm_proc_fops = { + .open = pfm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * called from pfm_arch_init(), global initialization, called once + */ +int __init pfm_ia64_compat_init(void) +{ + /* + * create /proc/perfmon + */ + perfmon_proc = create_proc_entry("perfmon", S_IRUGO, NULL); + if (perfmon_proc == NULL) { + PFM_ERR("cannot create /proc entry, perfmon disabled"); + return -1; + } + perfmon_proc->proc_fops = &pfm_proc_fops; + return 0; +} Index: linux-2.6/arch/ia64/perfmon/perfmon_default_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_default_smpl.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the old default sampling buffer format + * for the Linux/ia64 perfmon-2 subsystem. This is for backward + * compatibility only. use the new default format in perfmon/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef MODULE +#define FMT_FLAGS 0 +#else +#define FMT_FLAGS PFM_FMTFL_IS_BUILTIN +#endif + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("perfmon old default sampling format"); +MODULE_LICENSE("GPL"); + +static int pfm_default_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_arg *arg = data; + size_t min_buf_size; + + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. 
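+ *
+ * [Editor's worked example:] with npmds == 4 extra PMDs recorded per
+ * sample, the caller must supply at least
+ *
+ *	sizeof(struct pfm_default_smpl_hdr)
+ *	  + sizeof(struct pfm_default_smpl_entry) + 4 * sizeof(u64)
+ *
+ * bytes, i.e. room for the buffer header plus one minimally sized entry.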
All PMD are manipulated as 64bit entities + */ + min_buf_size = sizeof(struct pfm_default_smpl_hdr) + + (sizeof(struct pfm_default_smpl_entry) + + (npmds*sizeof(u64))); + + PFM_DBG("validate flags=0x%x npmds=%u min_buf_size=%lu " + "buf_size=%lu CPU%d", flags, npmds, min_buf_size, + arg->buf_size, smp_processor_id()); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) return -EINVAL; + + return 0; +} + +static int pfm_default_fmt_get_size(unsigned int flags, void *data, + size_t *size) +{ + struct pfm_default_smpl_arg *arg = data; + + /* + * size has been validated in default_validate + */ + *size = arg->buf_size; + + return 0; +} + +static int pfm_default_fmt_init(struct pfm_context *ctx, void *buf, + u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_arg *arg = data; + + hdr = buf; + + hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0; + hdr->hdr_count = 0; + + PFM_DBG("buffer=%p buf_size=%lu hdr_size=%lu " + "hdr_version=%u cur_offs=%lu", + buf, + hdr->hdr_buf_size, + sizeof(*hdr), + hdr->hdr_version, + hdr->hdr_cur_offs); + + return 0; +} + +static int pfm_default_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_entry *ent; + void *cur, *last; + u64 *e; + size_t entry_size; + u16 npmds, i, ovfl_pmd; + + hdr = buf; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + + /* + * precheck for sanity + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + npmds = arg->num_smpl_pmds; + + ent = cur; + + prefetch(arg->smpl_pmds_values); + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (unsigned long *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%lu cur=%p last=%p free_bytes=%lu " + "ovfl_pmd=%d npmds=%u", + hdr->hdr_count, + cur, last, + last-cur, + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is ususally the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for(i=0; i < npmds; i++) { + *e++ = *val++; + } + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; + + /* + * reset before returning from interrupt handler + */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + return 0; +full: + PFM_DBG_ovfl("smpl buffer full free=%lu, count=%lu", + last-cur, hdr->hdr_count); + + /* + * increment number of buffer overflow. + * important to detect duplicate set of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. 
+ * Notification is still subject to the overflowed
+ * PMD having its ovfl_notify flag set.
+ */
+	arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
+
+	return -ENOBUFS; /* we are full, sorry */
+}
+
+static int pfm_default_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf)
+{
+	struct pfm_default_smpl_hdr *hdr;
+
+	hdr = buf;
+
+	hdr->hdr_count = 0;
+	hdr->hdr_cur_offs = sizeof(*hdr);
+
+	*ovfl_ctrl = PFM_OVFL_CTRL_RESET;
+
+	return 0;
+}
+
+static int pfm_default_fmt_exit(void *buf)
+{
+	return 0;
+}
+
+static struct pfm_smpl_fmt default_fmt = {
+	.fmt_name = "default-old",
+	.fmt_version = 0x10000,
+	.fmt_arg_size = sizeof(struct pfm_default_smpl_arg),
+	.fmt_validate = pfm_default_fmt_validate,
+	.fmt_getsize = pfm_default_fmt_get_size,
+	.fmt_init = pfm_default_fmt_init,
+	.fmt_handler = pfm_default_fmt_handler,
+	.fmt_restart = pfm_default_fmt_restart,
+	.fmt_exit = pfm_default_fmt_exit,
+	.fmt_flags = FMT_FLAGS,
+	.owner = THIS_MODULE
+};
+
+static int pfm_default_fmt_init_module(void)
+{
+	return pfm_fmt_register(&default_fmt);
+}
+
+static void pfm_default_fmt_cleanup_module(void)
+{
+	pfm_fmt_unregister(&default_fmt);
+}
+
+module_init(pfm_default_fmt_init_module);
+module_exit(pfm_default_fmt_cleanup_module);
Index: linux-2.6/arch/ia64/perfmon/perfmon_generic.c
===================================================================
--- /dev/null
+++ linux-2.6/arch/ia64/perfmon/perfmon_generic.c
@@ -0,0 +1,148 @@
+/*
+ * This file contains the generic PMU register description tables
+ * and pmc checker used by perfmon.c.
+ *
+ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Generic IA-64 PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_IA64GEN_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)) +#define PFM_IA64GEN_RSVD (0xffffffffffff0080UL) +#define PFM_IA64GEN_NO64 (1UL<<5) + +/* forward declaration */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf; + +static struct pfm_arch_pmu_info pfm_ia64gen_pmu_info={ + .mask_pmcs = {PFM_IA64GEN_MASK_PMCS,}, +}; + +static struct pfm_regmap_desc pfm_ia64gen_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 7) +}; +#define PFM_IA64GEN_NUM_PMCS ARRAY_SIZE(pfm_ia64gen_pmc_desc) + +static struct pfm_regmap_desc pfm_ia64gen_pmd_desc[]={ +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7) +}; +#define PFM_IA64GEN_NUM_PMDS ARRAY_SIZE(pfm_ia64gen_pmd_desc) + +static int pfm_ia64gen_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_IA64GEN_PMC_PM_POS6 (1UL<< 6) + u64 tmpval; + int is_system; + + is_system = ctx->flags.system; + tmpval = req->reg_value; + + switch(req->reg_num) { + case 4: + case 5: + case 6: + case 7: + /* set pmc.oi for 64-bit emulation */ + tmpval |= 1UL << 5; + + if (is_system) + tmpval |= PFM_IA64GEN_PMC_PM_POS6; + else + tmpval &= ~PFM_IA64GEN_PMC_PM_POS6; + break; + + } + req->reg_value = tmpval; + + return 0; +} + +/* + * matches anything + */ +static int pfm_ia64gen_probe_pmu(void) +{ + u64 pm_buffer[16]; + pal_perf_mon_info_u_t pm_info; + + /* + * call PAL_PERFMON_INFO to retrieve counter width which + * is implementation specific + */ + if (ia64_pal_perf_mon_info(pm_buffer, &pm_info)) + return -1; + + pfm_ia64gen_pmu_conf.counter_width = pm_info.pal_perf_mon_info_s.width; + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
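+ *
+ * [Editor's note:] "computed at runtime" means the perfmon core derives
+ * the implemented-register bitmasks when this description is registered
+ * (pfm_pmu_register()), by walking pmc_desc[]/pmd_desc[] and skipping
+ * the PMX_NA entries; for the tables above that yields PMC4-PMC7 and
+ * PMD4-PMD7, i.e. a mask of 0xf0 for both.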
+ */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf={ + .pmu_name = "Generic IA-64", + .counter_width = 0, /* computed from PAL_PERFMON_INFO */ + .pmd_desc = pfm_ia64gen_pmd_desc, + .pmc_desc = pfm_ia64gen_pmc_desc, + .probe_pmu = pfm_ia64gen_probe_pmu, + .num_pmc_entries = PFM_IA64GEN_NUM_PMCS, + .num_pmd_entries = PFM_IA64GEN_NUM_PMDS, + .pmc_write_check = pfm_ia64gen_pmc_check, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = & pfm_ia64gen_pmu_info + /* no read/write checkers */ +}; + +static int __init pfm_gen_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ia64gen_pmu_conf); +} + +static void __exit pfm_gen_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ia64gen_pmu_conf); +} + +module_init(pfm_gen_pmu_init_module); +module_exit(pfm_gen_pmu_cleanup_module); Index: linux-2.6/arch/ia64/perfmon/perfmon_itanium.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_itanium.c @@ -0,0 +1,229 @@ +/* + * This file contains the Itanium PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium (Merced) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1ULL << (x)) + +#define PFM_ITA_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_ITA_NO64 (1ULL<<5) + +static struct pfm_arch_pmu_info pfm_ita_pmu_info={ + .mask_pmcs = {PFM_ITA_MASK_PMCS,}, +}; +/* reserved bits are 1 in the mask */ +#define PFM_ITA_RSVD 0xfffffffffc8000a0UL +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
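+ *
+ * [Editor's illustration -- not part of this patch:] because the debug
+ * registers are folded into the PMC namespace, a legacy request to set,
+ * say, IBR2 is rewritten by the compat layer as a plain PMC write:
+ *
+ *	struct pfarg_pmc pc = {
+ *		.reg_num   = 256 + 2,		// IBR2
+ *		.reg_value = req->dbreg_value,
+ *		.reg_set   = req->dbreg_set,
+ *	};
+ *	__pfm_write_pmcs(ctx, &pc, 1);
+ *
+ * (see __pfm_write_ibrs_old(); data breakpoints use 264 + dbreg_num).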
+ */ +static struct pfm_regmap_desc pfm_ita_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xfffffffff3f0ff30UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x10000000UL, 0xffffffffecf0ff30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0030UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x3ffff00000001UL, 0xfffffffffffffffeUL, 0, 13), +/* pmc14 */ PMX_NA, +/* pmc15 */ PMX_NA, +/* pmc16 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ 
PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_ITA_NUM_PMCS ARRAY_SIZE(pfm_ita_pmc_desc) + +static struct pfm_regmap_desc pfm_ita_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_I , "PMD0", 0), +/* pmd1 */ PMD_D(PFM_REG_I , "PMD1", 1), +/* pmd2 */ PMD_D(PFM_REG_I , "PMD2", 2), +/* pmd3 */ PMD_D(PFM_REG_I , "PMD3", 3), +/* pmd4 */ PMD_D(PFM_REG_C , "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C , "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C , "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C , "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_I , "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_I , "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_I , "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_I , "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_I , "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_I , "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_I , "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_I , "PMD15", 15), +/* pmd16 */ PMD_D(PFM_REG_I , "PMD16", 16), +/* pmd17 */ PMD_D(PFM_REG_I , "PMD17", 17) +}; +#define PFM_ITA_NUM_PMDS ARRAY_SIZE(pfm_ita_pmd_desc) + +static int pfm_ita_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_ITA_PMC_PM_POS6 (1UL<< 6) + struct pfm_arch_context *ctx_arch; + u64 tmpval; + u16 cnum; + int ret = 0, is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 10: + case 11: + case 12: if (is_system) + tmpval |= PFM_ITA_PMC_PM_POS6; + else + tmpval &= ~PFM_ITA_PMC_PM_POS6; + break; + } + + /* + * we must clear the (instruction) debug registers if pmc13.ta bit is + * cleared before they are written (fl_using_dbreg==0) to avoid + * picking up stale information. + */ + if (cnum == 13 && ((tmpval & 0x1) == 0) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13 has pmc13.ta cleared, clearing ibr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + /* + * we must clear the (data) debug registers if pmc11.pt bit is cleared + * before they are written (fl_using_dbreg==0) to avoid picking up + * stale information. + */ + if (cnum == 11 && ((tmpval >> 28)& 0x1) == 0 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc11 has pmc11.pt cleared, clearing dbr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_ita_probe_pmu(void) +{ + return local_cpu_data->family == 0x7 && !ia64_platform_is("hpsim") + ? 0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
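+ *
+ * [Editor's note:] this description only registers when
+ * pfm_ita_probe_pmu() above reports an Itanium (cpu family 0x07) that is
+ * not the hpsim simulator; pfm_arch_get_pmu_module_name() in perfmon.c
+ * maps that same family value to "perfmon_itanium", which is how the
+ * core finds these tables when they are built as a module.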
+ */ +static struct pfm_pmu_config pfm_ita_pmu_conf={ + .pmu_name = "Itanium", + .counter_width = 32, + .pmd_desc = pfm_ita_pmd_desc, + .pmc_desc = pfm_ita_pmc_desc, + .pmc_write_check = pfm_ita_pmc_check, + .num_pmc_entries = PFM_ITA_NUM_PMCS, + .num_pmd_entries = PFM_ITA_NUM_PMDS, + .probe_pmu = pfm_ita_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_ita_pmu_info +}; + +static int __init pfm_ita_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ita_pmu_conf); +} + +static void __exit pfm_ita_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ita_pmu_conf); +} + +module_init(pfm_ita_pmu_init_module); +module_exit(pfm_ita_pmu_cleanup_module); + Index: linux-2.6/arch/ia64/perfmon/perfmon_mckinley.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_mckinley.c @@ -0,0 +1,285 @@ +/* + * This file contains the McKinley PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium 2 (McKinley) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MCK_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_MCK_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mck_pmu_info={ + .mask_pmcs = {PFM_MCK_MASK_PMCS,}, +}; + +/* reserved bits are 1 in the mask */ +#define PFM_ITA2_RSVD 0xfffffffffc8000a0UL + +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
+ */ +static struct pfm_regmap_desc pfm_mck_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x800020UL, 0xfffffffffc8000a0, PFM_MCK_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xffffffff3fffffffUL, 0xc0000004UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xffffffff3ffffffcUL, 0xc0000004UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xffffffffffff0000UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x0, 0xfffffffffcf0fe30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0000UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x2078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 13), +/* pmc14 */ PMC_D(PFM_REG_W , "PMC14", 0x0db60db60db60db6UL, 0xffffffffffffdb6dUL, 0, 14), +/* pmc15 */ PMC_D(PFM_REG_W , "PMC15", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 15), +/* pmc16 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ 
PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MCK_NUM_PMCS ARRAY_SIZE(pfm_mck_pmc_desc) + +static struct pfm_regmap_desc pfm_mck_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_I, "PMD0", 0), +/* pmd1 */ PMD_D(PFM_REG_I, "PMD1", 1), +/* pmd2 */ PMD_D(PFM_REG_I, "PMD2", 2), +/* pmd3 */ PMD_D(PFM_REG_I, "PMD3", 3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_I, "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_I, "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_I, "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_I, "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_I, "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_I, "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_I, "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_I, "PMD15", 15), +/* pmd16 */ PMD_D(PFM_REG_I, "PMD16", 16), +/* pmd17 */ PMD_D(PFM_REG_I, "PMD17", 17) +}; +#define PFM_MCK_NUM_PMDS ARRAY_SIZE(pfm_mck_pmd_desc) + +static int pfm_mck_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val8 = 0, val14 = 0, val13 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MCK_PMC_PM_POS6 (1UL<< 6) +#define PFM_MCK_PMC_PM_POS4 (1UL<< 4) + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 11: + case 12: if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS6; + else + tmpval &= ~PFM_MCK_PMC_PM_POS6; + break; + + case 8: val8 = tmpval; + val13 = set->pmcs[13]; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 10: if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS4; + else + tmpval &= ~PFM_MCK_PMC_PM_POS4; + break; + + case 13: + val8 = set->pmcs[8]; + val13 = tmpval; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 14: + val8 = set->pmcs[8]; + val13 = set->pmcs[13]; + val14 = tmpval; + check_case1 = 1; + break; + } + + /* + * check illegal configuration which can produce inconsistencies + * in tagging i-side events in L1D and L2 caches + */ + if (check_case1) { + ret = (((val13 >> 45) & 0xf) == 0 && ((val8 & 0x1) == 0)) + && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) + ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); + + if (ret) { + PFM_DBG("perfmon: invalid config pmc8=0x%lx " + "pmc13=0x%lx pmc14=0x%lx", + val8, val13, val14); + return -EINVAL; + } + } + + /* + * check if configuration implicitely activates the use of + * the debug registers. If true, then we ensure that this is + * possible and that we do not pick up stale value in the HW + * registers. 
+ * + * We postpone the checks of pmc13 and pmc14 to avoid side effects + * in case of errors + */ + + /* + * pmc13 is "active" if: + * one of the pmc13.cfg_dbrpXX fields is different from 0x3 + * AND + * the corresponding pmc13.ena_dbrpXX is set. + */ + if (cnum == 13 && (tmpval & 0x1e00000000000UL) + && (tmpval & 0x18181818UL) != 0x18181818UL + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + /* + * if any pmc14.ibrpX bit is enabled we must clear the ibrs + */ + if (cnum == 14 && ((tmpval & 0x2222UL) != 0x2222UL) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc14=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_mck_probe_pmu(void) +{ + return local_cpu_data->family == 0x1f ? 0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_mck_pmu_conf={ + .pmu_name = "Itanium 2", + .counter_width = 47, + .pmd_desc = pfm_mck_pmd_desc, + .pmc_desc = pfm_mck_pmc_desc, + .pmc_write_check = pfm_mck_pmc_check, + .num_pmc_entries = PFM_MCK_NUM_PMCS, + .num_pmd_entries = PFM_MCK_NUM_PMDS, + .probe_pmu = pfm_mck_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_mck_pmu_info, +}; + +static int __init pfm_mck_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mck_pmu_conf); +} + +static void __exit pfm_mck_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mck_pmu_conf); +} + +module_init(pfm_mck_pmu_init_module); +module_exit(pfm_mck_pmu_cleanup_module); Index: linux-2.6/arch/ia64/perfmon/perfmon_montecito.c =================================================================== --- /dev/null +++ linux-2.6/arch/ia64/perfmon/perfmon_montecito.c @@ -0,0 +1,404 @@ +/* + * This file contains the Montecito PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Dual-Core Itanium 2 (Montecito) PMU description table"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MONT_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|\ + RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|\ + RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|\ + RDEP(37)|RDEP(39)|RDEP(40)|RDEP(42)) + +#define PFM_MONT_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mont_pmu_info={ + .mask_pmcs = {PFM_MONT_MASK_PMCS,}, +}; + +#define PFM_MONT_RSVD 0xffffffff838000a0UL +/* + * + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1.
Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. + * + * For PMC4-PMC15, PMC40: we force pmc.ism=2 (IA-64 mode only) + */ +static struct pfm_regmap_desc pfm_mont_pmc_desc[]={ +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W64, "PMC8" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 8), +/* pmc9 */ PMC_D(PFM_REG_W64, "PMC9" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 9), +/* pmc10 */ PMC_D(PFM_REG_W64, "PMC10", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 10), +/* pmc11 */ PMC_D(PFM_REG_W64, "PMC11", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 11), +/* pmc12 */ PMC_D(PFM_REG_W64, "PMC12", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 12), +/* pmc13 */ PMC_D(PFM_REG_W64, "PMC13", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 13), +/* pmc14 */ PMC_D(PFM_REG_W64, "PMC14", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 14), +/* pmc15 */ PMC_D(PFM_REG_W64, "PMC15", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 15), +/* pmc16 */ PMX_NA, +/* pmc17 */ PMX_NA, +/* pmc18 */ PMX_NA, +/* pmc19 */ PMX_NA, +/* pmc20 */ PMX_NA, +/* pmc21 */ PMX_NA, +/* pmc22 */ PMX_NA, +/* pmc23 */ PMX_NA, +/* pmc24 */ PMX_NA, +/* pmc25 */ PMX_NA, +/* pmc26 */ PMX_NA, +/* pmc27 */ PMX_NA, +/* pmc28 */ PMX_NA, +/* pmc29 */ PMX_NA, +/* pmc30 */ PMX_NA, +/* pmc31 */ PMX_NA, +/* pmc32 */ PMC_D(PFM_REG_W , "PMC32", 0x30f01ffffffffffUL, 0xfcf0fe0000000000UL, 0, 32), +/* pmc33 */ PMC_D(PFM_REG_W , "PMC33", 0x0, 0xfffffe0000000000UL, 0, 33), +/* pmc34 */ PMC_D(PFM_REG_W , "PMC34", 0xf01ffffffffffUL, 0xfff0fe0000000000UL, 0, 34), +/* pmc35 */ PMC_D(PFM_REG_W , "PMC35", 0x0, 0x1ffffffffffUL, 0, 35), +/* pmc36 */ PMC_D(PFM_REG_W , "PMC36", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 36), +/* pmc37 */ PMC_D(PFM_REG_W , "PMC37", 0x0, 0xffffffffffffc000UL, 0, 37), +/* pmc38 */ PMC_D(PFM_REG_W , "PMC38", 0xdb6UL, 0xffffffffffffdb6dUL, 0, 38), +/* pmc39 */ PMC_D(PFM_REG_W , "PMC39", 0x0, 0xffffffffffff0030UL, 0, 39), +/* pmc40 */ PMC_D(PFM_REG_W , "PMC40", 0x2000000UL, 0xfffffffffff0fe30UL, 0, 40), +/* pmc41 */ PMC_D(PFM_REG_W , "PMC41", 0x00002078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 41), +/* pmc42 */ PMC_D(PFM_REG_W , "PMC42", 0x0, 0xfff800b0UL, 0, 42), +/* pmc43 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc144 */ PMX_NA, 
PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA,PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W, "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W, "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W, "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W, "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W, "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W, "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W, "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W, "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W, "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W, "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W, "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W, "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W, "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W, "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W, "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W, "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MONT_NUM_PMCS ARRAY_SIZE(pfm_mont_pmc_desc) + +static struct pfm_regmap_desc pfm_mont_pmd_desc[]={ +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_D(PFM_REG_C, "PMD4", 4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMD5", 5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMD6", 6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMD7", 7), +/* pmd8 */ PMD_D(PFM_REG_C, "PMD8", 8), +/* pmd9 */ PMD_D(PFM_REG_C, "PMD9", 9), +/* pmd10 */ PMD_D(PFM_REG_C, "PMD10", 10), +/* pmd11 */ PMD_D(PFM_REG_C, "PMD11", 11), +/* pmd12 */ PMD_D(PFM_REG_C, "PMD12", 12), +/* pmd13 */ PMD_D(PFM_REG_C, "PMD13", 13), +/* pmd14 */ PMD_D(PFM_REG_C, "PMD14", 14), +/* pmd15 */ PMD_D(PFM_REG_C, "PMD15", 15), +/* pmd16 */ PMX_NA, +/* pmd17 */ PMX_NA, +/* pmd18 */ PMX_NA, +/* pmd19 */ PMX_NA, +/* pmd20 */ PMX_NA, +/* pmd21 */ PMX_NA, +/* pmd22 */ PMX_NA, +/* pmd23 */ PMX_NA, +/* pmd24 */ PMX_NA, +/* pmd25 */ PMX_NA, +/* pmd26 */ PMX_NA, +/* pmd27 */ PMX_NA, +/* pmd28 */ PMX_NA, +/* pmd29 */ PMX_NA, +/* pmd30 */ PMX_NA, +/* pmd31 */ PMX_NA, +/* pmd32 */ PMD_D(PFM_REG_I, "PMD32", 32), +/* pmd33 */ PMD_D(PFM_REG_I, "PMD33", 33), +/* pmd34 */ PMD_D(PFM_REG_I, "PMD34", 34), +/* pmd35 */ PMD_D(PFM_REG_I, "PMD35", 35), +/* pmd36 */ PMD_D(PFM_REG_I, "PMD36", 36), +/* pmd37 */ PMX_NA, +/* pmd38 */ PMD_D(PFM_REG_I, "PMD38", 38), +/* pmd39 */ PMD_D(PFM_REG_I, "PMD39", 39), +/* pmd40 */ PMX_NA, +/* pmd41 */ PMX_NA, +/* pmd42 */ PMX_NA, +/* pmd43 */ PMX_NA, +/* pmd44 */ PMX_NA, +/* pmd45 */ PMX_NA, +/* pmd46 */ PMX_NA, +/* pmd47 */ PMX_NA, +/* pmd48 */ 
PMD_D(PFM_REG_I, "PMD48", 48), +/* pmd49 */ PMD_D(PFM_REG_I, "PMD49", 49), +/* pmd50 */ PMD_D(PFM_REG_I, "PMD50", 50), +/* pmd51 */ PMD_D(PFM_REG_I, "PMD51", 51), +/* pmd52 */ PMD_D(PFM_REG_I, "PMD52", 52), +/* pmd53 */ PMD_D(PFM_REG_I, "PMD53", 53), +/* pmd54 */ PMD_D(PFM_REG_I, "PMD54", 54), +/* pmd55 */ PMD_D(PFM_REG_I, "PMD55", 55), +/* pmd56 */ PMD_D(PFM_REG_I, "PMD56", 56), +/* pmd57 */ PMD_D(PFM_REG_I, "PMD57", 57), +/* pmd58 */ PMD_D(PFM_REG_I, "PMD58", 58), +/* pmd59 */ PMD_D(PFM_REG_I, "PMD59", 59), +/* pmd60 */ PMD_D(PFM_REG_I, "PMD60", 60), +/* pmd61 */ PMD_D(PFM_REG_I, "PMD61", 61), +/* pmd62 */ PMD_D(PFM_REG_I, "PMD62", 62), +/* pmd63 */ PMD_D(PFM_REG_I, "PMD63", 63) +}; +#define PFM_MONT_NUM_PMDS ARRAY_SIZE(pfm_mont_pmd_desc) + +static int pfm_mont_has_ht; + +static int pfm_mont_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val32 = 0, val38 = 0, val41 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MONT_PMC_PM_POS6 (1UL<<6) +#define PFM_MONT_PMC_PM_POS4 (1UL<<4) + + switch(cnum) { + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: if ((req->reg_flags & PFM_REGFL_NO_EMUL64) == 0) { + if (pfm_mont_has_ht) { + PFM_INFO("perfmon: Errata 121 PMD10/PMD15 cannot be used to overflow" + "when threads on on"); + return -EINVAL; + } + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 39: + case 40: + case 42: if (pfm_mont_has_ht && ((req->reg_value >> 8) & 0x7) == 4) { + PFM_INFO("perfmon: Errata 120: IP-EAR not available when threads are on"); + return -EINVAL; + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + + case 32: val32 = tmpval; + val38 = set->pmcs[38]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + + case 37: + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS4; + else + tmpval &= ~PFM_MONT_PMC_PM_POS4; + break; + + case 38: val38 = tmpval; + val32 = set->pmcs[32]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + case 41: val41 = tmpval; + val32 = set->pmcs[32]; + val38 = set->pmcs[38]; + check_case1 = 1; + break; + } + + if (check_case1) { + ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) + && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) + || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); + if (ret) { + PFM_DBG("perfmon: invalid config pmc38=0x%lx " + "pmc41=0x%lx pmc32=0x%lx", + val38, val41, val32); + return -EINVAL; + } + } + + /* + * check if configuration implicitely activates the use of the + * debug registers. If true, then we ensure that this is possible + * and that we do not pick up stale value in the HW registers. + */ + + /* + * + * pmc41 is "active" if: + * one of the pmc41.cfgdtagXX field is different from 0x3 + * AND + * the corsesponding pmc41.en_dbrpXX is set. 
+ * AND + * ctx_fl_use_dbr (dbr not yet used) + */ + if (cnum == 41 + && (tmpval & 0x1e00000000000) + && (tmpval & 0x18181818) != 0x18181818 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc41=0x%lx active, clearing dbr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + } + /* + * we must clear the (instruction) debug registers if: + * pmc38.ig_ibrpX is 0 (enabled) + * and + * fl_use_dbr == 0 (dbr not yet used) + */ + if (cnum == 38 && ((tmpval & 0x492) != 0x492) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc38=0x%lx active, clearing ibr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) return ret; + + } + req->reg_value = tmpval; + return 0; +} + +static void pfm_handle_errata(void) +{ + pfm_mont_has_ht = 1; + + PFM_INFO("activating workaround for errata 120 " + "(Disable IP-EAR when threads are on)"); + + PFM_INFO("activating workaround for errata 121 " + "(PMD10-PMD15 cannot be used to overflow" + " when threads are on)"); +} +static int pfm_mont_probe_pmu(void) +{ + if (local_cpu_data->family != 0x20) + return -1; + + /* + * the two errata must be activated when + * threads are/can be enabled + */ + if (is_multithreading_enabled()) + pfm_handle_errata(); + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_mont_pmu_conf={ + .pmu_name = "Montecito", + .counter_width = 47, + .pmd_desc = pfm_mont_pmd_desc, + .pmc_desc = pfm_mont_pmc_desc, + .num_pmc_entries = PFM_MONT_NUM_PMCS, + .num_pmd_entries = PFM_MONT_NUM_PMDS, + .pmc_write_check = pfm_mont_pmc_check, + .probe_pmu = pfm_mont_probe_pmu, + .version = "1.0", + .arch_info = &pfm_mont_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_mont_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mont_pmu_conf); +} + +static void __exit pfm_mont_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mont_pmu_conf); +} + +module_init(pfm_mont_pmu_init_module); +module_exit(pfm_mont_pmu_cleanup_module); Index: linux-2.6/arch/mips/Kconfig =================================================================== --- linux-2.6.orig/arch/mips/Kconfig +++ linux-2.6/arch/mips/Kconfig @@ -1775,6 +1775,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here.
+source "arch/mips/perfmon/Kconfig" + endmenu config RWSEM_GENERIC_SPINLOCK Index: linux-2.6/arch/mips/Makefile =================================================================== --- linux-2.6.orig/arch/mips/Makefile +++ linux-2.6/arch/mips/Makefile @@ -148,6 +148,12 @@ endif endif # +# Perfmon support +# + +core-$(CONFIG_PERFMON) += arch/mips/perfmon/ + +# # Firmware support # libs-$(CONFIG_ARC) += arch/mips/arc/ Index: linux-2.6/arch/mips/kernel/process.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/process.c +++ linux-2.6/arch/mips/kernel/process.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ void start_thread(struct pt_regs * regs, void exit_thread(void) { + pfm_exit_thread(current); } void flush_thread(void) @@ -164,6 +166,8 @@ int copy_thread(int nr, unsigned long cl if (clone_flags & CLONE_SETTLS) ti->tp_value = regs->regs[7]; + pfm_copy_thread(p); + return 0; } Index: linux-2.6/arch/mips/kernel/scall32-o32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall32-o32.S +++ linux-2.6/arch/mips/kernel/scall32-o32.S @@ -662,6 +662,18 @@ einval: li v0, -EINVAL sys sys_signalfd 3 sys sys_timerfd 4 sys sys_eventfd 1 + sys sys_pfm_create_context 4 /* 4320 */ + sys sys_pfm_write_pmcs 3 + sys sys_pfm_write_pmds 4 + sys sys_pfm_read_pmds 3 + sys sys_pfm_load_context 2 + sys sys_pfm_start 2 /* 4325 */ + sys sys_pfm_stop 1 + sys sys_pfm_restart 1 + sys sys_pfm_create_evtsets 3 + sys sys_pfm_getinfo_evtsets 3 + sys sys_pfm_delete_evtsets 3 /* 4326 */ + sys sys_pfm_unload_context 1 .endm /* We pre-compute the number of _instruction_ bytes needed to Index: linux-2.6/arch/mips/kernel/scall64-64.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-64.S +++ linux-2.6/arch/mips/kernel/scall64-64.S @@ -477,4 +477,16 @@ sys_call_table: PTR sys_signalfd PTR sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs /* 5280 */ + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop /* 5285 */ + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context /* 5290 */ .size sys_call_table,.-sys_call_table Index: linux-2.6/arch/mips/kernel/scall64-n32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-n32.S +++ linux-2.6/arch/mips/kernel/scall64-n32.S @@ -400,7 +400,19 @@ EXPORT(sysn32_call_table) PTR sys_ioprio_set PTR sys_ioprio_get PTR compat_sys_utimensat - PTR compat_sys_signalfd /* 5280 */ + PTR compat_sys_signalfd /* 6280 */ PTR compat_sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds /* 6285 */ + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop + PTR sys_pfm_restart /* 6290 */ + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context .size sysn32_call_table,.-sysn32_call_table Index: linux-2.6/arch/mips/kernel/scall64-o32.S =================================================================== --- linux-2.6.orig/arch/mips/kernel/scall64-o32.S +++ linux-2.6/arch/mips/kernel/scall64-o32.S @@ -525,4 +525,16 @@ sys_call_table: PTR compat_sys_signalfd PTR compat_sys_timerfd PTR sys_eventfd + PTR sys_pfm_create_context /* 
4320 */ + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start /* 4325 */ + PTR sys_pfm_stop + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets /* 4330 */ + PTR sys_pfm_unload_context .size sys_call_table,.-sys_call_table Index: linux-2.6/arch/mips/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/signal.c +++ linux-2.6/arch/mips/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -696,6 +697,9 @@ static void do_signal(struct pt_regs *re asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/mips/kernel/smp.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/smp.c +++ linux-2.6/arch/mips/kernel/smp.c @@ -203,6 +203,52 @@ void smp_call_function_interrupt(void) } } +int smp_call_function_single (int cpu, void (*func) (void *info), void *info, int retry, + int wait) +{ + struct call_data_struct data; + int me = smp_processor_id(); + + /* + * Can die spectacularly if this CPU isn't yet marked online + */ + BUG_ON(!cpu_online(me)); + if (cpu == me) { + WARN_ON(1); + return -EBUSY; + } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + + data.func = func; + data.info = info; + atomic_set(&data.started, 0); + data.wait = wait; + if (wait) + atomic_set(&data.finished, 0); + + spin_lock(&smp_call_lock); + call_data = &data; + mb(); + + /* Send a message to the other CPU */ + core_send_ipi(cpu, SMP_CALL_FUNCTION); + + /* Wait for response */ + /* FIXME: lock-up detection, backtrace on lock-up */ + while (atomic_read(&data.started) != 1) + barrier(); + + if (wait) + while (atomic_read(&data.finished) != 1) + barrier(); + call_data = NULL; + spin_unlock(&smp_call_lock); + + return 0; +} + static void stop_this_cpu(void *dummy) { /* Index: linux-2.6/arch/mips/kernel/time.c =================================================================== --- linux-2.6.orig/arch/mips/kernel/time.c +++ linux-2.6/arch/mips/kernel/time.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -140,6 +141,7 @@ static long last_rtc_update; void local_timer_interrupt(int irq, void *dev_id) { profile_tick(CPU_PROFILING); + pfm_handle_switch_timeout(); update_process_times(user_mode(get_irq_regs())); } Index: linux-2.6/arch/mips/mips-boards/generic/time.c =================================================================== --- linux-2.6.orig/arch/mips/mips-boards/generic/time.c +++ linux-2.6/arch/mips/mips-boards/generic/time.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include Index: linux-2.6/arch/mips/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/Kconfig @@ -0,0 +1,23 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. 
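As a quick orientation for the interface this option exposes, the sketch below shows roughly how user space is expected to drive the new system calls wired into the MIPS tables above. It is illustrative only: the syscall numbers follow the o32 table (4320 onwards, with 4326 inferred by position) and the argument layout is an assumption based on the per-call argument counts; real applications would normally go through libpfm rather than raw syscall(2).

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

/* assumed o32 numbering, see the scall32-o32.S hunk above */
#define __NR_pfm_create_context 4320
#define __NR_pfm_start          4325
#define __NR_pfm_stop           4326

static int pfm_self_monitor(void *ctx_arg)
{
	long fd;

	/* create a monitoring context; the return value is a file descriptor */
	fd = syscall(__NR_pfm_create_context, ctx_arg, NULL, NULL, 0UL);
	if (fd < 0)
		return -1;

	/* pfm_write_pmcs()/pfm_write_pmds()/pfm_load_context() would go here */

	syscall(__NR_pfm_start, fd, NULL);	/* start counting */
	/* ... workload under measurement ... */
	syscall(__NR_pfm_stop, fd);		/* stop counting */
	return (int)fd;
}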
+ +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_MIPS64 + tristate "Support for MIPS64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for the MIPS64 hardware performance counters" +endmenu Index: linux-2.6/arch/mips/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_MIPS64) += perfmon_mips64.o Index: linux-2.6/arch/mips/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/perfmon.c @@ -0,0 +1,299 @@ +/* + * This file implements the MIPS64 specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 Philip J. Mucci + * + * based on versions for other architectures: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * collect pending overflowed PMDs. Called from pfm_ctxsw() + * and from PMU interrupt handler. Must fill in set->povfl_pmds[] + * and set->npend_ovfls. Interrupts are masked + */ +static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 new_val, wmask; + u64 *used_mask, *cnt_pmds; + u64 mask[PFM_PMD_BV]; + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_cnt_pmd; + cnt_pmds = pfm_pmu_conf->regs.cnt_pmds; + used_mask = set->used_pmds; + wmask = 1ULL << pfm_pmu_conf->counter_width; + bitmap_and(cast_ulp(mask), + cast_ulp(cnt_pmds), + cast_ulp(used_mask),max); + + for (i = 0; i < max; i++) { + /* assume all PMD are counters */ + if (test_bit(i, mask)) { + new_val = pfm_arch_read_pmd(ctx, i); + + PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", + i, (unsigned long long)new_val, + (new_val&wmask) ? 1 : 0); + + if (new_val & wmask) { + __set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_pmc; + + /* + * clear enable bits + */ + for (i = 0; i < max; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i,0); + } + + if (set->npend_ovfls) + return; + + __pfm_get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
+ * + * for per-thread: + * must stop monitoring for the task + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * disable lazy restore of PMC registers. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + pfm_stop_active(task, ctx, set); + + return 1; +} + +/* + * Called from pfm_stop() and pfm_ctxsw() when idle + * task and EXCL_IDLE is on. + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, set); +} + +/* + * called from pfm_start() or pfm_ctxsw() when idle task and + * EXCL_IDLE is on. + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-trhead: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + if (task != current) + return; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 ovfl_mask, val; + u64 *impl_pmds; + unsigned int i; + unsigned int max_pmd; + + max_pmd = pfm_pmu_conf->regs.max_pmd; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = pfm_pmu_conf->regs.pmds; + + /* + * must restore all pmds to avoid leaking + * information to user. + */ + for (i = 0; i < max_pmd; i++) { + + if (test_bit(i, impl_pmds) == 0) + continue; + + val = set->pmds[i].value; + + /* + * set upper bits for counter to ensure + * overflow will trigger + */ + val &= ovfl_mask; + + pfm_arch_write_pmd(ctx, i, val); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. 
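+ * ("if needed" because a masked or never-started context keeps its counters disabled, so the PMC values can be written later, when monitoring is actually (re)enabled; see the early return below.)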
+ */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *impl_pmcs; + unsigned int i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * - by default no PMCS measures anything + * - on ctxswout, all used PMCs are disabled (cccr enable bit cleared) + * hence when masked we do not need to restore anything + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + switch(cpu_data->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return "perfmon_mips64"; + default: + return NULL; + } + return NULL; +} + +int perfmon_perf_irq(void) +{ + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + struct pt_regs *regs; + unsigned int counters = pfm_pmu_conf->regs.max_pmc; + unsigned int control; + unsigned int counter; + + regs = get_irq_regs(); + switch (counters) { +#define HANDLE_COUNTER(n) \ + case n + 1: \ + control = read_c0_perfctrl ## n(); \ + counter = read_c0_perfcntr ## n(); \ + if ((control & MIPS64_PMC_INT_ENABLE_MASK) && \ + (counter & MIPS64_PMD_INTERRUPT)) { \ + pfm_interrupt_handler(instruction_pointer(regs),\ + regs); \ + return(1); \ + } + HANDLE_COUNTER(3) + HANDLE_COUNTER(2) + HANDLE_COUNTER(1) + HANDLE_COUNTER(0) + } + + return 0; +} +EXPORT_SYMBOL(perfmon_perf_irq); Index: linux-2.6/arch/mips/perfmon/perfmon_mips64.c =================================================================== --- /dev/null +++ linux-2.6/arch/mips/perfmon/perfmon_mips64.c @@ -0,0 +1,187 @@ +/* + * This file contains the MIPS64 and decendent PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005 Philip Mucci + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("MIPS64 PMU description tables"); +MODULE_LICENSE("GPL"); + +/* + * reserved: + * - bit 63-9 + * RSVD: reserved bits must be 1 + */ +#define PFM_MIPS64_PMC_RSVD 0xfffffffffffffe10ULL +#define PFM_MIPS64_PMC_VAL (1ULL<<4) + +extern int null_perf_irq(struct pt_regs *regs); +extern int (*perf_irq)(struct pt_regs *regs); +extern int perfmon_perf_irq(struct pt_regs *regs); + +static struct pfm_arch_pmu_info pfm_mips64_pmu_info; + +static struct pfm_regmap_desc pfm_mips64_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "CP0_25_0", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 0), +/* pmc1 */ PMC_D(PFM_REG_I64, "CP0_25_1", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 1), +/* pmc2 */ PMC_D(PFM_REG_I64, "CP0_25_2", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 2), +/* pmc3 */ PMC_D(PFM_REG_I64, "CP0_25_3", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 3) +}; +#define PFM_MIPS64_NUM_PMCS ARRAY_SIZE(pfm_mips64_pmc_desc) + +static struct pfm_regmap_desc pfm_mips64_pmd_desc[]={ +/* pmd0 */ PMD_D(PFM_REG_C, "CP0_25_0", 0), +/* pmd1 */ PMD_D(PFM_REG_C, "CP0_25_1", 1), +/* pmd2 */ PMD_D(PFM_REG_C, "CP0_25_2", 2), +/* pmd3 */ PMD_D(PFM_REG_C, "CP0_25_3", 3) +}; +#define PFM_MIPS64_NUM_PMDS ARRAY_SIZE(pfm_mips64_pmd_desc) + +static int pfm_mips64_probe_pmu(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + + switch (c->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return 0; + break; + default: + PFM_INFO("Unknown cputype 0x%x",c->cputype); + } + return -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
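+ * In addition, pfm_mips64_pmu_init_module() below marks every counter beyond n_counters() as PFM_REG_NA, so the effective register set can be smaller than the four entries described above.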
+ */ +static struct pfm_pmu_config pfm_mips64_pmu_conf = { + .pmu_name = "MIPS", /* placeholder */ + .counter_width = 31, + .pmd_desc = pfm_mips64_pmd_desc, + .pmc_desc = pfm_mips64_pmc_desc, + .num_pmc_entries = PFM_MIPS64_NUM_PMCS, + .num_pmd_entries = PFM_MIPS64_NUM_PMDS, + .probe_pmu = pfm_mips64_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_mips64_pmu_info +}; + +static inline int n_counters(void) +{ + if (!(read_c0_config1() & MIPS64_CONFIG_PMC_MASK)) + return 0; + if (!(read_c0_perfctrl0() & MIPS64_PMC_CTR_MASK)) + return 1; + if (!(read_c0_perfctrl1() & MIPS64_PMC_CTR_MASK)) + return 2; + if (!(read_c0_perfctrl2() & MIPS64_PMC_CTR_MASK)) + return 3; + return 4; +} + +static int __init pfm_mips64_pmu_init_module(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + int i, ret, num; + + switch (c->cputype) { + case CPU_5KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS5KC"; + break; + case CPU_20KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS20KC"; + break; + case CPU_24K: + pfm_mips64_pmu_conf.pmu_name = "MIPS24K"; + break; + case CPU_25KF: + pfm_mips64_pmu_conf.pmu_name = "MIPS25KF"; + break; + case CPU_SB1: + pfm_mips64_pmu_conf.pmu_name = "SB1"; + break; + case CPU_SB1A: + pfm_mips64_pmu_conf.pmu_name = "SB1A"; + break; +#ifndef CONFIG_SMP + case CPU_34K: + pfm_mips64_pmu_conf.pmu_name = "MIPS34K"; + break; +#if defined(CPU_74K) + case CPU_74K: + pfm_mips64_pmu_conf.pmu_name = "MIPS74K"; + break; +#endif +#endif + default: + PFM_INFO("Unknown cputype 0x%x",c->cputype); + return -1; + } + + num = n_counters(); + if (num == 0) { + PFM_INFO("cputype 0x%x has no counters",c->cputype); + return -1; + } + /* mark remaining counters unavailable */ + for(i=num; i < PFM_MIPS64_NUM_PMCS; i++) { + pfm_mips64_pmc_desc[i].type = PFM_REG_NA; + } + + for(i=num; i < PFM_MIPS64_NUM_PMDS; i++) { + pfm_mips64_pmd_desc[i].type = PFM_REG_NA; + } + + pfm_mips64_pmu_conf.num_pmc_entries = num; + pfm_mips64_pmu_conf.num_pmd_entries = num; + + pfm_mips64_pmu_info.pmu_style = c->cputype; + + ret = pfm_pmu_register(&pfm_mips64_pmu_conf); + if (ret == 0) + perf_irq = perfmon_perf_irq; + return ret; +} + +static void __exit pfm_mips64_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mips64_pmu_conf); + perf_irq = null_perf_irq; +} + +module_init(pfm_mips64_pmu_init_module); +module_exit(pfm_mips64_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/Kconfig =================================================================== --- linux-2.6.orig/arch/powerpc/Kconfig +++ linux-2.6/arch/powerpc/Kconfig @@ -140,6 +140,8 @@ config PPC_OF_PLATFORM_PCI depends on PPC64 # not supported on 32 bits yet default n +source "arch/powerpc/perfmon/Kconfig" + source "init/Kconfig" source "arch/powerpc/platforms/Kconfig" Index: linux-2.6/arch/powerpc/Makefile =================================================================== --- linux-2.6.orig/arch/powerpc/Makefile +++ linux-2.6/arch/powerpc/Makefile @@ -137,6 +137,7 @@ core-y += arch/powerpc/kernel/ \ arch/powerpc/platforms/ core-$(CONFIG_MATH_EMULATION) += arch/powerpc/math-emu/ core-$(CONFIG_XMON) += arch/powerpc/xmon/ +core-$(CONFIG_PERFMON) += arch/powerpc/perfmon/ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ Index: linux-2.6/arch/powerpc/kernel/entry_32.S =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/entry_32.S +++ linux-2.6/arch/powerpc/kernel/entry_32.S @@ -38,7 +38,7 @@ * MSR_KERNEL is > 0x10000 on 4xx/Book-E since it include MSR_CE. 
*/ #if MSR_KERNEL >= 0x10000 -#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l +#define LOAD_MSR_KERNEL(r, x) lis r,(x)@ha; ori r,r,(x)@l #else #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif Index: linux-2.6/arch/powerpc/kernel/entry_64.S =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/entry_64.S +++ linux-2.6/arch/powerpc/kernel/entry_64.S @@ -588,6 +588,10 @@ user_work: b .ret_from_except_lite 1: bl .save_nvgprs +#ifdef CONFIG_PERFMON + addi r3,r1,STACK_FRAME_OVERHEAD + bl .pfm_handle_work +#endif /* CONFIG_PERFMON */ li r3,0 addi r4,r1,STACK_FRAME_OVERHEAD bl .do_signal Index: linux-2.6/arch/powerpc/kernel/process.c =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/process.c +++ linux-2.6/arch/powerpc/kernel/process.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -324,6 +325,9 @@ struct task_struct *__switch_to(struct t new_thread->start_tb = current_tb; } #endif + if (test_tsk_thread_flag(new, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev, new); local_irq_save(flags); @@ -457,6 +461,7 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { discard_lazy_cpu_state(); + pfm_exit_thread(current); } void flush_thread(void) @@ -570,6 +575,7 @@ int copy_thread(int nr, unsigned long cl #else kregs->nip = (unsigned long)ret_from_fork; #endif + pfm_copy_thread(p); return 0; } Index: linux-2.6/arch/powerpc/kernel/time.c =================================================================== --- linux-2.6.orig/arch/powerpc/kernel/time.c +++ linux-2.6/arch/powerpc/kernel/time.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -651,6 +652,8 @@ void timer_interrupt(struct pt_regs * re profile_tick(CPU_PROFILING); calculate_steal_time(); + pfm_handle_switch_timeout(); + #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES)) get_lppaca()->int_dword.fields.decr_int = 0; Index: linux-2.6/arch/powerpc/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/Kconfig @@ -0,0 +1,50 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_POWER5 + tristate "Support for Power5 hardware performance counters" + depends on PERFMON && PPC64 + default n + help + Enables support for the Power 5 hardware performance counters + If unsure, say M. + +config PERFMON_PPC32 + tristate "Support for PPC32 hardware performance counters" + depends on PERFMON && PPC32 + default n + help + Enables support for the PPC32 hardware performance counters + If unsure, say M. + +config PERFMON_CELL + tristate "Support for Cell hardware performance counters" + depends on PERFMON && PPC_CELL + default n + help + Enables support for the Cell hardware performance counters. + If unsure, say M. + +config PERFMON_CELL_HW_SMPL + tristate "Support for Cell hardware counter sampling" + depends on PERFMON_CELL + default n + help + Enables support for the Cell hardware counter sampling modes using + the PMU trace-buffer. + If unsure, say M. 
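+# Note: the Cell modules program the debug bus through the ibm,cbe-perftools RTAS call; see arch/powerpc/perfmon/perfmon_cell.c further down in this patch.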
+ +endmenu Index: linux-2.6/arch/powerpc/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/Makefile @@ -0,0 +1,5 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_POWER5) += perfmon_power5.o +obj-$(CONFIG_PERFMON_PPC32) += perfmon_ppc32.o +obj-$(CONFIG_PERFMON_CELL) += perfmon_cell.o +obj-$(CONFIG_PERFMON_CELL_HW_SMPL) += perfmon_cell_hw_smpl.o Index: linux-2.6/arch/powerpc/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon.c @@ -0,0 +1,282 @@ +/* + * This file implements the powerpc specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * based on versions for other architectures: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +static void pfm_stop_active(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + BUG_ON(!arch_info->disable_counters || !arch_info->get_ovfl_pmds); + + arch_info->disable_counters(ctx, set); + + if (set->npend_ovfls) + return; + + arch_info->get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * for per-thread: + * must stop monitoring for the task + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + /* + * disable lazy restore of PMC registers. + */ + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + pfm_stop_active(task, ctx, set); + + if (arch_info->ctxswout_thread) { + arch_info->ctxswout_thread(task, ctx, set); + } + + return 1; +} + +/* + * Called from pfm_ctxsw + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + if (ctx->state != PFM_CTX_MASKED && ctx->flags.started == 1) { + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, set); + } + + if (arch_info->ctxswin_thread) { + arch_info->ctxswin_thread(task, ctx, set); + } +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. 
Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, set); +} + +/* + * called from pfm_start() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU if task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. + */ +void pfm_arch_start(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + if (task != current) + return; + + BUG_ON(!arch_info->enable_counters); + + arch_info->enable_counters(ctx, set); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 *used_pmds; + unsigned int i, max_pmd; + + /* The model-specific module can override the default + * restore-PMD method. + */ + if (arch_info->restore_pmds) { + return arch_info->restore_pmds(set); + } + + max_pmd = pfm_pmu_conf->regs.max_pmd; + used_pmds = set->used_pmds; + + for (i = 0; i < max_pmd; i++) + if (test_bit(i, used_pmds) && + !(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) + pfm_arch_write_pmd(ctx, i, set->pmds[i].value); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + u64 *impl_pmcs; + unsigned int i, max_pmc; + + /* The model-specific module can override the default + * restore-PMC method. + */ + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->restore_pmcs) { + return arch_info->restore_pmcs(set); + } + + /* The "common" powerpc models enable the counters simply by writing + * all the control registers. Therefore, if we're masked or stopped we + * don't need to bother restoring the PMCs now.
+ */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + unsigned int pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + case 0x0008: /* 750/740 */ + case 0x7000: /* 750FX */ + case 0x7001: + case 0x7002: /* 750GX */ + case 0x000C: /* 7400 */ + case 0x800C: /* 7410 */ + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + return("perfmon_ppc32"); + case PV_POWER4: + case PV_POWER4p: + return "perfmon_power4"; + case PV_POWER5: + case PV_POWER5p: + return "perfmon_power5"; + case PV_970: + case PV_970FX: + case PV_970MP: + return "perfmon_ppc970"; + case PV_BE: + return "perfmon_cell"; + } + return NULL; +} + +void pfm_arch_init_percpu(void) +{ +#ifdef CONFIG_PPC64 + extern void ppc64_enable_pmcs(void); + ppc64_enable_pmcs(); +#endif +} + +/** + * powerpc_irq_handler + * + * Get the perfmon context that belongs to the current CPU, and call the + * model-specific interrupt handler. + **/ +void powerpc_irq_handler(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_context *ctx; + + if (arch_info->irq_handler) { + ctx = __get_cpu_var(pmu_ctx); + if (likely(ctx)) + arch_info->irq_handler(regs, ctx); + } +} Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_cell.c @@ -0,0 +1,610 @@ +/* + * This file contains the Cell PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright IBM Corporation 2007 + * + * Based on other Perfmon2 PMU modules. + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "../platforms/cell/cbe_regs.h" +#include + +MODULE_AUTHOR("Kevin Corry , " + "Carl Love "); +MODULE_DESCRIPTION("Cell PMU description table"); +MODULE_LICENSE("GPL"); + +/* + * Mapping from Perfmon logical control registers to Cell hardware registers. + */ +static struct pfm_regmap_desc pfm_cell_pmc_desc[] = { + /* Per-counter control registers. 
*/ + PMC_D(PFM_REG_I, "pm0_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_control", 0, 0, 0, 0), + + /* Per-counter RTAS arguments. Each of these registers has three fields. + * bits 63-48: debug-bus word + * bits 47-32: sub-unit + * bits 31-0 : full signal number + * (MSB = 63, LSB = 0) + */ + PMC_D(PFM_REG_I, "pm0_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_event", 0, 0, 0, 0), + + /* Global control registers. Same order as enum pm_reg_name. */ + PMC_D(PFM_REG_I, "group_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "debug_bus_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "trace_address", 0, 0, 0, 0), /* KMC: Not sure if user-space needs access to this one. */ + PMC_D(PFM_REG_I, "ext_trace_timer", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_status", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_interval", 0, 0, 0, 0), /* KMC: Does user-space also need read access to this one? */ + PMC_D(PFM_REG_I, "pm_start_stop", 0, 0, 0, 0), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_cell_pmc_desc) + +#define CELL_PMC_PM_STATUS 20 +/* + * Mapping from Perfmon logical data counters to Cell hardware counters. + */ +static struct pfm_regmap_desc pfm_cell_pmd_desc[] = { + PMD_D(PFM_REG_C, "pm0", 0), + PMD_D(PFM_REG_C, "pm1", 0), + PMD_D(PFM_REG_C, "pm2", 0), + PMD_D(PFM_REG_C, "pm3", 0), + PMD_D(PFM_REG_C, "pm4", 0), + PMD_D(PFM_REG_C, "pm5", 0), + PMD_D(PFM_REG_C, "pm6", 0), + PMD_D(PFM_REG_C, "pm7", 0), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_cell_pmd_desc) + +/* The firmware only sees physical CPUs, so divide by 2 if SMT is on. */ +#ifdef CONFIG_SCHED_SMT +#define RTAS_CPU(cpu) ((cpu) / 2) +#else +#define RTAS_CPU(cpu) (cpu) +#endif +#define RTAS_BUS_WORD(x) (u16)(((x) >> 48) & 0x0000ffff) +#define RTAS_SUB_UNIT(x) (u16)(((x) >> 32) & 0x0000ffff) +#define RTAS_SIGNAL_NUMBER(x) (s32)( (x) & 0xffffffff) + +#define subfunc_RESET 1 +#define subfunc_ACTIVATE 2 + +#define passthru_ENABLE 1 +#define passthru_DISABLE 2 + +/** + * struct cell_rtas_arg + * + * @cpu: Processor to modify. Linux numbers CPUs based on SMT IDs, but the + * firmware only sees the physical CPUs. So this value should be the + * SMT ID (from smp_processor_id() or get_cpu()) divided by 2. + * @sub_unit: Hardware subunit this applies to (if applicable). + * @signal_group: Signal group to enable/disable on the trace bus. + * @bus_word: For signal groups that propagate via the trace bus, this trace + * bus word will be used. This is a mask of (1 << TraceBusWord). + * For other signal groups, this specifies the trigger or event bus. + * @bit: Trigger/Event bit, if applicable for the signal group. + * + * An array of these structures are passed to rtas_call() to set up the + * signals on the debug bus. + **/ +struct cell_rtas_arg { + u16 cpu; + u16 sub_unit; + s16 signal_group; + u8 bus_word; + u8 bit; +}; + +/** + * rtas_reset_signals + * + * Set up the RTAS arguments for a RESET command. The buffer will be only + * the first entry in the rtas_args[cpu].signal[] array. 
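Each pm0-7_event control value packs the debug-bus word, sub-unit and full signal number into one 64-bit word, and the signal number itself encodes group*100 + bit, which is exactly what the RTAS_* macros and write_pm07_event() unpack. A small stand-alone sketch of that decoding (the field layout follows the comments above; the struct mirrors cell_rtas_arg only for illustration and is not the kernel type):

#include <stdint.h>
#include <stdio.h>

struct rtas_signal {                 /* illustrative mirror of cell_rtas_arg */
    uint16_t bus_word;               /* mask: 1 << trace-bus word */
    uint16_t sub_unit;
    int16_t  signal_group;
    uint8_t  bit;
};

static struct rtas_signal decode_pm_event(uint64_t v)
{
    struct rtas_signal s;
    uint16_t word   = (uint16_t)((v >> 48) & 0xffff); /* bits 63-48 */
    int32_t  signum = (int32_t)(v & 0xffffffff);      /* bits 31-0  */

    s.bus_word     = (uint16_t)(1u << word);
    s.sub_unit     = (uint16_t)((v >> 32) & 0xffff);  /* bits 47-32 */
    s.signal_group = (int16_t)(signum / 100);
    s.bit          = (uint8_t)(signum % 100);
    return s;
}

int main(void)
{
    /* bus word 1, sub-unit 0, signal number 2119 -> group 21, bit 19 */
    struct rtas_signal s = decode_pm_event((1ULL << 48) | 2119);

    printf("word-mask=%#x group=%d bit=%u\n", s.bus_word, s.signal_group, s.bit);
    return 0;
}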
+ **/ +static int rtas_reset_signals(u32 cpu) +{ + struct cell_rtas_arg signal; + u64 real_addr = virt_to_phys(&signal); + int rc; + + memset(&signal, 0, sizeof(signal)); + signal.cpu = RTAS_CPU(cpu); + rc = rtas_call(rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_RESET, + passthru_DISABLE, + real_addr >> 32, + real_addr & 0xffffffff, + sizeof(signal)); + + return rc; +} + +/** + * rtas_activate_signals + * + * Set up the RTAS arguments for an ACTIVATE command. The buffer will be the + * number of entries in the rtas_args[cpu].signal[] array that were filled + * in by attach_signal_to_counter(). + **/ +static int rtas_activate_signals(struct cell_rtas_arg *signals, + int num_signals) +{ + u64 real_addr = virt_to_phys(signals); + int rc; + + rc = rtas_call(rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_ACTIVATE, + passthru_ENABLE, + real_addr >> 32, + real_addr & 0xffffffff, + num_signals * sizeof(*signals)); + + return rc; +} + +/** + * write_pm07_event + * + * Pull out the RTAS arguments from the 64-bit register value and make the + * RTAS activate-signals call. + **/ +static void write_pm07_event(int cpu, unsigned int ctr, u64 value) +{ + struct cell_rtas_arg signal; + int rc; + + signal.cpu = RTAS_CPU(cpu); + signal.bus_word = 1 << RTAS_BUS_WORD(value); + signal.sub_unit = RTAS_SUB_UNIT(value); + signal.signal_group = RTAS_SIGNAL_NUMBER(value) / 100; + signal.bit = RTAS_SIGNAL_NUMBER(value) % 100; + + rc = rtas_activate_signals(&signal, 1); + if (rc) { + PFM_WARN("%s(%d, %u, %lu): Error calling " + "rtas_activate_signal(): %d\n", __FUNCTION__, + cpu, ctr, (unsigned long)value, rc); + /* FIX: Could we change this routine to return an error? */ + } +} + +/** + * pfm_cell_probe_pmu + * + * Simply check the processor version register to see if we're currently + * on a Cell system. + **/ +static int pfm_cell_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) != PV_BE) + return -1; + + return 0; +} + +/** + * pfm_cell_write_pmc + **/ +static void pfm_cell_write_pmc(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + cbe_write_pm07_control(cpu, cnum, value); + + } else if (cnum < NR_CTRS * 2) { + write_pm07_event(cpu, cnum - NR_CTRS, value); + + } else if (cnum == CELL_PMC_PM_STATUS) { + /* The pm_status register must be treated separately from + * the other "global" PMCs. This call will ensure that + * the interrupts are routed to the correct CPU, as well + * as writing the desired value to the pm_status register. + */ + cbe_enable_pm_interrupts(cpu, cbe_get_hw_thread_id(cpu), value); + + } else if (cnum < PFM_PM_NUM_PMCS) { + cbe_write_pm(cpu, cnum - (NR_CTRS * 2), value); + } +} + +/** + * pfm_cell_write_pmd + **/ +static void pfm_cell_write_pmd(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + cbe_write_ctr(cpu, cnum, value); + } +} + +/** + * pfm_cell_read_pmd + **/ +static u64 pfm_cell_read_pmd(unsigned int cnum) +{ + int cpu = smp_processor_id(); + + if (cnum < NR_CTRS) { + return cbe_read_ctr(cpu, cnum); + } + + return -EINVAL; +} + +/** + * pfm_cell_enable_counters + * + * Just need to turn on the global disable bit in pm_control. + **/ +static void pfm_cell_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + cbe_enable_pm(smp_processor_id()); +} + +/** + * pfm_cell_disable_counters + * + * Just need to turn off the global disable bit in pm_control. 
+ * + * Also, if we're using the hardware-sampling module, we need to empty the + * trace-buffer, since it cannot be restored to its current state when this + * event-set is enabled again. + **/ +static void pfm_cell_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_smpl_fmt *smpl_fmt = ctx->smpl_fmt; + struct pt_regs *regs; + + cbe_disable_pm(smp_processor_id()); + + if (smpl_fmt && !strcmp(smpl_fmt->fmt_name, PFM_CELL_HW_SMPL_NAME)) { + ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD; + ctx->ovfl_arg.active_set = ctx->active_set->id; + regs = current->thread.regs; + smpl_fmt->fmt_handler(ctx->smpl_addr, &ctx->ovfl_arg, + instruction_pointer(regs), 0, regs); + } +} + +/** + * pfm_cell_restore_pmcs + * + * Write all control register values that are saved in the specified event + * set. We could use the pfm_arch_write_pmc() function to restore each PMC + * individually (as is done in other architectures), but that results in + * multiple RTAS calls. As an optimization, we will setup the RTAS argument + * array so we can do all event-control registers in one RTAS call. + **/ +void pfm_cell_restore_pmcs(struct pfm_event_set *set) +{ + struct cell_rtas_arg signals[NR_CTRS]; + u64 value, *used_pmcs = set->used_pmcs; + int i, rc, num_used = 0, cpu = smp_processor_id(); + + memset(signals, 0, sizeof(signals)); + + for (i = 0; i < NR_CTRS; i++) { + /* Write the per-counter control register. If the PMC is not + * in use, then it will simply clear the register, which will + * disable the associated counter. + */ + cbe_write_pm07_control(cpu, i, set->pmcs[i]); + + if (test_bit(i + NR_CTRS, used_pmcs)) { + /* Set up the next RTAS array entry for this counter. + * Only include pm07_event registers that are in use + * by this set so the RTAS call doesn't have to + * process blank array entries. + */ + value = set->pmcs[i + NR_CTRS]; + signals[num_used].cpu = RTAS_CPU(cpu); + signals[num_used].sub_unit = RTAS_SUB_UNIT(value); + signals[num_used].bus_word = 1 << RTAS_BUS_WORD(value); + signals[num_used].bit = RTAS_SIGNAL_NUMBER(value) % 100; + signals[num_used].signal_group = + RTAS_SIGNAL_NUMBER(value) / 100; + num_used++; + } + } + + rc = rtas_activate_signals(signals, num_used); + if (rc) { + PFM_WARN("Error calling rtas_activate_signal(): %d\n", rc); + /* FIX: We will also need this routine to be able to return + * an error if Stephane agrees to change pfm_arch_write_pmc + * to return an error. + */ + } + + /* Write all the global PMCs. Need to call pfm_cell_write_pmc() + * instead of cbe_write_pm() due to special handling for the + * pm_status register. + */ + for (i *= 2; i < PFM_PM_NUM_PMCS; i++) + pfm_cell_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_cell_unload_context + * + * For system-wide contexts and self-monitored contexts, make the RTAS call + * to reset the debug-bus signals. + * + * For non-self-monitored contexts, the monitored thread will already have + * been taken off the CPU and we don't need to do anything additional. + **/ +static int pfm_cell_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + if (task == current || ctx->flags.system) { + rtas_reset_signals(smp_processor_id()); + } + return 0; +} + +/** + * pfm_cell_ctxswout_thread + * + * When a monitored thread is switched out (self-monitored or externally + * monitored) we need to reset the debug-bus signals so the next context that + * gets switched in can start from a clean set of signals. 
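The write and restore paths above rely on a fixed PMC numbering: the eight per-counter control registers come first, then the eight pm0-7_event selectors, then the global registers, which is also why pfm_cell_restore_pmcs() can jump to the globals by doubling i once the first loop has left it equal to NR_CTRS. A small sketch of that classification, assuming NR_CTRS is 8 as in the descriptor tables:

#include <stdio.h>

#define NR_CTRS 8   /* assumption: eight counters, matching the tables above */

static const char *pmc_kind(int cnum)
{
    if (cnum < NR_CTRS)
        return "pm0-7_control (written directly)";
    if (cnum < 2 * NR_CTRS)
        return "pm0-7_event (batched into a single RTAS call)";
    return "global register (pm_status at index 20 needs special routing)";
}

int main(void)
{
    int idx[] = { 0, 7, 8, 15, 16, 20 };

    for (unsigned int i = 0; i < sizeof(idx) / sizeof(idx[0]); i++)
        printf("PMC%-2d -> %s\n", idx[i], pmc_kind(idx[i]));
    return 0;
}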
+ **/ +int pfm_cell_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + rtas_reset_signals(smp_processor_id()); + return 0; +} + +/** + * pfm_cell_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. On Cell, the pm_status + * register contains a bit for each counter to indicate overflow. However, + * those 8 bits are in the reverse order than what Perfmon2 is expecting, + * so we need to reverse the order of the overflow bits. + **/ +static void pfm_cell_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + u32 pm_status, ovfl_ctrs; + u64 povfl_pmds = 0; + int i; + + if (!ctx_arch->last_read_updated) + /* This routine was not called via the interrupt handler. + * Need to start by getting interrupts and updating + * last_read_pm_status. + */ + ctx_arch->last_read_pm_status = + cbe_get_and_clear_pm_interrupts(smp_processor_id()); + + /* Reset the flag that the interrupt handler last read pm_status. */ + ctx_arch->last_read_updated = 0; + + pm_status = ctx_arch->last_read_pm_status & + set->pmcs[CELL_PMC_PM_STATUS]; + ovfl_ctrs = CBE_PM_OVERFLOW_CTRS(pm_status); + + /* Reverse the order of the bits in ovfl_ctrs + * and store the result in povfl_pmds. + */ + for (i = 0; i < PFM_PM_NUM_PMDS; i++) { + povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1); + ovfl_ctrs >>= 1; + } + + /* Mask povfl_pmds with set->used_pmds to get set->povfl_pmds. + * Count the bits set in set->povfl_pmds to get set->npend_ovfls. + */ + bitmap_and(set->povfl_pmds, &povfl_pmds, + set->used_pmds, PFM_PM_NUM_PMDS); + set->npend_ovfls = bitmap_weight(set->povfl_pmds, PFM_PM_NUM_PMDS); +} + +/** + * handle_trace_buffer_interrupts + * + * This routine is for processing just the interval timer and trace buffer + * overflow interrupts. Performance counter interrupts are handled by the + * perf_irq_handler() routine, which reads and saves the pm_status register. + * This routine should not read the actual pm_status register, but rather + * the value passed in. + **/ +static void handle_trace_buffer_interrupts(unsigned long iip, + struct pt_regs *regs, + struct pfm_context *ctx, + u32 pm_status) +{ + struct pfm_smpl_fmt *smpl_fmt; + + if (pm_status & CBE_PM_TRACE_BUFFER_FULL_INTR) { + /* The trace-buffer is full. Get the + * sampling-buffer address and call the handler. + */ + smpl_fmt = ctx->smpl_fmt; + + if (smpl_fmt && + !strcmp(smpl_fmt->fmt_name, PFM_CELL_HW_SMPL_NAME)) { + ctx->ovfl_arg.ovfl_pmd = PFM_CELL_HW_SMPL_OVFL_PMD; + ctx->ovfl_arg.active_set = ctx->active_set->id; + smpl_fmt->fmt_handler(ctx->smpl_addr, &ctx->ovfl_arg, + iip, 0, regs); + } + } + + /* Currently the trace buffer underflow and interval timer + * interrupts are ignored. + */ + + return; +} + +/** + * pfm_cell_irq_handler + * + * Handler for all Cell performance-monitor interrupts. + **/ +static void pfm_cell_irq_handler(struct pt_regs *regs, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + u32 last_read_pm_status; + int cpu = smp_processor_id(); + + /* Need to disable and reenable the performance counters to get the + * desired behavior from the hardware. This is specific to the Cell + * PMU hardware. + */ + cbe_disable_pm(cpu); + + /* Read the pm_status register to get the interrupt bits. 
If a
+ * performance counter overflow interrupt occurred, call the core
+ * perfmon interrupt handler to service the counter overflow. If the
+ * interrupt was for the interval timer or the trace_buffer,
+ * call the interval timer and trace buffer interrupt handler.
+ *
+ * The value read from the pm_status register is stored in the
+ * pfm_arch_context structure for use by other routines. Note that
+ * reading the pm_status register resets the interrupt flags to zero.
+ * Hence, it is important that the register is only read in one place.
+ *
+ * The pm_status interrupt register format is:
+ * [pmd0:pmd1:pmd2:pmd3:pmd4:pmd5:pmd6:pmd7:intt:tbf:tbu:]
+ * - pmd0 to pmd7 are the performance counter overflow interrupts.
+ * - intt is the interval timer overflow interrupt.
+ * - tbf is the trace buffer full interrupt.
+ * - tbu is the trace buffer underflow interrupt.
+ * - The pmd0 bit is the MSB of the 32 bit register.
+ */
+ ctx_arch->last_read_pm_status = last_read_pm_status =
+ cbe_get_and_clear_pm_interrupts(cpu);
+
+ /* Set flag for pfm_cell_get_ovfl_pmds() routine so it knows
+ * last_read_pm_status was updated by the interrupt handler.
+ */
+ ctx_arch->last_read_updated = 1;
+
+ if (last_read_pm_status & CBE_PM_ALL_OVERFLOW_INTR)
+ /* At least one counter overflowed. */
+ pfm_interrupt_handler(instruction_pointer(regs), regs);
+
+ if (last_read_pm_status & (CBE_PM_INTERVAL_INTR |
+ CBE_PM_TRACE_BUFFER_FULL_INTR |
+ CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR))
+ /* Trace buffer or interval timer overflow. */
+ handle_trace_buffer_interrupts(instruction_pointer(regs),
+ regs, ctx, last_read_pm_status);
+
+ /* If the hardware-sampling module masked monitoring for this context,
+ * don't re-enable the PMU.
+ */
+ if (ctx->state & PFM_CTX_MASKED) {
+ return;
+ }
+
+ /* The interrupt setting is the value written to the pm_status
+ * register. It is saved in the context when the register is
+ * written.
+ */
+ cbe_enable_pm_interrupts(cpu, cbe_get_hw_thread_id(cpu),
+ ctx->active_set->pmcs[CELL_PMC_PM_STATUS]);
+
+ /* Writes to the various performance counter registers only go to a
+ * latch. The new values (interrupt setting bits, reset counter value,
+ * etc.) are not copied to the actual registers until the performance
+ * monitor is enabled. In order to get this to work as desired, the
+ * performance monitor needs to be disabled while writing to the
+ * latches. This is a HW design issue.
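As the layout above describes, the per-counter overflow flags occupy the top eight bits of pm_status with pmd0 at the MSB, which is the reverse of the bit order Perfmon2 expects in set->povfl_pmds; pfm_cell_get_ovfl_pmds() therefore reverses them. A user-space sketch of that extraction and reversal (the shift amount is derived from the comment above, not from cbe_regs.h):

#include <stdint.h>
#include <stdio.h>

#define NUM_CTRS 8

/* Illustrative: overflow flags are the top 8 bits, pmd0 at bit 31. */
static uint32_t overflow_ctrs(uint32_t pm_status)
{
    return pm_status >> (32 - NUM_CTRS);
}

/* Reverse the bit order so bit 0 of the result corresponds to pmd0. */
static uint64_t reverse_ovfl_bits(uint32_t ovfl_ctrs)
{
    uint64_t povfl = 0;

    for (int i = 0; i < NUM_CTRS; i++) {
        povfl = (povfl << 1) | (ovfl_ctrs & 1);
        ovfl_ctrs >>= 1;
    }
    return povfl;
}

int main(void)
{
    uint32_t pm_status = 0x80000000;            /* only pmd0 overflowed */
    uint64_t povfl = reverse_ovfl_bits(overflow_ctrs(pm_status));

    printf("povfl_pmds = %#llx\n", (unsigned long long)povfl);  /* 0x1 */
    return 0;
}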
+ */ + cbe_enable_pm(cpu); +} + +static struct pfm_arch_pmu_info pfm_cell_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_CELL, + .write_pmc = pfm_cell_write_pmc, + .write_pmd = pfm_cell_write_pmd, + .read_pmd = pfm_cell_read_pmd, + .enable_counters = pfm_cell_enable_counters, + .disable_counters = pfm_cell_disable_counters, + .irq_handler = pfm_cell_irq_handler, + .get_ovfl_pmds = pfm_cell_get_ovfl_pmds, + .restore_pmcs = pfm_cell_restore_pmcs, + .ctxswout_thread = pfm_cell_ctxswout_thread, + .unload_context = pfm_cell_unload_context, +}; + +static struct pfm_pmu_config pfm_cell_pmu_conf = { + .pmu_name = "Cell", + .version = "0.1", + .counter_width = 32, + .pmd_desc = pfm_cell_pmd_desc, + .pmc_desc = pfm_cell_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_cell_probe_pmu, + .arch_info = &pfm_cell_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_cell_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_cell_pmu_conf); +} + +static void __exit pfm_cell_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_cell_pmu_conf); +} + +module_init(pfm_cell_pmu_init_module); +module_exit(pfm_cell_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_cell_hw_smpl.c @@ -0,0 +1,298 @@ +/* + * Copyright IBM Corp 2007 + * + * Contributed by Carl Love + * and Kevin Corry + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * + * This file implements the IBM Cell PMU hardware-sampling module. + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Carl Love , " + "Kevin Corry "); +MODULE_DESCRIPTION("Perfmon2 CELL hardware sampling format"); +MODULE_LICENSE("GPL"); + +/** + * pfm_cell_hw_smpl_validate + * + * Validate the arguments passed from user-space for creating the + * sampling-buffer. The buffer must be large enough to hold the + * sampling-buffer header and at least one copy of the trace-buffer. + **/ +static int pfm_cell_hw_smpl_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_cell_hw_smpl_arg *arg = data; + + if (!arg) { + PFM_ERR("No argument passed."); + return -EINVAL; + } + + if (arg->buf_size < PFM_CELL_HW_SMPL_MIN_BUF_SIZE) { + PFM_ERR("Specified buffer size (%lu) too small. " + "Min size is %lu bytes.", + arg->buf_size, PFM_CELL_HW_SMPL_MIN_BUF_SIZE); + return -EINVAL; + } + + return 0; +} + +/** + * pfm_cell_hw_smpl_get_size + * + * Tell the Perfmon2 core how large a buffer we need to have allocated, and + * it will do the allocation for us. The size of the buffer has already been + * validated. 
+ **/ +static int pfm_cell_hw_smpl_get_size(unsigned int flags, + void *data, size_t *size) +{ + struct pfm_cell_hw_smpl_arg *arg = data; + *size = arg->buf_size; + return 0; +} + +/** + * pfm_cell_hw_smpl_init + * + * Initialize the start of the sampling-buffer with a header structure. + * The buffer has already been allocated by the Perfmon2 core. + **/ +static int pfm_cell_hw_smpl_init(struct pfm_context *ctx, void *buf, + u32 flags, u16 npmds, void *data) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + struct pfm_cell_hw_smpl_arg *arg = data; + + hdr->count = 0; + hdr->cur_offset = sizeof(*hdr); + hdr->overflows = 0; + hdr->buf_size = arg->buf_size; + hdr->version = PFM_CELL_HW_SMPL_VERSION; + hdr->buf_flags = arg->buf_flags; + + return 0; +} + +/** + * pfm_cell_hw_smpl_notify_user + * + * Add a "buffer full" message to the context and wake up any user-space + * process that is polling on the context's file descriptor. That process + * can then read() from the file-descriptor to get a copy of the message. + **/ +static int pfm_cell_hw_smpl_notify_user(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + if (ctx->flags.no_msg) { + return 0; + } + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + /* The message queue is full. The user must have called + * pfm_restart(), but didn't extract any messages. + */ + PFM_ERR("No notification messages available."); + return -EBUSY; + } + + msg->type = PFM_MSG_CELL_HW_SMPL_BUF_FULL; + + return pfm_notify_user(ctx); +} + +/** + * pfm_cell_hw_smpl_handler + * + * Create a new entry-header in the sampling-buffer and copy the current + * contents of the trace-buffer into the sampling-buffer. + **/ +static int pfm_cell_hw_smpl_handler(void *buf, + struct pfm_ovfl_arg *arg, + unsigned long ip, + u64 tstamp, + void *data) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + struct pfm_cell_hw_smpl_entry_hdr *ent; + struct pfm_context *ctx; + void *cur, *end; + u64 *trace_buffer_lines; + u32 trace_addr; + + /* If this handler was called due to an actual PMD overflowing, do + * nothing. Only store the contents of the trace-buffer if the trace- + * buffer overflowed. + */ + if (arg->ovfl_pmd != PFM_CELL_HW_SMPL_OVFL_PMD) + return 0; + + cur = buf + hdr->cur_offset; + end = buf + hdr->buf_size; + ctx = __get_cpu_var(pmu_ctx); + + /* Check if the sampling-buffer is full. This should never happen, + * since we check if the buffer is full after adding the new entry. + */ + if ((end - cur) < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) { + PFM_ERR("Cell HW Sampling: Buffer is full " + "before adding new entry."); + goto full; + } + + ent = cur; + + /* current = task running at the time of the overflow. + * + * per-task mode: + * - This is ususally the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - This is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->tgid = current->tgid; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->num_samples = 0; + ent->entry_num = hdr->count; + + /* Read at most 1024 lines from the trace-buffer. Note, lines could be + * added to the trace-buffer while it is being read. However, we only + * made sure we had space for up to 1024 lines. 
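Each sampling-buffer entry written by the handler above is an entry header followed by up to 1024 trace-buffer lines of 128 bits each, and the free-space checks against PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE guard against starting an entry that could not hold that worst case. A rough sketch of the size arithmetic under those assumptions (the header layout and the 1024-line limit are taken from the surrounding comments, not from the real headers):

#include <stddef.h>
#include <stdio.h>

#define TRACE_LINE_BYTES     16    /* one 128-bit trace-buffer line       */
#define TRACE_BUF_MAX_COUNT  1024  /* assumed CBE_PM_TRACE_BUF_MAX_COUNT  */

/* Illustrative stand-in for the 128-bit-padded entry header. */
struct smpl_entry_hdr {
    unsigned int pid, tgid, cpu, set;
    unsigned long long entry_num;
    unsigned int num_samples;
} __attribute__((aligned(16)));

int main(void)
{
    size_t max_entry = sizeof(struct smpl_entry_hdr) +
                       TRACE_BUF_MAX_COUNT * TRACE_LINE_BYTES;

    /* The handler refuses to start an entry unless this much space is free,
     * so a partially written entry can never run past the buffer end. */
    printf("worst-case entry size: %zu bytes\n", max_entry);
    return 0;
}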
+ */ + + trace_buffer_lines = (u64 *)(ent + 1); + trace_addr = cbe_read_pm(ent->cpu, trace_address); + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY) && + ent->num_samples < CBE_PM_TRACE_BUF_MAX_COUNT) { + cbe_read_trace_buffer(ent->cpu, trace_buffer_lines); + trace_buffer_lines += 2; + ent->num_samples++; + trace_addr = cbe_read_pm(ent->cpu, trace_address); + } + + /* Update the sampling-buffer header for the next entry. Since the + * hw_smpl_hdr and hw_smpl_entry_hdr structures are both padded to + * 128-bits, and each trace-buffer line is 128-bits, we know that + * every buffer entry will start on a 128-bit boundary. + */ + if (ent->num_samples) { + cur = trace_buffer_lines; + hdr->cur_offset = cur - (void *)hdr; + hdr->count++; + } + + /* Check the available size in the buffer again so we won't lose the + * next sample entry. + */ + if ((end - cur) < PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) + goto full; + + return 0; + +full: + PFM_DBG_ovfl("Sampling-buffer full. free bytes=%lu, count=%lu", + end-cur, hdr->count); + + /* Increment the number of sampling-buffer overflows. This + * is important for detecting duplicate sets of samples. + */ + hdr->overflows++; + + /* Add a message to the context's message queue and wake up any + * user-space program's that are polling on the context's file + * descriptor. + */ + pfm_cell_hw_smpl_notify_user(ctx); + + /* Mask monitoring until a pfm_restart() occurs. */ + pfm_mask_monitoring(ctx, ctx->active_set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + + return -ENOBUFS; +} + +/** + * pfm_cell_hw_smpl_restart + * + * Reinitialize the sampling-buffer header, effectively deleting all entries + * previously stored in the sampling-buffer. + * + * FIX: What is the "is_active" argument for? It's not used by any of the + * other sampling modules. + **/ +static int pfm_cell_hw_smpl_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_cell_hw_smpl_hdr *hdr = buf; + + hdr->count = 0; + hdr->cur_offset = sizeof(*hdr); + hdr->overflows = 0; + + return 0; +} + +/** + * pfm_cell_hw_smpl_exit + **/ +static int pfm_cell_hw_smpl_exit(void *buf) +{ + return 0; +} + +/** + * cell_hw_smpl_fmt + * + * Structure to describe the Cell hardware-sampling module to the Perfmon2 core. + **/ +static struct pfm_smpl_fmt cell_hw_smpl_fmt = { + .fmt_name = PFM_CELL_HW_SMPL_NAME, + .fmt_arg_size = sizeof(struct pfm_cell_hw_smpl_arg), + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .fmt_version = PFM_CELL_HW_SMPL_VERSION, + .fmt_validate = pfm_cell_hw_smpl_validate, + .fmt_getsize = pfm_cell_hw_smpl_get_size, + .fmt_init = pfm_cell_hw_smpl_init, + .fmt_handler = pfm_cell_hw_smpl_handler, + .fmt_restart = pfm_cell_hw_smpl_restart, + .fmt_exit = pfm_cell_hw_smpl_exit, + .owner = THIS_MODULE, +}; + +static int __init pfm_cell_hw_smpl_init_module(void) +{ + return pfm_fmt_register(&cell_hw_smpl_fmt); +} + +static void __exit pfm_cell_hw_smpl_exit_module(void) +{ + pfm_fmt_unregister(&cell_hw_smpl_fmt); +} + +module_init(pfm_cell_hw_smpl_init_module); +module_exit(pfm_cell_hw_smpl_exit_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_power5.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_power5.c @@ -0,0 +1,292 @@ +/* + * This file contains the POWER5 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("David Gibson "); +MODULE_DESCRIPTION("POWER5 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_regmap_desc pfm_power5_pmc_desc[]={ +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), +/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power5_pmc_desc) + +/* The TB and PURR registers are read-only. Also, note that the TB register + * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. + * For Perfmon2's purposes, we'll treat it as a single 64-bit register. + */ +static struct pfm_regmap_desc pfm_power5_pmd_desc[]={ +/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power5_pmd_desc) + +static int pfm_power5_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) != PV_POWER5) + return -1; + + return 0; +} + +static void pfm_power5_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCRA: + mtspr(SPRN_MMCRA, value); + break; + default: + BUG(); + } +} + +static void pfm_power5_write_pmd(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value); + break; + case SPRN_PMC5: + mtspr(SPRN_PMC5, value); + break; + case SPRN_PMC6: + mtspr(SPRN_PMC6, value); + break; + case SPRN_PMC7: + mtspr(SPRN_PMC7, value); + break; + case SPRN_PMC8: + mtspr(SPRN_PMC8, value); + break; + case SPRN_TBRL: + case SPRN_PURR: + /* Ignore writes to read-only registers. 
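The descriptor comment above notes that the timebase is presented as a single 64-bit PMD even though it lives in the two 32-bit TBU/TBL SPRs. When the two halves are read back to back, a wrap of the low half in between can produce a torn value; the conventional guard is to re-read the upper half until it is stable. A user-space sketch of that idiom with the SPR reads stubbed out (the stub values are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Stubs standing in for mfspr(SPRN_TBRU) / mfspr(SPRN_TBRL). */
static uint32_t read_tbu(void) { return 0x12345678u; }
static uint32_t read_tbl(void) { return 0x9abcdef0u; }

/* Re-read TBU until it is stable so the 64-bit value is not torn by a
 * low-half wrap between the two reads. */
static uint64_t read_timebase64(void)
{
    uint32_t hi, lo;

    do {
        hi = read_tbu();
        lo = read_tbl();
    } while (read_tbu() != hi);

    return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
    printf("TB = 0x%016llx\n", (unsigned long long)read_timebase64());
    return 0;
}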
*/ + break; + default: + BUG(); + } +} + +static u64 pfm_power5_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_PMC5: + return mfspr(SPRN_PMC5); + case SPRN_PMC6: + return mfspr(SPRN_PMC6); + case SPRN_PMC7: + return mfspr(SPRN_PMC7); + case SPRN_PMC8: + return mfspr(SPRN_PMC8); + case SPRN_TBRL: + return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + case SPRN_PURR: + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + else + return 0; + default: + BUG(); + } +} + +/** + * pfm_power5_enable_counters + * + * Just need to load the current values into the control registers. + **/ +static void pfm_power5_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) + if (test_bit(i, set->used_pmcs)) + pfm_power5_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_power5_disable_counters + * + * Just need to zero all the control registers. + **/ +static void pfm_power5_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max; i++) + if (test_bit(i, set->used_pmcs)) + pfm_power5_write_pmc(i, 0); +} + +/** + * pfm_power5_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. + **/ +static void pfm_power5_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_power5_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_power5_irq_handler(struct pt_regs *regs, + struct pfm_context *ctx) +{ + u32 mmcr0; + u64 mmcra; + + /* Disable the counters (set the freeze bit) to not polute + * the counts. + */ + mmcr0 = mfspr(SPRN_MMCR0); + mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); + mmcra = mfspr(SPRN_MMCRA); + + /* Set the PMM bit (see comment below). */ + mtmsrd(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + mmcr0 = mfspr(SPRN_MMCR0); + /* Reset the perfmon trigger. */ + mmcr0 |= MMCR0_PMXE; + + /* + * We must clear the PMAO bit on some (GQ) chips. Just do it + * all the time. + */ + mmcr0 &= ~MMCR0_PMAO; + + /* Clear the appropriate bits in the MMCRA. */ + mmcra &= POWER6_MMCRA_THRM | POWER6_MMCRA_OTHER; + mtspr(SPRN_MMCRA, mmcra); + + /* + * Now clear the freeze bit, counting will not start until we + * rfid from this exception, because only at that point will + * the PMM bit be cleared. 
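pfm_power5_get_ovfl_pmds() above detects an overflow by testing the bit just past the counter width in the freshly read PMD value. Below is a stand-alone version of that test; the 1ULL literal keeps the shift in 64-bit arithmetic, which matters once the width reaches 31 bits:

#include <stdint.h>
#include <stdio.h>

#define COUNTER_WIDTH 31   /* as in the POWER5/PPC32 configurations */

/* A counter has overflowed when the bit above its width becomes set. */
static int pmd_overflowed(uint64_t pmd_value)
{
    uint64_t width_mask = 1ULL << COUNTER_WIDTH;

    return (pmd_value & width_mask) != 0;
}

int main(void)
{
    printf("%d %d\n", pmd_overflowed(0x7fffffffULL),    /* 0: below 2^31 */
                      pmd_overflowed(0x80000000ULL));   /* 1: bit 31 set */
    return 0;
}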
+ */ + mmcr0 &= ~MMCR0_FC; + mtspr(SPRN_MMCR0, mmcr0); +} + +struct pfm_arch_pmu_info pfm_power5_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_POWER5, + .write_pmc = pfm_power5_write_pmc, + .write_pmd = pfm_power5_write_pmd, + .read_pmd = pfm_power5_read_pmd, + .irq_handler = pfm_power5_irq_handler, + .get_ovfl_pmds = pfm_power5_get_ovfl_pmds, + .enable_counters = pfm_power5_enable_counters, + .disable_counters = pfm_power5_disable_counters, +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_power5_pmu_conf = { + .pmu_name = "POWER5", + .counter_width = 31, + .pmd_desc = pfm_power5_pmd_desc, + .pmc_desc = pfm_power5_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_power5_probe_pmu, + .arch_info = &pfm_power5_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_power5_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_power5_pmu_conf); +} + +static void __exit pfm_power5_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_power5_pmu_conf); +} + +module_init(pfm_power5_pmu_init_module); +module_exit(pfm_power5_pmu_cleanup_module); Index: linux-2.6/arch/powerpc/perfmon/perfmon_ppc32.c =================================================================== --- /dev/null +++ linux-2.6/arch/powerpc/perfmon/perfmon_ppc32.c @@ -0,0 +1,340 @@ +/* + * This file contains the PPC32 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Philip Mucci, mucci@cs.utk.edu + * + * Based on code from: + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("PPC32 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_pmu_config pfm_ppc32_pmu_conf; + +static struct pfm_regmap_desc pfm_ppc32_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", 0x0, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0x0, 0, 0, SPRN_MMCR1), +/* mmcr2 */ PMC_D(PFM_REG_I, "MMCR2", 0x0, 0, 0, SPRN_MMCR2), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_ppc32_pmc_desc) + +static struct pfm_regmap_desc pfm_ppc32_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_ppc32_pmd_desc) + +static void perfmon_perf_irq(struct pt_regs *regs) +{ + u32 mmcr0; + + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + + /* set the PMM bit (see comment below) */ + mtmsr(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + /* The freeze bit was set by the interrupt. + * Clear the freeze bit, and reenable the interrupt. + * The counters won't actually start until the rfi clears + * the PMM bit. + */ + + /* Unfreezes the counters on this CPU, enables the interrupt, + * enables the counters to trigger the interrupt, and sets the + * counters to only count when the mark bit is not set. + */ + mmcr0 = mfspr(SPRN_MMCR0); + + mmcr0 &= ~(MMCR0_FC | MMCR0_FCM0); + mmcr0 |= (MMCR0_FCECE | MMCR0_PMC1CE | MMCR0_PMCnCE | MMCR0_PMXE); + + mtspr(SPRN_MMCR0, mmcr0); +} + +static int pfm_ppc32_probe_pmu(void) +{ + enum ppc32_pmu_type pm_type; + int nmmcr = 0, npmds = 0, intsok = 0, i; + unsigned int pvr; + char *str; + + pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + str = "PPC604"; + pm_type = PFM_POWERPC_PMU_604; + nmmcr = 1; + npmds = 2; + break; + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + str = "PPC604e"; + pm_type = PFM_POWERPC_PMU_604e; + nmmcr = 2; + npmds = 4; + break; + case 0x0008: /* 750/740 */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + break; + case 0x7000: /* 750FX */ + case 0x7001: + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + if ((pvr & 0xFF0F) >= 0x0203) + intsok = 1; + break; + case 0x7002: /* 750GX */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + intsok = 1; + case 0x000C: /* 7400 */ + str = "PPC7400"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + break; + case 0x800C: /* 7410 */ + str = "PPC7410"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + if ((pvr & 0xFFFF) >= 0x01103) + intsok = 1; + break; + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + str = "PPC7450"; + pm_type = PFM_POWERPC_PMU_7450; + nmmcr = 3; npmds = 6; + intsok = 1; + break; + default: + PFM_INFO("Unknown PVR_VER(0x%x)\n", PVR_VER(pvr)); + return -1; + } + + /* + * deconfigure unimplemented registers + */ + for (i = npmds; i < PFM_PM_NUM_PMDS; i++) + pfm_ppc32_pmd_desc[i].type = PFM_REG_NA; + + for 
(i = nmmcr; i < PFM_PM_NUM_PMCS; i++)
+ pfm_ppc32_pmc_desc[i].type = PFM_REG_NA;
+
+ /*
+ * update PMU description structure
+ */
+ pfm_ppc32_pmu_conf.pmu_name = str;
+ pfm_ppc32_pmu_info.pmu_style = pm_type;
+ pfm_ppc32_pmu_conf.num_pmc_entries = nmmcr;
+ pfm_ppc32_pmu_conf.num_pmd_entries = npmds;
+
+ if (intsok == 0)
+ PFM_INFO("Interrupts unlikely to work\n");
+
+ return reserve_pmc_hardware(perfmon_perf_irq);
+}
+
+static void pfm_ppc32_write_pmc(unsigned int cnum, u64 value)
+{
+ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) {
+ case SPRN_MMCR0:
+ mtspr(SPRN_MMCR0, value);
+ break;
+ case SPRN_MMCR1:
+ mtspr(SPRN_MMCR1, value);
+ break;
+ case SPRN_MMCR2:
+ mtspr(SPRN_MMCR2, value);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static void pfm_ppc32_write_pmd(unsigned int cnum, u64 value)
+{
+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
+ case SPRN_PMC1:
+ mtspr(SPRN_PMC1, value);
+ break;
+ case SPRN_PMC2:
+ mtspr(SPRN_PMC2, value);
+ break;
+ case SPRN_PMC3:
+ mtspr(SPRN_PMC3, value);
+ break;
+ case SPRN_PMC4:
+ mtspr(SPRN_PMC4, value);
+ break;
+ case SPRN_PMC5:
+ mtspr(SPRN_PMC5, value);
+ break;
+ case SPRN_PMC6:
+ mtspr(SPRN_PMC6, value);
+ break;
+ default:
+ BUG();
+ }
+}
+
+static u64 pfm_ppc32_read_pmd(unsigned int cnum)
+{
+ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) {
+ case SPRN_PMC1:
+ return mfspr(SPRN_PMC1);
+ case SPRN_PMC2:
+ return mfspr(SPRN_PMC2);
+ case SPRN_PMC3:
+ return mfspr(SPRN_PMC3);
+ case SPRN_PMC4:
+ return mfspr(SPRN_PMC4);
+ case SPRN_PMC5:
+ return mfspr(SPRN_PMC5);
+ case SPRN_PMC6:
+ return mfspr(SPRN_PMC6);
+ default:
+ BUG();
+ }
+}
+
+/**
+ * pfm_ppc32_enable_counters
+ *
+ * Just need to load the current values into the control registers.
+ **/
+static void pfm_ppc32_enable_counters(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ unsigned int i, max_pmc;
+
+ max_pmc = pfm_pmu_conf->regs.max_pmc;
+
+ for (i = 0; i < max_pmc; i++)
+ if (test_bit(i, set->used_pmcs))
+ pfm_ppc32_write_pmc(i, set->pmcs[i]);
+}
+
+/**
+ * pfm_ppc32_disable_counters
+ *
+ * Just need to zero all the control registers.
+ **/
+static void pfm_ppc32_disable_counters(struct pfm_context *ctx,
+ struct pfm_event_set *set)
+{
+ unsigned int i, max;
+
+ max = pfm_pmu_conf->regs.max_pmc;
+
+ for (i = 0; i < max; i++)
+ if (test_bit(i, set->used_pmcs))
+ pfm_ppc32_write_pmc(i, 0);
+}
+
+/**
+ * pfm_ppc32_get_ovfl_pmds
+ *
+ * Determine which counters in this set have overflowed and fill in the
+ * set->povfl_pmds mask and set->npend_ovfls count.
+ **/ +static void pfm_ppc32_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_ppc32_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +struct pfm_arch_pmu_info pfm_ppc32_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_NONE, + .write_pmc = pfm_ppc32_write_pmc, + .write_pmd = pfm_ppc32_write_pmd, + .read_pmd = pfm_ppc32_read_pmd, + .get_ovfl_pmds = pfm_ppc32_get_ovfl_pmds, + .enable_counters = pfm_ppc32_enable_counters, + .disable_counters = pfm_ppc32_disable_counters, +}; + +static struct pfm_pmu_config pfm_ppc32_pmu_conf = { + .counter_width = 31, + .pmd_desc = pfm_ppc32_pmd_desc, + .pmc_desc = pfm_ppc32_pmc_desc, + .probe_pmu = pfm_ppc32_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .version = "0.1", + .arch_info = &pfm_ppc32_pmu_info, +}; + +static int __init pfm_ppc32_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ppc32_pmu_conf); +} + +static void __exit pfm_ppc32_pmu_cleanup_module(void) +{ + release_pmc_hardware(); + pfm_pmu_unregister(&pfm_ppc32_pmu_conf); +} + +module_init(pfm_ppc32_pmu_init_module); +module_exit(pfm_ppc32_pmu_cleanup_module); Index: linux-2.6/arch/x86_64/Kconfig =================================================================== --- linux-2.6.orig/arch/x86_64/Kconfig +++ linux-2.6/arch/x86_64/Kconfig @@ -667,6 +667,8 @@ config K8_NB def_bool y depends on AGP_AMD64 || IOMMU || (PCI && NUMA) +source "arch/x86_64/perfmon/Kconfig" + endmenu # Index: linux-2.6/arch/x86_64/Makefile =================================================================== --- linux-2.6.orig/arch/x86_64/Makefile +++ linux-2.6/arch/x86_64/Makefile @@ -79,6 +79,7 @@ core-y += arch/x86_64/kernel/ \ arch/x86_64/crypto/ core-$(CONFIG_IA32_EMULATION) += arch/x86_64/ia32/ drivers-$(CONFIG_PCI) += arch/x86_64/pci/ +drivers-$(CONFIG_PERFMON) += arch/x86_64/perfmon/ drivers-$(CONFIG_OPROFILE) += arch/x86_64/oprofile/ boot := arch/x86_64/boot Index: linux-2.6/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux-2.6.orig/arch/x86_64/ia32/ia32entry.S +++ linux-2.6/arch/x86_64/ia32/ia32entry.S @@ -719,4 +719,16 @@ ia32_sys_call_table: .quad compat_sys_signalfd .quad compat_sys_timerfd .quad sys_eventfd + .quad sys_pfm_create_context + .quad sys_pfm_write_pmcs /* 325 */ + .quad sys_pfm_write_pmds + .quad sys_pfm_read_pmds + .quad sys_pfm_load_context + .quad sys_pfm_start + .quad sys_pfm_stop /* 330 */ + .quad sys_pfm_restart + .quad sys_pfm_create_evtsets + .quad sys_pfm_getinfo_evtsets + .quad sys_pfm_delete_evtsets + .quad sys_pfm_unload_context /* 335 */ ia32_syscall_end: Index: linux-2.6/arch/x86_64/kernel/entry.S =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/entry.S +++ linux-2.6/arch/x86_64/kernel/entry.S @@ -282,7 +282,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz 1f /* Really a signal */ @@ -375,7 +375,7 @@ 
int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -599,7 +599,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP|_TIF_PERFMON_WORK),%edx jz retint_swapgs TRACE_IRQS_ON sti @@ -691,7 +691,12 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + +#ifdef CONFIG_PERFMON +ENTRY(pmu_interrupt) + apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt +#endif + /* * Exception entry points. */ Index: linux-2.6/arch/x86_64/kernel/i8259.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/i8259.c +++ linux-2.6/arch/x86_64/kernel/i8259.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include Index: linux-2.6/arch/x86_64/kernel/process.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/process.c +++ linux-2.6/arch/x86_64/kernel/process.c @@ -379,6 +379,7 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } + pfm_exit_thread(me); } void flush_thread(void) @@ -487,6 +488,8 @@ int copy_thread(int nr, unsigned long cl asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); + pfm_copy_thread(p); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -557,6 +560,10 @@ static inline void __switch_to_xtra(stru */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW) + || test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw(prev_p, next_p); } /* @@ -661,7 +668,7 @@ __switch_to(struct task_struct *prev_p, * Now maybe reload the debug registers and handle I/O bitmaps */ if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW)) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) + || (task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW)) __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full Index: linux-2.6/arch/x86_64/kernel/setup64.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/setup64.c +++ linux-2.6/arch/x86_64/kernel/setup64.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -286,4 +287,6 @@ void __cpuinit cpu_init (void) fpu_init(); raw_local_save_flags(kernel_eflags); + + pfm_init_percpu(); } Index: linux-2.6/arch/x86_64/kernel/signal.c =================================================================== --- linux-2.6.orig/arch/x86_64/kernel/signal.c +++ linux-2.6/arch/x86_64/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -472,6 +473,9 @@ do_notify_resume(struct pt_regs *regs, v clear_thread_flag(TIF_SINGLESTEP); } + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); Index: linux-2.6/arch/x86_64/kernel/smpboot.c =================================================================== --- 
linux-2.6.orig/arch/x86_64/kernel/smpboot.c +++ linux-2.6/arch/x86_64/kernel/smpboot.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1043,6 +1044,7 @@ int __cpu_disable(void) spin_unlock(&vector_lock); remove_cpu_from_maps(); fixup_irqs(cpu_online_map); + pfm_cpu_disable(); return 0; } Index: linux-2.6/arch/x86_64/oprofile/Makefile =================================================================== --- linux-2.6.orig/arch/x86_64/oprofile/Makefile +++ linux-2.6/arch/x86_64/oprofile/Makefile @@ -15,5 +15,6 @@ OPROFILE-y := init.o backtrace.o OPROFILE-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o op_model_p4.o \ op_model_ppro.o OPROFILE-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o +OPROFILE-$(CONFIG_PERFMON) += perfmon.o oprofile-y = $(DRIVER_OBJS) $(addprefix ../../i386/oprofile/, $(OPROFILE-y)) Index: linux-2.6/arch/x86_64/perfmon/Kconfig =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/Kconfig @@ -0,0 +1,55 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config X86_64_PERFMON_K8 + tristate "Support 64-bit mode AMD Athlon64 and Opteron64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for 64-bit mode AMD Athlon64 and Opteron64 processors + hardware performance counters. + +config X86_64_PERFMON_P4 + tristate "Support for Intel 64-bit Pentium 4/Xeon hardware performance counters" + depends on PERFMON + default n + help + Enables support for Intel 64-bit mode Pentium 4/Xeon hardware performance + counters. + +config X86_64_PERFMON_CORE + tristate "Support for Intel Core-based performance counters" + depends on PERFMON + default n + help + Enables 64-bit support for Intel Core-based performance counters. Enable + this option to support Intel Core 2 Duo processors. + +config X86_64_PERFMON_INTEL_ARCH + tristate "Support for Intel architectural performance counters" + depends on PERFMON + default n + help + Enables 64-bit support for Intel architectural performance counters. + +config X86_64_PERFMON_PEBS + tristate "Support for Intel Precise Event-Based Sampling (PEBS)" + depends on PERFMON + default n + help + Enables support for 64-bit Precise Event-Based Sampling (PEBS) on the Intel + Pentium 4, Xeon, and Core-based processors which support it. +endmenu Index: linux-2.6/arch/x86_64/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/Makefile @@ -0,0 +1,16 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
+# Contributed by Stephane Eranian +# + +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_X86_64_PERFMON_K8) += perfmon_k8.o +obj-$(CONFIG_X86_64_PERFMON_P4) += perfmon_p4.o +obj-$(CONFIG_X86_64_PERFMON_CORE) += perfmon_core.o +obj-$(CONFIG_X86_64_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o +obj-$(CONFIG_X86_64_PERFMON_PEBS) += perfmon_pebs_smpl.o + +perfmon-$(subst m,y,$(CONFIG_PERFMON)) += ../../i386/perfmon/perfmon.o +perfmon_p4-$(subst m,y,$(CONFIG_X86_64_PERFMON_P4)) += ../../i386/perfmon/perfmon_p4.o +perfmon_intel_arch-$(subst m,y,$(CONFIG_X86_64_PERFMON_INTEL_ARCH)) += ../../i386/perfmon/perfmon_intel_arch.o +perfmon_pebs_smpl-$(subst m,y,$(CONFIG_X86_64_PERFMON_PEBS)) += ../../i386/perfmon/perfmon_pebs_smpl.o Index: linux-2.6/arch/x86_64/perfmon/perfmon_core.c =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/perfmon_core.c @@ -0,0 +1,215 @@ +/* + * This file contains the Intel Core PMU registers description tables. + * Intel Core-based processors support architectural perfmon v1 + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Core"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CORE_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_CORE_PMC_VAL (1ULL<<20) +#define PFM_CORE_NO64 (1ULL<<20) + +#define PFM_CORE_NA { .reg_type = PFM_REGT_NA} + +#define PFM_CORE_CA(m, c, t) \ + { \ + .addrs[0] = m, \ + .ctr = c, \ + .reg_type = t \ + } +/* + * physical addresses of MSR for evntsel and perfctr registers + * + * IMPORTANT: + * The mapping was chosen to be compatible with the Intel + * architectural perfmon, so that applications which only + * know about the architectural perfmon can work on Core + * without any changes. After all, this is the goal of + * having an architected PMU. + * + * Because of this compatibility constraint, it is not + * possibleto use PERF_GLOBAL_CTRL, unless we force it to + * 'enable all' which is the default value. Applications + * written for architectural perfmon do not know about + * PERF_GLOBAL_CTRL which is an Intel Core specific + * extension. + * + * Fixed counters are placed after the generic counters + * which are compatible with architectural perfmon. 
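PFM_CORE_PMC_RSVD further down marks the upper 32 bits plus bits 20 and 21 of a PERFEVTSEL value as reserved. A quick sketch of building that mask and checking which reserved bits a candidate value touches (what the perfmon core then does with a non-zero result is outside this sketch):

#include <stdint.h>
#include <stdio.h>

/* Upper 32 bits plus bits 20 and 21, as in PFM_CORE_PMC_RSVD. */
#define PMC_RSVD  ((~((1ULL << 32) - 1)) | (1ULL << 20) | (1ULL << 21))

/* Return the reserved bits that a candidate PERFEVTSEL value touches. */
static uint64_t reserved_bits_set(uint64_t val)
{
    return val & PMC_RSVD;
}

int main(void)
{
    printf("%#llx\n", (unsigned long long)reserved_bits_set(0x004100c0ULL));
    printf("%#llx\n", (unsigned long long)reserved_bits_set(1ULL << 33));
    return 0;
}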
+ */ +struct pfm_arch_pmu_info pfm_core_pmu_info={ + .pmc_addrs = { + PFM_CORE_CA(MSR_P6_EVNTSEL0, 0, PFM_REGT_EN), + PFM_CORE_CA(MSR_P6_EVNTSEL1, 1, PFM_REGT_EN), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR_CTRL, 0, PFM_REGT_EN), + PFM_CORE_CA(MSR_IA32_PEBS_ENABLE, 0, PFM_REGT_EN) + }, + .pmd_addrs = { + PFM_CORE_CA(MSR_P6_PERFCTR0, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_P6_PERFCTR1, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR0, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR1, 0, PFM_REGT_CTR), + PFM_CORE_CA(MSR_CORE_PERF_FIXED_CTR2, 0, PFM_REGT_CTR) + }, + .pebs_ctr_idx = 0, /* IA32_PMC0 */ + .pmu_style = PFM_X86_PMU_CORE +}; + +static struct pfm_regmap_desc pfm_core_pmc_desc[]={ +/* pmc0 */ { + .type = PFM_REG_I64, + .desc = "PERFEVTSEL0", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL0 + }, +/* pmc1 */ { + .type = PFM_REG_W64, + .desc = "PERFEVTSEL1", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL1 + }, +/* pmc2 */ { .type = PFM_REG_W, + .desc = "FIXED_CTRL", + .dfl_val = 0x888ULL, + .rsvd_msk = 0xfffffffffffff444ULL, + .no_emul64_msk = 0, + .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +/* pmc3 */ { .type = PFM_REG_I, + .desc = "PEBS_ENABLE", + .dfl_val = 0, + .rsvd_msk = 0xfffffffffffffffeULL, + .no_emul64_msk = 0, + .hw_addr = MSR_IA32_PEBS_ENABLE + } +}; + +#define PFM_CORE_D(n) PMD_D(PFM_REG_C, "PMC"#n, MSR_P6_PERFCTR0+n) +#define PFM_CORE_FD(n) PMD_D(PFM_REG_C, "FIXED_CTR"#n, MSR_CORE_PERF_FIXED_CTR0+n) + +static struct pfm_regmap_desc pfm_core_pmd_desc[]={ +/* pmd0 */ PFM_CORE_D(0), +/* pmd1 */ PFM_CORE_D(1), +/* pmd2 */ PFM_CORE_FD(0), +/* pmd3 */ PFM_CORE_FD(1), +/* pmd4 */ PFM_CORE_FD(2) +}; +#define PFM_CORE_NUM_PMCS ARRAY_SIZE(pfm_core_pmc_desc) +#define PFM_CORE_NUM_PMDS ARRAY_SIZE(pfm_core_pmd_desc) + +static struct pfm_pmu_config pfm_core_pmu_conf; + +static int pfm_core_probe_pmu(void) +{ + /* + * Check for Intel Core processors. + * Checking for cpu_has_perfmon is not enough as this + * matches intel Core Duo/Core Solo which do not have + * PEBS nor fixed counters. + */ + if (cpu_data->x86 != 6 || cpu_data->x86_model != 15) + return -1; + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, unsupported"); + return -1; + } + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * NMI using PMU? + * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_core_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + /* + * Intel Core processors implement DS and PEBS, no need to check + */ + pfm_core_pmu_info.flags |= PFM_X86_FL_PMU_DS|PFM_X86_FL_PMU_PEBS; + PFM_INFO("PEBS supported, enabled"); + + return 0; +} + +/* + * called on PMC1, FIXED_CTR. Reject access when PEBS is used + */ +static int pfm_core_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + return ctx_arch->flags & PFM_X86_USE_PEBS ? -EINVAL : 0; +} + +/* + * Counters may have model-specific width which can be probed using + * the CPUID.0xa leaf. Yet, the documentation says: " + * In the initial implementation, only the read bit width is reported + * by CPUID, write operations are limited to the low 32 bits. + * Bits [w-32] are sign extensions of bit 31. As such the effective width + * of a counter is 31 bits only. 
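Given the 31-bit effective width described in the comment just below, a full 64-bit software counter has to be split: the hardware register carries only the low bits and software accumulates the rest, folding the live hardware value back in on read. A minimal sketch of that common virtualization scheme, assuming the 31-bit width; it is not lifted from the perfmon core:

#include <stdint.h>
#include <stdio.h>

#define HW_WIDTH 31
#define HW_MASK  ((1ULL << HW_WIDTH) - 1)

/* Usual virtualization scheme: the hardware register holds the low 31 bits,
 * software accumulates the rest and folds the live value back in on read. */
struct soft_counter {
    uint64_t sw_bits;   /* bits above the hardware width */
};

static uint64_t read_virtual(const struct soft_counter *c, uint64_t hw_value)
{
    return c->sw_bits + (hw_value & HW_MASK);
}

int main(void)
{
    struct soft_counter c = { .sw_bits = 5ULL << HW_WIDTH };

    printf("virtual count = %llu\n",
           (unsigned long long)read_virtual(&c, 123));
    return 0;
}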
+ */ +static struct pfm_pmu_config pfm_core_pmu_conf={ + .pmu_name = "Intel Core", + .pmd_desc = pfm_core_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_CORE_NUM_PMCS, + .num_pmd_entries = PFM_CORE_NUM_PMDS, + .pmc_write_check = pfm_core_pmc_check, + .pmc_desc = pfm_core_pmc_desc, + .probe_pmu = pfm_core_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .arch_info = &pfm_core_pmu_info +}; + +static int __init pfm_core_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_core_pmu_conf); +} + +static void __exit pfm_core_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_core_pmu_conf); +} + +module_init(pfm_core_pmu_init_module); +module_exit(pfm_core_pmu_cleanup_module); Index: linux-2.6/arch/x86_64/perfmon/perfmon_k8.c =================================================================== --- /dev/null +++ linux-2.6/arch/x86_64/perfmon/perfmon_k8.c @@ -0,0 +1,347 @@ +/* + * This file contains the PMU description for the Athlon64 and Opteron64 + * processors. It supports 32 and 64-bit modes. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("AMD64 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +static struct pfm_arch_pmu_info pfm_k8_pmu_info = { + .pmc_addrs = { +/* pmc0 */ {{MSR_K7_EVNTSEL0, 0}, 0, PFM_REGT_EN}, +/* pmc1 */ {{MSR_K7_EVNTSEL1, 0}, 1, PFM_REGT_EN}, +/* pmc2 */ {{MSR_K7_EVNTSEL2, 0}, 2, PFM_REGT_EN}, +/* pmc3 */ {{MSR_K7_EVNTSEL3, 0}, 3, PFM_REGT_EN}, + }, + .pmd_addrs = { +/* pmd0 */ {{MSR_K7_PERFCTR0, 0}, 0, PFM_REGT_CTR}, +/* pmd1 */ {{MSR_K7_PERFCTR1, 0}, 0, PFM_REGT_CTR}, +/* pmd2 */ {{MSR_K7_PERFCTR2, 0}, 0, PFM_REGT_CTR}, +/* pmd3 */ {{MSR_K7_PERFCTR3, 0}, 0, PFM_REGT_CTR}, + }, + .pmu_style = PFM_X86_PMU_AMD64 +}; + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +/* + * reserved bits must be zero + * + * - upper 32 bits are reserved + * - APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + */ +#define PFM_K8_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +static struct pfm_regmap_desc pfm_k8_pmc_desc[]={ +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), +/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), +/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", 
PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), +}; +#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_k8_pmc_desc) + +static struct pfm_regmap_desc pfm_k8_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "PERFCTR0", MSR_K7_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C, "PERFCTR1", MSR_K7_PERFCTR1), +/* pmd2 */ PMD_D(PFM_REG_C, "PERFCTR2", MSR_K7_PERFCTR2), +/* pmd3 */ PMD_D(PFM_REG_C, "PERFCTR3", MSR_K7_PERFCTR3) +}; +#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_k8_pmd_desc) + +static struct pfm_context **pfm_nb_sys_owners; +static struct pfm_context *pfm_nb_task_owner; + +static struct pfm_pmu_config pfm_k8_pmu_conf; + +/* + * There can only be one user per socket for the Northbridge (NB) events, + * so we enforce mutual exclusion as follows: + * - per-thread : only one context machine-wide can use NB events + * - system-wide: only one context per processor socket + * + * Exclusion is enforced at: + * - pfm_load_context() + * - pfm_write_pmcs() for attached contexts + * + * Exclusion is released at: + * - pfm_unload_context() or any calls that implicitely uses it + * + * return: + * 0 : successfully acquire NB access + * < 0: errno, failed to acquire NB access + */ +static int pfm_k8_acquire_nb(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = topology_physical_package_id(smp_processor_id()); +#else + proc_id = 0; +#endif + + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, NULL, ctx); + if (!old) { + if (ctx->flags.system) + PFM_DBG("acquired Northbridge event access on socket %u", proc_id); + else + PFM_DBG("acquired Northbridge event access globally"); + } else if (old != ctx) { + if (ctx->flags.system) + PFM_DBG("NorthBridge event conflict on socket %u", proc_id); + else + PFM_DBG("global NorthBridge event conflict"); + return -EBUSY; + } + return 0; +} + +/* + * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL,i.e., + * when we have detected a multi-core processor. + * + * context is locked, interrupts are masked + */ +static int pfm_k8_pmc_write_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + unsigned int event; + /* + * delay checking NB event until we load the context + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + /* + * check event is NB event + */ + event = (unsigned int)(req->reg_value & 0xff); + if (event < 0xee) + return 0; + + return pfm_k8_acquire_nb(ctx); +} + +/* + * invoked on pfm_load_context(). 
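A standalone sketch of the event-code test applied by the write-checker above: only the low eight bits of the event-select value are examined, and codes of 0xee and above are treated as NorthBridge events that require exclusive ownership. The event-select values used below are illustrative.

#include <stdio.h>
#include <stdint.h>

/* mirrors the check in pfm_k8_pmc_write_check() */
static int k8_is_nb_event(uint64_t evntsel)
{
	unsigned int event = (unsigned int)(evntsel & 0xff);

	return event >= 0xee;
}

int main(void)
{
	printf("event 0x76 -> NB? %d\n", k8_is_nb_event(0x430076));	/* below threshold */
	printf("event 0xee -> NB? %d\n", k8_is_nb_event(0x4300ee));	/* NB event */
	return 0;
}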
+ * context is locked, interrupts are masked + */ +static int pfm_k8_load_context(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, n; + + /* + * scan all sets for NB events + */ + list_for_each_entry(set, &ctx->list, list) { + n = set->nused_pmcs; + for(i=0; n; i++) { + if (!test_bit(i, cast_ulp(set->used_pmcs))) + continue; + if ((set->pmcs[i] & 0xff) >= 0xee) + goto found; + n--; + } + } + return 0; +found: + return pfm_k8_acquire_nb(ctx); +} + +/* + * invoked on pfm_unload_context() + */ +static int pfm_k8_unload_context(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = topology_physical_package_id(smp_processor_id()); +#else + proc_id = 0; +#endif + + /* + * unload always happens on the monitored CPU in system-wide + */ + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, ctx, NULL); + if (old == ctx) { + if (ctx->flags.system) + PFM_DBG("released NorthBridge on socket %u", proc_id); + else + PFM_DBG("released NorthBridge events globally"); + } + return 0; +} + +/* + * detect if we need to active NorthBridge event access control + */ +static int pfm_k8_setup_nb_event_control(void) +{ + unsigned int c, n = 0; + unsigned int max_phys = 0; + +#ifdef CONFIG_SMP + for_each_possible_cpu(c) { + if (cpu_data[c].phys_proc_id > max_phys) + max_phys = cpu_data[c].phys_proc_id; + } +#else + max_phys = 0; +#endif + if (max_phys > 255) { + PFM_INFO("socket id %d is too big to handle", max_phys); + return -ENOMEM; + } + + n = max_phys + 1; + if (n < 2) + return 0; + + pfm_nb_sys_owners = vmalloc(n * sizeof(*pfm_nb_sys_owners)); + if (!pfm_nb_sys_owners) + return -ENOMEM; + + memset(pfm_nb_sys_owners, 0, n * sizeof(*pfm_nb_sys_owners)); + pfm_nb_task_owner = NULL; + + /* + * activate write-checker for PMC registers + */ + for(c=0; c < PFM_AMD_NUM_PMCS; c++) { + pfm_k8_pmc_desc[c].type |= PFM_REG_WC; + } + + pfm_k8_pmu_info.load_context = pfm_k8_load_context; + pfm_k8_pmu_info.unload_context = pfm_k8_unload_context; + + pfm_k8_pmu_conf.pmc_write_check = pfm_k8_pmc_write_check; + + PFM_INFO("NorthBridge event access control enabled"); + + return 0; +} + +static int pfm_k8_probe_pmu(void) +{ + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { + PFM_INFO("not an AMD processor"); + return -1; + } + + switch (current_cpu_data.x86) { + case 15: + case 16: + PFM_INFO("found family=%d", current_cpu_data.x86); + break; + default: + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } + + if (current_cpu_data.x86_max_cores > 1) + return pfm_k8_setup_nb_event_control(); + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + /* + * NMI using PMU? 
+ * Actual removal of NMI counter is done by pfm_pmu_acquire() + */ + if (nmi_watchdog == NMI_LOCAL_APIC || force_nmi) + pfm_k8_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +static struct pfm_pmu_config pfm_k8_pmu_conf = { + .pmu_name = "AMD64", + .counter_width = 47, + .pmd_desc = pfm_k8_pmd_desc, + .pmc_desc = pfm_k8_pmc_desc, + .num_pmc_entries = PFM_AMD_NUM_PMCS, + .num_pmd_entries = PFM_AMD_NUM_PMDS, + .probe_pmu = pfm_k8_probe_pmu, + .version = "1.1", + .arch_info = &pfm_k8_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_k8_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_k8_pmu_conf); +} + +static void __exit pfm_k8_pmu_cleanup_module(void) +{ + if (pfm_nb_sys_owners) + vfree(pfm_nb_sys_owners); + + pfm_pmu_unregister(&pfm_k8_pmu_conf); +} + +module_init(pfm_k8_pmu_init_module); +module_exit(pfm_k8_pmu_cleanup_module); Index: linux-2.6/drivers/oprofile/oprofile_files.c =================================================================== --- linux-2.6.orig/drivers/oprofile/oprofile_files.c +++ linux-2.6/drivers/oprofile/oprofile_files.c @@ -117,7 +117,17 @@ static ssize_t dump_write(struct file * static const struct file_operations dump_fops = { .write = dump_write, }; - + +static ssize_t implementation(struct file * file, char __user * buf, size_t count, loff_t * offset) +{ + return oprofilefs_str_to_user(oprofile_ops.implementation, buf, count, offset); +} + + +static struct file_operations implementation_fops = { + .read = implementation, +}; + void oprofile_create_files(struct super_block * sb, struct dentry * root) { oprofilefs_create_file(sb, root, "enable", &enable_fops); @@ -127,6 +137,7 @@ void oprofile_create_files(struct super_ oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &fs_cpu_buffer_size); oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); + oprofilefs_create_file(sb, root, "implementation", &implementation_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", &pointer_size_fops); oprofile_create_stats_files(sb, root); Index: linux-2.6/drivers/oprofile/timer_int.c =================================================================== --- linux-2.6.orig/drivers/oprofile/timer_int.c +++ linux-2.6/drivers/oprofile/timer_int.c @@ -43,4 +43,5 @@ void __init oprofile_timer_init(struct o ops->start = timer_start; ops->stop = timer_stop; ops->cpu_type = "timer"; + ops->implementation = "timer"; } Index: linux-2.6/include/asm-i386/mach-default/entry_arch.h =================================================================== --- linux-2.6.orig/include/asm-i386/mach-default/entry_arch.h +++ linux-2.6/include/asm-i386/mach-default/entry_arch.h @@ -31,4 +31,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURI BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif +#ifdef CONFIG_PERFMON +BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR) +#endif + #endif Index: linux-2.6/include/asm-i386/mach-default/irq_vectors.h =================================================================== --- linux-2.6.orig/include/asm-i386/mach-default/irq_vectors.h +++ linux-2.6/include/asm-i386/mach-default/irq_vectors.h @@ -56,6 +56,7 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef +#define LOCAL_PERFMON_VECTOR 0xee /* * First APIC vector available to drivers: (vectors 0x30-0xee) @@ -63,7 +64,7 @@ * levels. 
(0x80 is the syscall vector) */ #define FIRST_DEVICE_VECTOR 0x31 -#define FIRST_SYSTEM_VECTOR 0xef +#define FIRST_SYSTEM_VECTOR 0xee #define TIMER_IRQ 0 Index: linux-2.6/include/asm-i386/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon.h @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains X86 Processor Family specific definitions + * for the perfmon interface. This covers P6, Pentium M, P4/Xeon + * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_I386_PERFMON_H_ +#define _ASM_I386_PERFMON_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_4KSTACKS +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 +#else +#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */ +#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */ +#endif + +/* + * For P4: + * - bits 31 - 63 reserved + * - T1_OS and T1_USR bits are reserved - set depending on logical proc + * user mode application should use T0_OS and T0_USR to indicate + * RSVD: reserved bits must be 1 + */ +#define PFM_ESCR_RSVD ~0x000000007ffffffcULL + +/* + * bitmask for reg_type + */ +#define PFM_REGT_NA 0x0000 /* not available */ +#define PFM_REGT_EN 0x0001 /* has enable bit (cleared on ctxsw) */ +#define PFM_REGT_ESCR 0x0002 /* P4: ESCR */ +#define PFM_REGT_CCCR 0x0004 /* P4: CCCR */ +#define PFM_REGT_PEBS 0x0010 /* PEBS related */ +#define PFM_REGT_NOHT 0x0020 /* unavailable with HT */ +#define PFM_REGT_CTR 0x0040 /* counter */ +#define PFM_REGT_OTH 0x0080 /* other type of register */ + +/* + * This design and the partitioning of resources for SMT (hyper threads) + * is very static and limited due to limitations in the number of ESCRs + * and CCCRs per group. + */ +#define MAX_SMT_ID 1 + +/* + * For extended register information in addition to address that is used + * at runtime to figure out the mapping of reg addresses to logical procs + * and association of registers to hardware specific features + */ +struct pfm_arch_ext_reg { + /* + * one each for the logical CPUs. Index 0 corresponds to T0 and + * index 1 corresponds to T1. Index 1 can be zero if no T1 + * complement reg exists. 
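A minimal standalone illustration of how a per-thread address would be picked from such an entry. The structure is mirrored locally just to keep the example compilable; the real definition follows below, and the helper name is hypothetical.

#include <stdio.h>

#define MAX_SMT_ID 1

/* local mirror of struct pfm_arch_ext_reg, reduced to the address array */
struct ext_reg {
	unsigned long addrs[MAX_SMT_ID + 1];
};

/* hypothetical helper: use the T1 address when running on the second
 * logical processor, fall back to T0 when no complement register exists */
static unsigned long ext_reg_addr(const struct ext_reg *xreg, unsigned int smt_id)
{
	unsigned long addr = xreg->addrs[smt_id & MAX_SMT_ID];

	return addr ? addr : xreg->addrs[0];
}

int main(void)
{
	struct ext_reg escr = { { 0x3a0, 0 } };		/* example: no T1 complement */

	printf("T0 -> 0x%lx, T1 -> 0x%lx\n",
	       ext_reg_addr(&escr, 0), ext_reg_addr(&escr, 1));
	return 0;
}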
+ */ + unsigned long addrs[MAX_SMT_ID+1]; + unsigned int ctr; /* for CCCR/PERFEVTSEL, associated counter */ + unsigned int reg_type; +}; + +typedef int (*pfm_check_session_t)(struct pfm_context *ctx); + +struct pfm_arch_pmu_info { + struct pfm_arch_ext_reg pmc_addrs[PFM_MAX_PMCS]; + struct pfm_arch_ext_reg pmd_addrs[PFM_MAX_PMDS]; + u64 enable_mask[PFM_PMC_BV]; /* PMC registers with enable bit */ + u64 ovfl_reg_mask; /* relevant bits of PERF_OVFL_STATS (Core) */ + + u16 max_ena; /* highest enable bit + 1 */ + u16 flags; /* PMU feature flags */ + u16 pebs_ctr_idx; /* index of PEBS counter for overflow */ + u16 reserved; /* for future use */ + + /* + * optional callbacks invoked by pfm_arch_*load_context() + */ + int (*load_context)(struct pfm_context *ctx); + int (*unload_context)(struct pfm_context *ctx); + + u8 pmu_style; /* type of PMU interface (P4, P6, CORE) */ +}; + +/* + * X86 PMU style + */ +#define PFM_X86_PMU_P4 1 /* Intel P4/Xeon/EM64T processor PMU */ +#define PFM_X86_PMU_P6 2 /* Intel P6/Pentium M */ +#define PFM_X86_PMU_CORE 3 /* Intel Core PMU */ +#define PFM_X86_PMU_AMD64 4 /* AMD64 PMU (K8, family 10h) */ + +/* + * PMU feature flags + */ +#define PFM_X86_FL_PMU_DS 0x01 /* Intel: support for Data Save Area (DS) */ +#define PFM_X86_FL_PMU_PEBS 0x02 /* Intel: support PEBS (implies DS) */ +#define PFM_X86_FL_USE_NMI 0x04 /* must use NMI interrupt */ + +void __pfm_read_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 *val); +void __pfm_write_reg_p4(const struct pfm_arch_ext_reg *xreg, u64 val); + + +extern void pfm_arch_resend_irq(void); + +static inline void pfm_arch_serialize(void) +{} + +/* + * on x86, the PMDs are already saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we do not need + * to save them again in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * in certain situations, ctx may be NULL + */ +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + PFM_DBG_ovfl("pfm_arch_write_pmc(0x%016Lx, 0x%016Lx)", + (unsigned long long) pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) value); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_write_reg_p4(&arch_info->pmc_addrs[cnum], value); + else + wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + /* + * force upper bit set for counter to ensure overflow + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value |= ~pfm_pmu_conf->ovfl_mask; + + PFM_DBG_ovfl("pfm_arch_write_pmd(0x%016Lx, 0x%016Lx)", + (unsigned long long) pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) value); + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_write_reg_p4(&arch_info->pmd_addrs[cnum], value); + else + wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 tmp; + + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_read_reg_p4(&arch_info->pmd_addrs[cnum], &tmp); + else + 
rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); + PFM_DBG_ovfl("pfm_arch_read_pmd(0x%016Lx) = 0x%016Lx", + (unsigned long long) pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + u64 tmp; + if (arch_info->pmu_style == PFM_X86_PMU_P4) + __pfm_read_reg_p4(&arch_info->pmc_addrs[cnum], &tmp); + else + rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); + PFM_DBG_ovfl("pfm_arch_read_pmc(0x%016Lx) = 0x%016Lx", + (unsigned long long) pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function is used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{} + +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_stop(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx); +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); +void pfm_arch_pmu_config_remove(void); +char *pfm_arch_get_pmu_module_name(void); + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info; + int ret = 0; + + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->unload_context) { + ret = arch_info->unload_context(ctx); + } + return ret; +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info; + int ret = 0; + + arch_info = pfm_pmu_conf->arch_info; + if (arch_info->load_context) { + ret = arch_info->load_context(ctx); + } + return ret; +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On x86, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. 
+ */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on x86 masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. interrupts are masked + * + * The following actions must take place: + * - stop all monitoring to ensure handler has consistent view. + * - collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * on X86, freezing is equivalent to stopping + */ + pfm_arch_stop(current, ctx, set); + + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() and + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +int pfm_arch_pmu_acquire(void); +void pfm_arch_pmu_release(void); + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + * + * x86: The current version loses whatever is remaining in the counter, + * which is usually has a small count. In order not to loose this count, + * we do a read-modify-write to set the upper bits while preserving the + * low-order bits. This is slow but works. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * not used for i386/x86_64 + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +/* + * architecture specific context extension. 
+ * located at: (struct pfm_arch_context *)(ctx+1) + */ + +struct pfm_arch_p4_context { + u32 npend_ovfls; /* P4 NMI #pending ovfls */ + u32 reserved; + u64 povfl_pmds[PFM_HW_PMD_BV]; /* P4 NMI overflowed counters */ + u64 saved_cccrs[PFM_MAX_PMCS]; +}; + +struct pfm_arch_context { + u64 saved_real_iip; /* instr pointer of last NMI intr (ctxsw) */ + u32 flags; /* arch-specific flags */ + unsigned long ds_area; /* address of DS management area */ + struct pfm_arch_p4_context *p4; /* P4 specific state */ +}; + +static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + struct pfm_arch_context *ctx_arch; + + if (arch_info->pmu_style != PFM_X86_PMU_P4) + return 0; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->p4 = kzalloc(sizeof(*(ctx_arch->p4)), GFP_KERNEL); + if (!ctx_arch->p4) + return -ENOMEM; + + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * we do not check if P4, because it would be NULL and + * kfree can deal with NULL + */ + kfree(ctx_arch->p4); +} + + +/* + * pfm_arch_context flags + */ +#define PFM_X86_USE_PEBS 0x1 /* context is using PEBS */ +#define PFM_X86_USE_BTS 0x2 /* context is using BTS */ +#define PFM_X86_USE_DS (PFM_X86_USE_PEBS|PFM_X86_USE_BTS) + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) + +asmlinkage void pmu_interrupt(void); + +#endif /* __KERNEL__ */ + +#endif /* _ASM_I386_PERFMON_H_ */ Index: linux-2.6/include/asm-i386/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon_api.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains x86-64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_I386_PERFMON_API_H_ +#define _ASM_I386_PERFMON_API_H_ +/* + * both i386 and x86-64 maximums MUST be identical to ensure ABI + * compatibility + */ +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ + +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +#endif /* _ASM_I386_PERFMON_API_H_ */ Index: linux-2.6/include/asm-i386/perfmon_pebs_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-i386/perfmon_pebs_smpl.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * This file implements the sampling format to support Intel + * Precise Event Based Sampling (PEBS) feature of Pentium 4 and + * Intel Core-based processors. + * + * What is PEBS? + * ------------ + * This is a hardware feature to enhance sampling by providing + * better precision as to where a sample is taken. This avoids the + * typical skew in the instruction one can observe with any + * interrupt-based sampling technique. + * + * PEBS also lowers sampling overhead significantly by having the + * processor store samples instead of the OS. PMU interrupt are only + * generated after multiple samples are written. + * + * Another benefit of PEBS is that samples can be captured inside + * critical sections where interrupts are masked. + * + * How does it work? + * PEBS effectively implements a Hw buffer. The Os must pass a region + * of memory where samples are to be stored. The region can have any + * size. The OS must also specify the sampling period to reload. The PMU + * will interrupt when it reaches the end of the buffer or a specified + * threshold location inside the memory region. + * + * The description of the buffer is stored in the Data Save Area (DS). + * The samples are stored sequentially in the buffer. The format of the + * buffer is fixed and specified in the PEBS documentation. The sample + * format changes between 32-bit and 64-bit modes due to extended register + * file. + * + * PEBS does not work when HyperThreading is enabled due to certain MSR + * being shared being to two threads. + * + * What does the format do? 
+ * It provides access to the PEBS feature for both 32-bit and 64-bit + * processors that support it. + * + * The same code is used for both 32-bit and 64-bit mode, but different + * format names are used because the two modes are not compatible due to + * data model and register file differences. Similarly the public data + * structures describing the samples are different. + * + * It is important to realize that the format provide a zero-copy environment + * for the samples, i.e,, the OS never touches the samples. Whatever the + * processor write is directly accessible to the user. + * + * Parameters to the buffer can be passed via pfm_create_context() in + * the pfm_pebs_smpl_arg structure. + * + * It is not possible to mix a 32-bit PEBS application on top of a 64-bit + * host kernel. + */ +#ifndef __PERFMON_PEBS_SMPL_H__ +#define __PERFMON_PEBS_SMPL_H__ 1 + +#ifdef __i386__ +/* + * The 32-bit and 64-bit formats are not compatible, thus we have + * two different identifications so that 32-bit programs running on + * 64-bit OS will fail to use the 64-bit PEBS support. + */ +#define PFM_PEBS_SMPL_NAME "pebs32" +#else +#define PFM_PEBS_SMPL_NAME "pebs64" +#endif + +/* + * format specific parameters (passed at context creation) + * + * intr_thres: index from start of buffer of entry where the + * PMU interrupt must be triggered. It must be several samples + * short of the end of the buffer. + */ +struct pfm_pebs_smpl_arg { + u64 cnt_reset; /* counter reset value */ + size_t buf_size; /* size of the PEBS buffer in bytes */ + size_t intr_thres;/* index of PEBS interrupt threshold entry */ + u64 reserved[6]; /* for future use */ +}; + +/* + * Data Save Area (32 and 64-bit mode) + * + * The DS area must be exposed to the user because this is the only + * way to report on the number of valid entries recorded by the CPU. + * This is required when the buffer is not full, i..e, there was not + * PMU interrupt. + * + * Layout of the structure is mandated by hardware and specified in + * the Intel documentation. + */ +struct pfm_ds_area { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * + * Because of PEBS alignement constraints, the actual PEBS buffer area does + * not necessarily begin right after the header. The hdr_start_offs must be + * used to compute the first byte of the buffer. The offset is defined as + * the number of bytes between the end of the header and the beginning of + * the buffer. As such the formula is: + * actual_buffer = (unsigned long)(hdr+1)+hdr->hdr_start_offs + */ +struct pfm_pebs_smpl_hdr { + u64 overflows; /* #overflows for buffer */ + size_t buf_size; /* bytes in the buffer */ + size_t start_offs; /* actual buffer start offset */ + u32 version; /* smpl format version */ + u32 reserved1; /* for future use */ + u64 reserved2[5]; /* for future use */ + struct pfm_ds_area ds; /* data save area */ +}; + +/* + * 64-bit PEBS record format is described in + * http://www.intel.com/technology/64bitextensions/30083502.pdf + * + * The format does not peek at samples. The sample structure is only + * used to ensure that the buffer is large enough to accomodate one + * sample. 
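Assuming the header and DS-area definitions above are visible (for example by including this file), a hedged sketch of how a monitoring tool locates and counts the valid samples. Because the kernel never touches the samples, this is essentially all the consumer needs.

#include <stddef.h>

/* first sample in the user mapping: just past the header, plus the
 * alignment offset recorded by the kernel in start_offs */
static const struct pfm_pebs_smpl_entry *
pebs_first_entry(const struct pfm_pebs_smpl_hdr *hdr)
{
	return (const struct pfm_pebs_smpl_entry *)
		((const char *)(hdr + 1) + hdr->start_offs);
}

/* number of valid entries: pebs_index points at the next free slot, so its
 * distance from the buffer base (both taken from the DS area) gives the
 * number of bytes written by the processor */
static size_t pebs_valid_entries(const struct pfm_pebs_smpl_hdr *hdr)
{
	return (hdr->ds.pebs_index - hdr->ds.pebs_buf_base) /
		sizeof(struct pfm_pebs_smpl_entry);
}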
+ */ +#ifdef __i386__ +struct pfm_pebs_smpl_entry { + u32 eflags; + u32 ip; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 esp; +}; +#else +struct pfm_pebs_smpl_entry { + u64 eflags; + u64 ip; + u64 eax; + u64 ebx; + u64 ecx; + u64 edx; + u64 esi; + u64 edi; + u64 ebp; + u64 esp; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; +#endif + +#define PFM_PEBS_SMPL_VERSION_MAJ 1U +#define PFM_PEBS_SMPL_VERSION_MIN 0U +#define PFM_PEBS_SMPL_VERSION (((PFM_PEBS_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_PEBS_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_PEBS_SMPL_H__ */ Index: linux-2.6/include/asm-i386/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-i386/thread_info.h +++ linux-2.6/include/asm-i386/thread_info.h @@ -160,7 +160,7 @@ static inline struct thread_info *curren #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP) /* flags to check in __switch_to() */ -#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP) +#define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP|_TIF_PERFMON_CTXSW) /* * Thread-synchronous status. Index: linux-2.6/include/asm-i386/unistd.h =================================================================== --- linux-2.6.orig/include/asm-i386/unistd.h +++ linux-2.6/include/asm-i386/unistd.h @@ -329,10 +329,22 @@ #define __NR_signalfd 321 #define __NR_timerfd 322 #define __NR_eventfd 323 +#define __NR_pfm_create_context 324 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define NR_syscalls 324 +#define NR_syscalls 335 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR Index: linux-2.6/include/asm-ia64/hw_irq.h =================================================================== --- linux-2.6.orig/include/asm-ia64/hw_irq.h +++ linux-2.6/include/asm-ia64/hw_irq.h @@ -63,9 +63,9 @@ extern int ia64_last_device_vector; #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1) #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */ -#define IA64_PERFMON_VECTOR 0xee /* performanc monitor interrupt vector */ #define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */ #define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */ +#define IA64_PERFMON_VECTOR 0xf1 /* performance monitor interrupt vector */ #define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */ #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */ #define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */ Index: linux-2.6/include/asm-ia64/perfmon.h =================================================================== --- linux-2.6.orig/include/asm-ia64/perfmon.h +++ linux-2.6/include/asm-ia64/perfmon.h @@ -1,279 +1,302 @@ /* - * Copyright (C) 2001-2003 Hewlett-Packard Co - * Stephane Eranian - */ - -#ifndef _ASM_IA64_PERFMON_H -#define _ASM_IA64_PERFMON_H - -/* - * perfmon 
comamnds supported on all CPU models - */ -#define PFM_WRITE_PMCS 0x01 -#define PFM_WRITE_PMDS 0x02 -#define PFM_READ_PMDS 0x03 -#define PFM_STOP 0x04 -#define PFM_START 0x05 -#define PFM_ENABLE 0x06 /* obsolete */ -#define PFM_DISABLE 0x07 /* obsolete */ -#define PFM_CREATE_CONTEXT 0x08 -#define PFM_DESTROY_CONTEXT 0x09 /* obsolete use close() */ -#define PFM_RESTART 0x0a -#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ -#define PFM_GET_FEATURES 0x0c -#define PFM_DEBUG 0x0d -#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ -#define PFM_GET_PMC_RESET_VAL 0x0f -#define PFM_LOAD_CONTEXT 0x10 -#define PFM_UNLOAD_CONTEXT 0x11 + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_IA64_PERFMON_H_ +#define _ASM_IA64_PERFMON_H_ +#ifdef __KERNEL__ /* - * PMU model specific commands (may not be supported on all PMU models) + * compatibility for previous versions of the interface */ -#define PFM_WRITE_IBRS 0x20 -#define PFM_WRITE_DBRS 0x21 +#include -/* - * context flags - */ -#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ -#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ -#define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ +#define PFM_ARCH_PMD_STK_ARG 8 +#define PFM_ARCH_PMC_STK_ARG 8 -/* - * event set flags - */ -#define PFM_SETFL_EXCL_IDLE 0x01 /* exclude idle task (syswide only) XXX: DO NOT USE YET */ +#include /* - * PMC flags + * describe the content of the pfm_syst_info field + * layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags */ -#define PFM_REGFL_OVFL_NOTIFY 0x1 /* send notification on overflow */ -#define PFM_REGFL_RANDOM 0x2 /* randomize sampling interval */ +#define PFM_ITA_CPUINFO_IDLE_EXCL 0x10000 /* stop monitoring in idle loop */ /* - * PMD/PMC/IBR/DBR return flags (ignored on input) - * - * Those flags are used on output and must be checked in case EAGAIN is returned - * by any of the calls using a pfarg_reg_t or pfarg_dbreg_t structure. + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. 
*/ -#define PFM_REG_RETFL_NOTAVAIL (1UL<<31) /* set if register is implemented but not available */ -#define PFM_REG_RETFL_EINVAL (1UL<<30) /* set if register entry is invalid */ -#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|PFM_REG_RETFL_EINVAL) +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{} -#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} -typedef unsigned char pfm_uuid_t[16]; /* custom sampling buffer identifier type */ +static inline void pfm_arch_pmu_config_remove(void) +{} /* - * Request structure used to define a context - */ -typedef struct { - pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use (if needed) */ - unsigned long ctx_flags; /* noblock/block */ - unsigned short ctx_nextra_sets; /* number of extra event sets (you always get 1) */ - unsigned short ctx_reserved1; /* for future use */ - int ctx_fd; /* return arg: unique identification for context */ - void *ctx_smpl_vaddr; /* return arg: virtual address of sampling buffer, is used */ - unsigned long ctx_reserved2[11];/* for future use */ -} pfarg_context_t; + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 tmp; + + /* + * do not overwrite existing value, must + * process those first (coming from context switch replay) + */ + if (set->npend_ovfls) + return; + + ia64_srlz_d(); + + tmp = ia64_get_pmc(0) & ~0xf; + + set->povfl_pmds[0] = tmp; + + set->npend_ovfls = ia64_popcnt(tmp); +} + +static inline int pfm_arch_init_pmu_config(void) +{ + return 0; +} + +static inline void pfm_arch_resend_irq(void) +{ + ia64_resend_irq(IA64_PERFMON_VECTOR); +} + +static inline void pfm_arch_serialize(void) +{ + ia64_srlz_d(); +} + +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + PFM_DBG_ovfl("state=%d", ctx->state); + ia64_set_pmc(0, 0); + /* no serialization */ +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + if (cnum < 256) { + ia64_set_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); + } else if (cnum < 264) { + ia64_set_ibr(cnum-256, value); + ia64_dv_serialize_instruction(); + } else { + ia64_set_dbr(cnum-264, value); + ia64_dv_serialize_instruction(); + } +} + +/* + * On IA-64, for per-thread context which have the ITA_FL_INSECURE + * flag, it is possible to start/stop monitoring directly from user evel + * without calling pfm_start()/pfm_stop. This allows very lightweight + * control yet the kernel sometimes needs to know if monitoring is actually + * on or off. + * + * Tracking of this information is normally done by pfm_start/pfm_stop + * in flags.started. Here we need to compensate by checking actual + * psr bit. 
+ */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started || ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_UP|IA64_PSR_PP); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + /* + * for a counting PMD, overflow bit must be cleared + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value &= pfm_pmu_conf->ovfl_mask; + + /* + * for counters, write to upper bits are ignored, no need to mask + */ + ia64_set_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr); +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr); +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pt_regs *regs; + + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 0; +} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pt_regs *regs; + + if (!(set->flags & PFM_ITA_SETFL_INTR_ONLY)) { + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 1; + } +} + +/* + * On IA-64, the PMDs are NOT saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we need + * to save them in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags); + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_arch_unload_context(struct pfm_context *ctx, struct task_struct *task); +int pfm_arch_load_context(struct pfm_context *ctx, struct pfm_event_set *set, + struct task_struct *task); +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags); + +void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_arch_init(void); +void pfm_arch_init_percpu(void); +char *pfm_arch_get_pmu_module_name(void); + +int __pfm_use_dbregs(struct task_struct *task); +int __pfm_release_dbregs(struct task_struct *task); +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_show_session(struct seq_file *m); + +static inline int pfm_arch_pmu_acquire(void) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} /* - * Request structure used to write/read a PMC or PMD + * miscellaneous architected definitions */ -typedef struct { - unsigned int reg_num; /* which register */ - unsigned short reg_set; /* event set for this register */ - unsigned short reg_reserved1; /* for future use 
*/ - - unsigned long reg_value; /* initial pmc/pmd value */ - unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ - - unsigned long reg_long_reset; /* reset after buffer overflow notification */ - unsigned long reg_short_reset; /* reset after counter overflow */ - - unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ - unsigned long reg_random_seed; /* seed value when randomization is used */ - unsigned long reg_random_mask; /* bitmask used to limit random value */ - unsigned long reg_last_reset_val;/* return: PMD last reset value */ - - unsigned long reg_smpl_pmds[4]; /* which pmds are accessed when PMC overflows */ - unsigned long reg_smpl_eventid; /* opaque sampling event identifier */ - - unsigned long reg_reserved2[3]; /* for future use */ -} pfarg_reg_t; - -typedef struct { - unsigned int dbreg_num; /* which debug register */ - unsigned short dbreg_set; /* event set for this register */ - unsigned short dbreg_reserved1; /* for future use */ - unsigned long dbreg_value; /* value for debug register */ - unsigned long dbreg_flags; /* return: dbreg error */ - unsigned long dbreg_reserved2[1]; /* for future use */ -} pfarg_dbreg_t; - -typedef struct { - unsigned int ft_version; /* perfmon: major [16-31], minor [0-15] */ - unsigned int ft_reserved; /* reserved for future use */ - unsigned long reserved[4]; /* for future use */ -} pfarg_features_t; - -typedef struct { - pid_t load_pid; /* process to load the context into */ - unsigned short load_set; /* first event set to load */ - unsigned short load_reserved1; /* for future use */ - unsigned long load_reserved2[3]; /* for future use */ -} pfarg_load_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ - unsigned short msg_active_set; /* active set at the time of overflow */ - unsigned short msg_reserved1; /* for future use */ - unsigned int msg_reserved2; /* for future use */ - unsigned long msg_tstamp; /* for perf tuning/debug */ -} pfm_ovfl_msg_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_end_msg_t; - -typedef struct { - int msg_type; /* type of the message */ - int msg_ctx_fd; /* unique identifier for the context */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_gen_msg_t; - -#define PFM_MSG_OVFL 1 /* an overflow happened */ -#define PFM_MSG_END 2 /* task to which context was attached ended */ - -typedef union { - pfm_ovfl_msg_t pfm_ovfl_msg; - pfm_end_msg_t pfm_end_msg; - pfm_gen_msg_t pfm_gen_msg; -} pfm_msg_t; +#define PFM_ITA_FCNTR 4 /* first counting monitor (PMC/PMD) */ /* - * Define the version numbers for both perfmon as a whole and the sampling buffer format. 
+ * private event set flags (set_priv_flags) */ -#define PFM_VERSION_MAJ 2U -#define PFM_VERSION_MIN 0U -#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) -#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) -#define PFM_VERSION_MINOR(x) ((x) & 0xffff) +#define PFM_ITA_SETFL_USE_DBR 0x1000000 /* set uses debug registers */ /* - * miscellaneous architected definitions + * Itanium-specific data structures */ -#define PMU_FIRST_COUNTER 4 /* first counting monitor (PMC/PMD) */ -#define PMU_MAX_PMCS 256 /* maximum architected number of PMC registers */ -#define PMU_MAX_PMDS 256 /* maximum architected number of PMD registers */ +struct pfm_ia64_context_flags { + unsigned int use_dbr:1; /* use range restrictions (debug registers) */ + unsigned int insecure:1; /* insecure monitoring for non-self session */ + unsigned int reserved:30;/* for future use */ +}; -#ifdef __KERNEL__ +struct pfm_arch_context { + struct pfm_ia64_context_flags flags; /* arch specific ctx flags */ + u64 ctx_saved_psr_up;/* storage for ctxsw psr_up */ +#ifdef CONFIG_IA64_PERFMON_COMPAT + void *ctx_smpl_vaddr; /* vaddr of user mapping */ +#endif +}; -extern long perfmonctl(int fd, int cmd, void *arg, int narg); +#ifdef CONFIG_IA64_PERFMON_COMPAT +int pfm_ia64_compat_init(void); +int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp); +#else +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} +#endif -typedef struct { - void (*handler)(int irq, void *arg, struct pt_regs *regs); -} pfm_intr_handler_desc_t; - -extern void pfm_save_regs (struct task_struct *); -extern void pfm_load_regs (struct task_struct *); - -extern void pfm_exit_thread(struct task_struct *); -extern int pfm_use_debug_registers(struct task_struct *); -extern int pfm_release_debug_registers(struct task_struct *); -extern void pfm_syst_wide_update_task(struct task_struct *, unsigned long info, int is_ctxswin); -extern void pfm_inherit(struct task_struct *task, struct pt_regs *regs); -extern void pfm_init_percpu(void); -extern void pfm_handle_work(void); -extern int pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); -extern int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); +extern struct pfm_ia64_pmu_info *pfm_ia64_pmu_info; +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) -/* - * Reset PMD register flags - */ -#define PFM_PMD_SHORT_RESET 0 -#define PFM_PMD_LONG_RESET 1 - -typedef union { - unsigned int val; - struct { - unsigned int notify_user:1; /* notify user program of overflow */ - unsigned int reset_ovfl_pmds:1; /* reset overflowed PMDs */ - unsigned int block_task:1; /* block monitored task on kernel exit */ - unsigned int mask_monitoring:1; /* mask monitors via PMCx.plm */ - unsigned int reserved:28; /* for future use */ - } bits; -} pfm_ovfl_ctrl_t; - -typedef struct { - unsigned char ovfl_pmd; /* index of overflowed PMD */ - unsigned char ovfl_notify; /* =1 if monitor requested overflow notification */ - unsigned short active_set; /* event set active at the time of the overflow */ - pfm_ovfl_ctrl_t ovfl_ctrl; /* return: perfmon controls to set by handler */ - - unsigned long pmd_last_reset; /* last reset value of of the PMD */ - unsigned long smpl_pmds[4]; /* bitmask of other PMD of interest on overflow */ - unsigned long smpl_pmds_values[PMU_MAX_PMDS]; /* values for the other PMDs of interest */ - unsigned long pmd_value; /* current 64-bit value of the PMD */ - unsigned long 
pmd_eventid; /* eventid associated with PMD */ -} pfm_ovfl_arg_t; - - -typedef struct { - char *fmt_name; - pfm_uuid_t fmt_uuid; - size_t fmt_arg_size; - unsigned long fmt_flags; - - int (*fmt_validate)(struct task_struct *task, unsigned int flags, int cpu, void *arg); - int (*fmt_getsize)(struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size); - int (*fmt_init)(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *arg); - int (*fmt_handler)(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp); - int (*fmt_restart)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); +static inline void pfm_release_dbregs(struct task_struct *task) +{ + if (task->thread.flags & IA64_THREAD_DBG_VALID) + __pfm_release_dbregs(task); +} - struct list_head fmt_list; -} pfm_buffer_fmt_t; +#define pfm_use_dbregs(_t) __pfm_use_dbregs(_t) -extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); -extern int pfm_unregister_buffer_fmt(pfm_uuid_t uuid); - -/* - * perfmon interface exported to modules - */ -extern int pfm_mod_read_pmds(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_pmcs(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); - -/* - * describe the content of the local_cpu_date->pfm_syst_info field - */ -#define PFM_CPUINFO_SYST_WIDE 0x1 /* if set a system wide session exists */ -#define PFM_CPUINFO_DCR_PP 0x2 /* if set the system wide session has started */ -#define PFM_CPUINFO_EXCL_IDLE 0x4 /* the system wide session excludes the idle task */ - -/* - * sysctl control structure. visible to sampling formats - */ -typedef struct { - int debug; /* turn on/off debugging via syslog */ - int debug_ovfl; /* turn on/off debug printk in overflow handler */ - int fastctxsw; /* turn on/off fast (unsecure) ctxsw */ - int expert_mode; /* turn on/off value checking */ -} pfm_sysctl_t; -extern pfm_sysctl_t pfm_sysctl; +struct pfm_arch_pmu_info { + unsigned long mask_pmcs[PFM_PMC_BV]; /* PMC to modify when masking monitoring */ +}; #endif /* __KERNEL__ */ - -#endif /* _ASM_IA64_PERFMON_H */ +#endif /* _ASM_IA64_PERFMON_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-ia64/perfmon_api.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_IA64_PERFMON_API_H_ +#define _ASM_IA64_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* architecture-defined max PMCS */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* architecture-defined max PMDS */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +/* + * Itanium specific context flags + */ +#define PFM_ITA_FL_INSECURE 0x10000 /* force psr.sp=0 for non self-monitoring */ + +/* + * Itanium specific public event set flags (set_flags) + * + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags + */ +#define PFM_ITA_SETFL_EXCL_INTR 0x10000 /* exclude interrupt execution */ +#define PFM_ITA_SETFL_INTR_ONLY 0x20000 /* include only interrupt execution */ +#define PFM_ITA_SETFL_IDLE_EXCL 0x40000 /* stop monitoring in idle loop */ + +#endif /* _ASM_IA64_PERFMON_API_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_compat.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-ia64/perfmon_compat.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This header file contains perfmon interface definition + * that are now obsolete and should be dropped in favor + * of their equivalent functions as explained below. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef _ASM_IA64_PERFMON_COMPAT_H_ +#define _ASM_IA64_PERFMON_COMPAT_H_ + +/* + * custom sampling buffer identifier type + */ +typedef __u8 pfm_uuid_t[16]; + +/* + * obsolete perfmon commands. Supported only on IA-64 for + * backward compatiblity reasons with perfmon v2.0. 
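For illustration only (not part of the patch): the obsolete commands listed below map onto dedicated perfmon2 system calls. A minimal user-level sketch of programming one PMC the old way and the new way, assuming a libpfm-style header and wrapper prototypes for the new calls:

	/* illustration only -- wrapper names and header path are assumed */
	#include <perfmon/perfmon.h>

	static void program_pmc4(int ctx_fd)
	{
		struct pfarg_reg pc_old = { .reg_num = 4, .reg_value = 0 };
		struct pfarg_pmc pc_new = { .reg_num = 4, .reg_value = 0 };

		/* perfmon v2.0 (IA-64 only): one multiplexed entry point */
		perfmonctl(ctx_fd, PFM_WRITE_PMCS, &pc_old, 1);

		/* perfmon2: dedicated system call, new argument layout */
		pfm_write_pmcs(ctx_fd, &pc_new, 1);
	}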
+ */ +#define PFM_WRITE_PMCS 0x01 /* use pfm_write_pmcs */ +#define PFM_WRITE_PMDS 0x02 /* use pfm_write_pmds */ +#define PFM_READ_PMDS 0x03 /* use pfm_read_pmds */ +#define PFM_STOP 0x04 /* use pfm_stop */ +#define PFM_START 0x05 /* use pfm_start */ +#define PFM_ENABLE 0x06 /* obsolete */ +#define PFM_DISABLE 0x07 /* obsolete */ +#define PFM_CREATE_CONTEXT 0x08 /* use pfm_create_context */ +#define PFM_DESTROY_CONTEXT 0x09 /* use close() */ +#define PFM_RESTART 0x0a /* use pfm_restart */ +#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ +#define PFM_GET_FEATURES 0x0c /* use /proc/sys/perfmon */ +#define PFM_DEBUG 0x0d /* /proc/sys/kernel/perfmon/debug */ +#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ +#define PFM_GET_PMC_RESET_VAL 0x0f /* use /proc/perfmon_map */ +#define PFM_LOAD_CONTEXT 0x10 /* use pfm_load_context */ +#define PFM_UNLOAD_CONTEXT 0x11 /* use pfm_unload_context */ + +/* + * PMU model specific commands (may not be supported on all PMU models) + */ +#define PFM_WRITE_IBRS 0x20 /* obsolete: use PFM_WRITE_PMCS[256-263] */ +#define PFM_WRITE_DBRS 0x21 /* obsolete: use PFM_WRITE_PMCS[264-271] */ + +/* + * argument to PFM_CREATE_CONTEXT + */ +struct pfarg_context { + pfm_uuid_t ctx_smpl_buf_id; /* buffer format to use */ + unsigned long ctx_flags; /* noblock/block */ + unsigned int ctx_reserved1; /* for future use */ + int ctx_fd; /* return: fildesc */ + void *ctx_smpl_vaddr; /* return: vaddr of buffer */ + unsigned long ctx_reserved3[11];/* for future use */ +}; + +/* + * argument structure for PFM_WRITE_PMCS/PFM_WRITE_PMDS/PFM_WRITE_PMDS + */ +struct pfarg_reg { + unsigned int reg_num; /* which register */ + unsigned short reg_set; /* event set for this register */ + unsigned short reg_reserved1; /* for future use */ + + unsigned long reg_value; /* initial pmc/pmd value */ + unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ + + unsigned long reg_long_reset; /* reset after buffer overflow notification */ + unsigned long reg_short_reset; /* reset after counter overflow */ + + unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ + unsigned long reg_random_seed; /* seed for randomization */ + unsigned long reg_random_mask; /* random range limit */ + unsigned long reg_last_reset_val;/* return: PMD last reset value */ + + unsigned long reg_smpl_pmds[4]; /* pmds to be saved on overflow */ + unsigned long reg_smpl_eventid; /* opaque sampling event id */ + unsigned long reg_ovfl_switch_cnt;/* #overflows to switch */ + + unsigned long reg_reserved2[2]; /* for future use */ +}; + +/* + * argument to PFM_WRITE_IBRS/PFM_WRITE_DBRS + */ +struct pfarg_dbreg { + unsigned int dbreg_num; /* which debug register */ + unsigned short dbreg_set; /* event set */ + unsigned short dbreg_reserved1; /* for future use */ + unsigned long dbreg_value; /* value for debug register */ + unsigned long dbreg_flags; /* return: dbreg error */ + unsigned long dbreg_reserved2[1]; /* for future use */ +}; + +/* + * argument to PFM_GET_FEATURES + */ +struct pfarg_features { + unsigned int ft_version; /* major [16-31], minor [0-15] */ + unsigned int ft_reserved; /* reserved for future use */ + unsigned long reserved[4]; /* for future use */ +}; + +#endif /* _ASM_IA64_PERFMON_COMPAT_H_ */ Index: linux-2.6/include/asm-ia64/perfmon_default_smpl.h =================================================================== --- linux-2.6.orig/include/asm-ia64/perfmon_default_smpl.h +++ linux-2.6/include/asm-ia64/perfmon_default_smpl.h @@ -1,83 +1,106 @@ /* - * Copyright (C) 
2002-2003 Hewlett-Packard Co - * Stephane Eranian + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian * - * This file implements the default sampling buffer format - * for Linux/ia64 perfmon subsystem. - */ -#ifndef __PERFMON_DEFAULT_SMPL_H__ -#define __PERFMON_DEFAULT_SMPL_H__ 1 + * This file implements the old default sampling buffer format + * for the perfmon2 subsystem. For IA-64 only. + * + * It requires the use of the perfmon_compat.h header. It is recommended + * that applications be ported to the new format instead. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ +#define __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ 1 + +#ifndef __ia64__ +#error "this file must be used for compatibility reasons only on IA-64" +#endif #define PFM_DEFAULT_SMPL_UUID { \ - 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82, 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} /* * format specific parameters (passed at context creation) */ -typedef struct { +struct pfm_default_smpl_arg { unsigned long buf_size; /* size of the buffer in bytes */ unsigned int flags; /* buffer specific flags */ unsigned int res1; /* for future use */ unsigned long reserved[2]; /* for future use */ -} pfm_default_smpl_arg_t; +}; /* * combined context+format specific structure. Can be passed - * to PFM_CONTEXT_CREATE + * to PFM_CONTEXT_CREATE (not PFM_CONTEXT_CREATE2) */ -typedef struct { - pfarg_context_t ctx_arg; - pfm_default_smpl_arg_t buf_arg; -} pfm_default_smpl_ctx_arg_t; +struct pfm_default_smpl_ctx_arg { + struct pfarg_context ctx_arg; + struct pfm_default_smpl_arg buf_arg; +}; /* * This header is at the beginning of the sampling buffer returned to the user. * It is directly followed by the first record. 
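As an aside (illustration only, not part of the patch): user-level code walks this buffer using the header and entry structures defined just below. A minimal sketch, assuming no extra smpl_pmds were requested so that every record has a fixed size:

	static void walk_buffer(void *smpl_vaddr)
	{
		struct pfm_default_smpl_hdr *hdr = smpl_vaddr;
		struct pfm_default_smpl_entry *ent = (void *)(hdr + 1);
		u64 i;

		for (i = 0; i < hdr->hdr_count; i++) {
			/* ent->ip: interrupted IP, ent->ovfl_pmd: overflowed PMD */
			ent++;	/* fixed-size records only (no extra smpl_pmds recorded) */
		}
	}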
*/ -typedef struct { - unsigned long hdr_count; /* how many valid entries */ - unsigned long hdr_cur_offs; /* current offset from top of buffer */ - unsigned long hdr_reserved2; /* reserved for future use */ - - unsigned long hdr_overflows; /* how many times the buffer overflowed */ - unsigned long hdr_buf_size; /* how many bytes in the buffer */ - - unsigned int hdr_version; /* contains perfmon version (smpl format diffs) */ - unsigned int hdr_reserved1; /* for future use */ - unsigned long hdr_reserved[10]; /* for future use */ -} pfm_default_smpl_hdr_t; +struct pfm_default_smpl_hdr { + u64 hdr_count; /* how many valid entries */ + u64 hdr_cur_offs; /* current offset from top of buffer */ + u64 dr_reserved2; /* reserved for future use */ + + u64 hdr_overflows; /* how many times the buffer overflowed */ + u64 hdr_buf_size; /* how many bytes in the buffer */ + + u32 hdr_version; /* smpl format version*/ + u32 hdr_reserved1; /* for future use */ + u64 hdr_reserved[10]; /* for future use */ +}; /* * Entry header in the sampling buffer. The header is directly followed - * with the values of the PMD registers of interest saved in increasing - * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends * on how the session was programmed. * * In the case where multiple counters overflow at the same time, multiple * entries are written consecutively. * - * last_reset_value member indicates the initial value of the overflowed PMD. + * last_reset_value member indicates the initial value of the overflowed PMD. */ -typedef struct { - int pid; /* thread id (for NPTL, this is gettid()) */ - unsigned char reserved1[3]; /* reserved for future use */ - unsigned char ovfl_pmd; /* index of overflowed PMD */ - - unsigned long last_reset_val; /* initial value of overflowed PMD */ - unsigned long ip; /* where did the overflow interrupt happened */ - unsigned long tstamp; /* ar.itc when entering perfmon intr. 
handler */
-
-	unsigned short cpu;		/* cpu on which the overfow occured */
-	unsigned short set;		/* event set active when overflow ocurred */
-	int tgid;			/* thread group id (for NPTL, this is getpid()) */
-} pfm_default_smpl_entry_t;
-
-#define PFM_DEFAULT_MAX_PMDS	64 /* how many pmds supported by data structures (sizeof(unsigned long) */
-#define PFM_DEFAULT_MAX_ENTRY_SIZE	(sizeof(pfm_default_smpl_entry_t)+(sizeof(unsigned long)*PFM_DEFAULT_MAX_PMDS))
-#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE	(sizeof(pfm_default_smpl_hdr_t)+PFM_DEFAULT_MAX_ENTRY_SIZE)
+struct pfm_default_smpl_entry {
+	pid_t		pid;		/* thread id (for NPTL, this is gettid()) */
+	uint8_t		reserved1[3];	/* for future use */
+	uint8_t		ovfl_pmd;	/* overflow pmd for this sample */
+	u64		last_reset_val;	/* initial value of overflowed PMD */
+	unsigned long	ip;		/* where the overflow interrupt happened */
+	u64		tstamp;		/* overflow timestamp */
+	u16		cpu;		/* cpu on which the overflow occurred */
+	u16		set;		/* event set active when overflow occurred */
+	pid_t		tgid;		/* thread group id (for NPTL, this is getpid()) */
+};
+
+#define PFM_DEFAULT_MAX_PMDS	64 /* #pmds supported */
+#define PFM_DEFAULT_MAX_ENTRY_SIZE	(sizeof(struct pfm_default_smpl_entry)+\
+					 (sizeof(u64)*PFM_DEFAULT_MAX_PMDS))
+#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE	(sizeof(struct pfm_default_smpl_hdr)+\
+					 PFM_DEFAULT_MAX_ENTRY_SIZE)
 #define PFM_DEFAULT_SMPL_VERSION_MAJ	2U
-#define PFM_DEFAULT_SMPL_VERSION_MIN	0U
-#define PFM_DEFAULT_SMPL_VERSION	(((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|(PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
+#define PFM_DEFAULT_SMPL_VERSION_MIN	1U
+#define PFM_DEFAULT_SMPL_VERSION	(((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|\
+					 (PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff))
-#endif /* __PERFMON_DEFAULT_SMPL_H__ */
+#endif /* __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ */
Index: linux-2.6/include/asm-ia64/processor.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/processor.h
+++ linux-2.6/include/asm-ia64/processor.h
@@ -41,7 +41,6 @@
 #define IA64_THREAD_FPH_VALID	(__IA64_UL(1) << 0)	/* floating-point high state valid? */
 #define IA64_THREAD_DBG_VALID	(__IA64_UL(1) << 1)	/* debug registers valid? */
-#define IA64_THREAD_PM_VALID	(__IA64_UL(1) << 2)	/* performance registers valid? */
 #define IA64_THREAD_UAC_NOPRINT	(__IA64_UL(1) << 3)	/* don't log unaligned accesses */
 #define IA64_THREAD_UAC_SIGBUS	(__IA64_UL(1) << 4)	/* generate SIGBUS on unaligned acc.
*/ #define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration @@ -257,14 +256,6 @@ struct thread_struct { #else # define INIT_THREAD_IA32 #endif /* CONFIG_IA32_SUPPORT */ -#ifdef CONFIG_PERFMON - void *pfm_context; /* pointer to detailed PMU context */ - unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ -# define INIT_THREAD_PM .pfm_context = NULL, \ - .pfm_needs_checking = 0UL, -#else -# define INIT_THREAD_PM -#endif __u64 dbr[IA64_NUM_DBG_REGS]; __u64 ibr[IA64_NUM_DBG_REGS]; struct ia64_fpreg fph[96]; /* saved/loaded on demand */ @@ -279,7 +270,6 @@ struct thread_struct { .task_size = DEFAULT_TASK_SIZE, \ .last_fph_cpu = -1, \ INIT_THREAD_IA32 \ - INIT_THREAD_PM \ .dbr = {0, }, \ .ibr = {0, }, \ .fph = {{{{0}}}, } \ Index: linux-2.6/include/asm-ia64/system.h =================================================================== --- linux-2.6.orig/include/asm-ia64/system.h +++ linux-2.6/include/asm-ia64/system.h @@ -208,22 +208,18 @@ struct task_struct; extern void ia64_save_extra (struct task_struct *task); extern void ia64_load_extra (struct task_struct *task); -#ifdef CONFIG_PERFMON - DECLARE_PER_CPU(unsigned long, pfm_syst_info); -# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) -#else -# define PERFMON_IS_SYSWIDE() (0) -#endif - -#define IA64_HAS_EXTRA_STATE(t) \ - ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ - || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE()) +#define IA64_HAS_EXTRA_STATE(t) \ + (((t)->thread.flags & IA64_THREAD_DBG_VALID) \ + || IS_IA32_PROCESS(task_pt_regs(t))) #define __switch_to(prev,next,last) do { \ if (IA64_HAS_EXTRA_STATE(prev)) \ ia64_save_extra(prev); \ if (IA64_HAS_EXTRA_STATE(next)) \ ia64_load_extra(next); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW) \ + || test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw(prev, next); \ ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \ (last) = ia64_switch_to((next)); \ } while (0) Index: linux-2.6/include/asm-ia64/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-ia64/thread_info.h +++ linux-2.6/include/asm-ia64/thread_info.h @@ -86,11 +86,13 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_SINGLESTEP 5 /* restore singlestep on return to user mode */ #define TIF_RESTORE_SIGMASK 6 /* restore signal mask in do_signal() */ +#define TIF_PERFMON_WORK 7 /* work for pfm_handle_work() */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 #define TIF_MCA_INIT 18 /* this task is processing MCA or INIT */ #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_FREEZE 20 /* is freezing for suspend */ +#define TIF_PERFMON_CTXSW 21 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -104,9 +106,11 @@ struct thread_info { #define _TIF_MCA_INIT (1 << TIF_MCA_INIT) #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) #define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW) +#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK) /* "work to do on user-return" bits */ -#define TIF_ALLWORK_MASK (_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_RESTORE_SIGMASK) +#define TIF_ALLWORK_MASK 
(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_RESTORE_SIGMASK|_TIF_PERFMON_WORK) /* like TIF_ALLWORK_BITS but sans TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT */ #define TIF_WORK_MASK (TIF_ALLWORK_MASK&~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)) Index: linux-2.6/include/asm-ia64/unistd.h =================================================================== --- linux-2.6.orig/include/asm-ia64/unistd.h +++ linux-2.6/include/asm-ia64/unistd.h @@ -299,11 +299,23 @@ #define __NR_signalfd 1307 #define __NR_timerfd 1308 #define __NR_eventfd 1309 +#define __NR_pfm_create_context 1310 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define NR_syscalls 286 /* length of syscall table */ +#define NR_syscalls 298 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about Index: linux-2.6/include/asm-mips/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-mips/perfmon.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2005 Philip Mucci. + * + * Based on other versions: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_H_ +#define _ASM_MIPS64_PERFMON_H_ + +#ifdef __KERNEL__ + +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 + +struct pfm_arch_pmu_info { + u32 pmu_style; +}; + +#define MIPS64_CONFIG_PMC_MASK (1 << 4) +#define MIPS64_PMC_INT_ENABLE_MASK (1 << 4) +#define MIPS64_PMC_CNT_ENABLE_MASK (0xf) +#define MIPS64_PMC_EVT_MASK (0x7 << 6) +#define MIPS64_PMC_CTR_MASK (1 << 31) +#define MIPS64_PMD_INTERRUPT (1 << 31) + +/* Coprocessor register 25 contains the PMU interface. */ +/* Sel 0 is control for counter 0 */ +/* Sel 1 is count for counter 0. */ +/* Sel 2 is control for counter 1. */ +/* Sel 3 is count for counter 1. 
*/ + +/* + +31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +M 0--------------------------------------------------------------0 Event-- IE U S K EXL + +M 31 If this bit is one, another pair of Performance Control +and Counter registers is implemented at a MTC0 + +Event 8:5 Counter event enabled for this counter. Possible events +are listed in Table 6-30. R/W Undefined + +IE 4 Counter Interrupt Enable. This bit masks bit 31 of the +associated count register from the interrupt exception +request output. R/W 0 + +U 3 Count in User Mode. When this bit is set, the specified +event is counted in User Mode. R/W Undefined + +S 2 Count in Supervisor Mode. When this bit is set, the +specified event is counted in Supervisor Mode. R/W Undefined + +K 1 Count in Kernel Mode. When this bit is set, count the +event in Kernel Mode when EXL and ERL both are 0. R/W Undefined + +EXL 0 Count when EXL. When this bit is set, count the event +when EXL = 1 and ERL = 0. R/W Undefined +*/ + +static inline void pfm_arch_resend_irq(void) +{} + +static inline void pfm_arch_serialize(void) +{} + + +static inline void pfm_arch_unfreeze_pmu(void) +{} + +/* + * MIPS does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && (ctx->flags.started == 0)) + return; + + switch(pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + write_c0_perfctrl0(value); + break; + case 1: + write_c0_perfctrl1(value); + break; + case 2: + write_c0_perfctrl2(value); + break; + case 3: + write_c0_perfctrl3(value); + break; + default: + BUG(); + } +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + value &= pfm_pmu_conf->ovfl_mask; + + switch(pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + write_c0_perfcntr0(value); + break; + case 1: + write_c0_perfcntr1(value); + break; + case 2: + write_c0_perfcntr2(value); + break; + case 3: + write_c0_perfcntr3(value); + break; + default: + BUG(); + } +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + switch(pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + return read_c0_perfcntr0(); + break; + case 1: + return read_c0_perfcntr1(); + break; + case 2: + return read_c0_perfcntr2(); + break; + case 3: + return read_c0_perfcntr3(); + break; + default: + BUG(); + return 0; + } +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + switch(pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + return read_c0_perfctrl0(); + break; + case 1: + return read_c0_perfctrl1(); + break; + case 2: + return read_c0_perfctrl2(); + break; + case 3: + return read_c0_perfctrl3(); + break; + default: + BUG(); + return 0; + } +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. 
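A generic illustration of why the upper bits matter (not part of the patch, and not tied to any particular PMU model): with a hardware counter only "width" bits wide, an interrupt after "period" events is obtained by priming the counter close to its overflow point, which is what leaves the upper bits set:

	/* illustration only: width < 64 and period <= 2^width assumed;
	 * e.g. width = 32, period = 4096 -> 0xfffff000 */
	static inline u64 prime_counter(u64 period, unsigned int width)
	{
		u64 mask = (1ULL << width) - 1;

		return (mask + 1 - period) & mask;
	}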
+ */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in ctx.flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); + +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx, set); + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() on + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On MIPS, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on MIPS masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + +static inline void pfm_arch_pmu_config_remove(void) +{} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + + + + + +/* + * function called from pfm_setfl_sane(). 
Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int +pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + return 0; +} + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + return 0; +} + +static inline int pfm_arch_pmu_acquire(void) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} + +/* + * not used for mips + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +struct pfm_arch_context { + /* empty */ +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) + +#endif /* __KERNEL__ */ +#endif /* _ASM_MIPS64_PERFMON_H_ */ Index: linux-2.6/include/asm-mips/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-mips/perfmon_api.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_API_H_ +#define _ASM_MIPS64_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + +#endif /* _ASM_MIPS64_PERFMON_API_H_ */ Index: linux-2.6/include/asm-mips/smp.h =================================================================== --- linux-2.6.orig/include/asm-mips/smp.h +++ linux-2.6/include/asm-mips/smp.h @@ -109,6 +109,8 @@ static inline void smp_send_reschedule(i core_send_ipi(cpu, SMP_RESCHEDULE_YOURSELF); } +extern int smp_call_function_single(int cpuid, void (*func) (void *info), + void *info, int retry, int wait); extern asmlinkage void smp_call_function_interrupt(void); #endif /* CONFIG_SMP */ Index: linux-2.6/include/asm-mips/system.h =================================================================== --- linux-2.6.orig/include/asm-mips/system.h +++ linux-2.6/include/asm-mips/system.h @@ -65,6 +65,9 @@ do { \ do { \ if (cpu_has_dsp) \ __save_dsp(prev); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW) \ + || test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw(prev, next); \ (last) = resume(prev, next, task_thread_info(next)); \ if (cpu_has_dsp) \ __restore_dsp(current); \ Index: linux-2.6/include/asm-mips/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-mips/thread_info.h +++ linux-2.6/include/asm-mips/thread_info.h @@ -115,10 +115,12 @@ register struct thread_info *__current_t #define TIF_SYSCALL_AUDIT 4 /* syscall auditing active */ #define TIF_SECCOMP 5 /* secure computing */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ +#define TIF_PERFMON_WORK 10 /* work for pfm_handle_work() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 18 #define TIF_FREEZE 19 +#define TIF_PERFMON_CTXSW 21 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_TRACE 31 /* syscall trace active */ #define _TIF_SYSCALL_TRACE (1<> 24) & 0xff) +#define CBE_PM_ALL_OVERFLOW_INTR 0xff000000 +#define CBE_PM_INTERVAL_INTR 0x00800000 +#define CBE_PM_TRACE_BUFFER_FULL_INTR 0x00400000 +#define CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR 0x00200000 enum pm_reg_name { group_control, Index: linux-2.6/include/asm-powerpc/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon.h @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on other versions: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_H_ +#define _ASM_POWERPC_PERFMON_H_ + +#ifdef __KERNEL__ + +#include + +enum powerpc_pmu_type { + PFM_POWERPC_PMU_NONE, + PFM_POWERPC_PMU_604, + PFM_POWERPC_PMU_604e, + PFM_POWERPC_PMU_750, /* XXX: Minor event set diffs between IBM and Moto. */ + PFM_POWERPC_PMU_7400, + PFM_POWERPC_PMU_7450, + PFM_POWERPC_PMU_POWER5, + PFM_POWERPC_PMU_CELL, +}; + +struct pfm_arch_pmu_info { + enum powerpc_pmu_type pmu_style; + + void (*write_pmc)(unsigned int cnum, u64 value); + void (*write_pmd)(unsigned int cnum, u64 value); + + u64 (*read_pmd)(unsigned int cnum); + + void (*enable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + void (*disable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + + void (*irq_handler)(struct pt_regs *regs, struct pfm_context *ctx); + void (*get_ovfl_pmds)(struct pfm_context *ctx, + struct pfm_event_set *set); + + /* The following routines are optional. */ + void (*restore_pmcs)(struct pfm_event_set *set); + void (*restore_pmds)(struct pfm_event_set *set); + + int (*ctxswout_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + void (*ctxswin_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + int (*load_context)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task); + int (*unload_context)(struct pfm_context *ctx, + struct task_struct *task); +}; + +#ifdef CONFIG_PPC32 +#define PFM_ARCH_PMD_STK_ARG 6 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 6 /* conservative value */ +#else +#define PFM_ARCH_PMD_STK_ARG 8 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 8 /* conservative value */ +#endif + +static inline void pfm_arch_resend_irq(void) +{} + +static inline void pfm_arch_serialize(void) +{} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, + u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + BUG_ON(!arch_info->write_pmc); + + arch_info->write_pmc(cnum, value); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + value &= pfm_pmu_conf->ovfl_mask; + + BUG_ON(!arch_info->write_pmd); + + arch_info->write_pmd(cnum, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + + BUG_ON(!arch_info->read_pmd); + + return arch_info->read_pmd(cnum); +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. 
On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val = pfm_arch_read_pmd(ctx, cnum); + + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +void pfm_arch_init_percpu(void); +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx, set); +} + +void powerpc_irq_handler(struct pt_regs *regs); + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + arch_info = pfm_pmu_conf->arch_info; + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, ctx->active_set); +} + +/* + * PowerPC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On PPC, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. 
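Stepping back for a moment (illustration only, not part of the patch): the struct pfm_arch_pmu_info callback table defined earlier is what a PMU description module fills in. A hypothetical POWER5 backend might look like the following; all names below are invented for the sketch:

	static void p5_write_pmc(unsigned int cnum, u64 value)
	{
		/* mtspr on the control SPR selected by cnum */
	}

	static void p5_write_pmd(unsigned int cnum, u64 value)
	{
		/* mtspr on the counter SPR selected by cnum */
	}

	static u64 p5_read_pmd(unsigned int cnum)
	{
		return 0;	/* mfspr on the counter SPR selected by cnum */
	}

	static struct pfm_arch_pmu_info p5_pmu_info = {
		.pmu_style	= PFM_POWERPC_PMU_POWER5,
		.write_pmc	= p5_write_pmc,
		.write_pmd	= p5_write_pmd,
		.read_pmd	= p5_read_pmd,
		/* enable_counters, disable_counters, irq_handler and
		 * get_ovfl_pmds must also be provided; the restore and
		 * load/unload hooks are optional, as noted in the
		 * structure definition. */
	};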
Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on x86 masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx, set); +} + + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +static inline void pfm_arch_pmu_config_remove(void) +{} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx, + struct pfm_event_set *set, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + int rc = 0; + + if (arch_info->load_context) { + rc = arch_info->load_context(ctx, set, task); + } + + return rc; +} + +static inline int pfm_arch_unload_context(struct pfm_context *ctx, + struct task_struct *task) +{ + struct pfm_arch_pmu_info *arch_info = pfm_pmu_conf->arch_info; + int rc = 0; + + if (arch_info->unload_context) { + rc = arch_info->unload_context(ctx, task); + } + + return rc; +} + +/* + * not applicable to powerpc + */ +static inline int pfm_smpl_buffer_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} + +static inline int pfm_arch_pmu_acquire(void) +{ + return reserve_pmc_hardware(powerpc_irq_handler); +} + +static inline void pfm_arch_pmu_release(void) +{ + release_pmc_hardware(); +} + +struct pfm_arch_context { + /* Cell: Most recent value of the pm_status + * register read by the interrupt handler. + * + * Interrupt handler sets last_read_updated if it + * just read and updated last_read_pm_status + */ + u32 last_read_pm_status; + u32 last_read_updated; +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_PERFMON_H_ */ Index: linux-2.6/include/asm-powerpc/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon_api.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_API_H_ +#define _ASM_POWERPC_PERFMON_API_H_ + +#define PFM_ARCH_MAX_HW_PMCS 256 /* maximum number of PMC registers */ +#define PFM_ARCH_MAX_HW_PMDS 256 /* maximum number of PMD registers */ +/* + * Virtual PMU registers: registers mapped to non-PMU resources + * IMPORTANT: + * - must appear in PMC/PMD namespace *AFTER* PMU registers + * - SW PMD can be specified as smpl_pmds, reset_pmds + * - SW PMD cannot overflow + * - SW PMD do not show up in pfarg_msg.ovfl_pmds/pfarg_setinfo_t.ovfl_pmds + */ +#define PFM_ARCH_MAX_SW_PMCS 64 /* max virtual PMCS */ +#define PFM_ARCH_MAX_SW_PMDS 64 /* max virtual PMDS */ + + +#endif /* _ASM_POWERPC_PERFMON_API_H_ */ Index: linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-powerpc/perfmon_cell_hw_smpl.h @@ -0,0 +1,106 @@ +/* + * Copyright IBM Corp 2007 + * + * Contributed by Carl Love + * and Kevin Corry + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ +#define __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ + +/** + * struct pfm_cell_hw_smpl_arg + * + * @buf_size: Size of full sampling-buffer (in bytes). + * @buf_flags: Sampling-buffer-specific flags. + * @reserved1: Pad to 128-bit boundary. + * + * Format specific parameters (passed at context creation). + **/ +struct pfm_cell_hw_smpl_arg { + __u64 buf_size; + __u32 buf_flags; + __u32 reserved1; +}; + +/** + * struct pfm_cell_hw_smpl_hdr + * + * @count: Number of valid sampling-buffer entries. + * @cur_offset: Offset (in bytes) from the top of the sampling-buffer to the + * next available space for a sampling-buffer entry. + * @overflows: Number of times the sampling-buffer has filled up. + * @buf_size: Total bytes in the sampling-buffer. + * @version: Sampling module version. + * @buf_flags: Copy of buf_flags from pfm_cell_hw_smpl_arg. + * @reserved1: Pad to 128-bit boundary. + * + * This header is at the beginning of the sampling-buffer returned to the user. + * It is directly followed by the first record. + **/ +struct pfm_cell_hw_smpl_hdr { + __u64 count; + __u64 cur_offset; + __u64 overflows; + __u64 buf_size; + __u32 version; + __u32 buf_flags; + __u64 reserved1; +}; + +/** + * struct pfm_cell_hw_smpl_entry_hdr + * + * @pid: Thread ID. + * @tgid: Thread group ID. For NPTL, this is getpid(). + * @cpu: CPU on which the overflow occurred. + * @set: Event-set that was active when the overflow occurred. + * @num_samples: Number of 128-bit trace-buffer samples in this entry. + * @entry_num: Sequence number of sampling-buffer entries. + * + * The header for each data entry in the sampling-buffer. 
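For illustration only (not part of the patch): since each trace-buffer line is 128 bits, i.e. 16 bytes, stepping from one entry of the structure defined just below to the next is simply:

	static inline void *next_entry(struct pfm_cell_hw_smpl_entry_hdr *ent)
	{
		/* header is followed by num_samples 16-byte trace-buffer lines */
		return (char *)(ent + 1) + ent->num_samples * 16;
	}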
The entry header + * is immediately followed by the contents of the trace-buffer. Each line in + * the trace-buffer is 128 bits wide. The entry header specifies the number + * of 128-bit trace-buffer lines that follow the header. + **/ +struct pfm_cell_hw_smpl_entry_hdr { + __u32 pid; + __u32 tgid; + __u16 cpu; + __u16 set; + __u16 num_samples; + __u16 entry_num; +}; + +/* The max size of each sampling-buffer entry is the size of the entry header + * plus the full size of the trace-buffer. + */ +#define PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE \ + (sizeof(struct pfm_cell_hw_smpl_entry_hdr) + \ + 2 * sizeof(u64) * CBE_PM_TRACE_BUF_MAX_COUNT) + +/* The sampling-buffer must be at least as large as the sampling-buffer header + * and the max size of one sampling-buffer entry. + */ +#define PFM_CELL_HW_SMPL_MIN_BUF_SIZE (sizeof(struct pfm_cell_hw_smpl_hdr) + \ + PFM_CELL_HW_SMPL_MAX_ENTRY_SIZE) + +#define PFM_CELL_HW_SMPL_VERSION 1 +#define PFM_CELL_HW_SMPL_NAME "perfmon_cell_hw_smpl" +#define PFM_CELL_HW_SMPL_OVFL_PMD (PFM_MAX_PMDS + 1) +#define PFM_MSG_CELL_HW_SMPL_BUF_FULL 99 + +#endif /* __ASM_POWERPC_PERFMON_CELL_HW_SMPL_H__ */ Index: linux-2.6/include/asm-powerpc/systbl.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/systbl.h +++ linux-2.6/include/asm-powerpc/systbl.h @@ -312,3 +312,15 @@ COMPAT_SYS_SPU(signalfd) COMPAT_SYS_SPU(timerfd) SYSCALL_SPU(eventfd) COMPAT_SYS_SPU(sync_file_range2) +SYSCALL(pfm_create_context) +SYSCALL(pfm_write_pmcs) +SYSCALL(pfm_write_pmds) +SYSCALL(pfm_read_pmds) +SYSCALL(pfm_load_context) +SYSCALL(pfm_start) +SYSCALL(pfm_stop) +SYSCALL(pfm_restart) +SYSCALL(pfm_create_evtsets) +SYSCALL(pfm_getinfo_evtsets) +SYSCALL(pfm_delete_evtsets) +SYSCALL(pfm_unload_context) Index: linux-2.6/include/asm-powerpc/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/thread_info.h +++ linux-2.6/include/asm-powerpc/thread_info.h @@ -147,7 +147,7 @@ static inline struct thread_info *curren #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP) #define _TIF_USER_WORK_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ - _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK) + _TIF_NEED_RESCHED | _TIF_RESTORE_SIGMASK| _TIF_PERFMON_WORK) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) /* Bits in local_flags */ Index: linux-2.6/include/asm-powerpc/unistd.h =================================================================== --- linux-2.6.orig/include/asm-powerpc/unistd.h +++ linux-2.6/include/asm-powerpc/unistd.h @@ -331,10 +331,22 @@ #define __NR_timerfd 306 #define __NR_eventfd 307 #define __NR_sync_file_range2 308 +#define __NR_pfm_create_context 309 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define __NR_syscalls 309 +#define __NR_syscalls 321 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls Index: 
linux-2.6/include/asm-x86_64/hw_irq.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/hw_irq.h +++ linux-2.6/include/asm-x86_64/hw_irq.h @@ -84,6 +84,7 @@ * sources per level' errata. */ #define LOCAL_TIMER_VECTOR 0xef +#define LOCAL_PERFMON_VECTOR 0xee /* * First APIC vector available to drivers: (vectors 0x30-0xee) Index: linux-2.6/include/asm-x86_64/perfmon.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon.h @@ -0,0 +1 @@ +#include Index: linux-2.6/include/asm-x86_64/perfmon_api.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon_api.h @@ -0,0 +1 @@ +#include Index: linux-2.6/include/asm-x86_64/perfmon_pebs_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/asm-x86_64/perfmon_pebs_smpl.h @@ -0,0 +1,19 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include Index: linux-2.6/include/asm-x86_64/thread_info.h =================================================================== --- linux-2.6.orig/include/asm-x86_64/thread_info.h +++ linux-2.6/include/asm-x86_64/thread_info.h @@ -123,6 +123,7 @@ static inline struct thread_info *stack_ #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FREEZE 23 /* is freezing for suspend */ +#define TIF_PERFMON_CTXSW 24 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1< + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __LINUX_PERFMON_H__ +#define __LINUX_PERFMON_H__ + +#ifdef CONFIG_PERFMON + +/* + * include arch-specific constants and user visible definitions + */ +#include + +#define PFM_MAX_PMCS (PFM_ARCH_MAX_HW_PMCS+PFM_ARCH_MAX_SW_PMCS) +#define PFM_MAX_PMDS (PFM_ARCH_MAX_HW_PMDS+PFM_ARCH_MAX_SW_PMDS) + +/* + * number of elements for each type of bitvector + * all bitvectors use u64 fixed size type on all architectures. 
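A worked example of the sizing macro defined just below (editorial illustration, not part of the patch): with the per-architecture limits shown above (256 hardware plus 64 software registers),

	/* PFM_MAX_PMDS   = 256 + 64 = 320
	 * PFM_PMD_BV     = PFM_BVSIZE(320) = (320 + 63) / 64 = 5 u64 elements
	 * PFM_HW_PMD_BV  = PFM_BVSIZE(256) = (256 + 63) / 64 = 4 u64 elements
	 * so reg_smpl_pmds[], reg_reset_pmds[] and friends are 5-word bitmasks. */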
+ */ +#define PFM_BVSIZE(x) (((x)+(sizeof(u64)<<3)-1) / (sizeof(u64)<<3)) +#define PFM_HW_PMD_BV PFM_BVSIZE(PFM_ARCH_MAX_HW_PMDS) +#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS) +#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS) +/* + * PMC/PMD flags to use with pfm_write_pmds() or pfm_write_pmcs() + * + * reg_flags layout: + * bit 00-15 : generic flags + * bit 16-23 : arch-specific flags + * bit 24-31 : error codes + */ +#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on overflow */ +#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize sampling interval */ +#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation for counter */ + +/* + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags (see asm/perfmon.h) + */ +#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */ +#define PFM_SETFL_TIME_SWITCH 0x02 /* enable switch on timeout */ + +/* + * PMD/PMC return flags in case of error (ignored on input) + * + * reg_flags layout: + * bit 00-15 : generic flags + * bits[16-23] : arch-specific flags (see asm/perfmon.h) + * bit 24-31 : error codes + * + * Those flags are used on output and must be checked in case EINVAL is + * returned by a command accepting a vector of values and each has a flag + * field, such as pfarg_pmc or pfarg_pmd. + */ +#define PFM_REG_RETFL_NOTAVAIL (1<<31) /* not implemented or unaccessible */ +#define PFM_REG_RETFL_EINVAL (1<<30) /* entry is invalid */ +#define PFM_REG_RETFL_NOSET (1<<29) /* event set does not exist */ +#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\ + PFM_REG_RETFL_EINVAL|\ + PFM_REG_RETFL_NOSET) + +#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) + +/* + * argument to pfm_create_context() system call + * structure shared with user level + */ +struct pfarg_ctx { + __u32 ctx_flags; /* noblock/block/syswide */ + __u32 ctx_reserved1; /* ret arg: fd for context */ + __u64 ctx_reserved2[7]; /* for future use */ +}; + +/* + * context flags (ctx_flags) + * + */ +#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */ +#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ +#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */ +#define PFM_FL_MAP_SETS 0x10 /* event sets are remapped */ + +/* + * argument to pfm_write_pmcs() system call. + * structure shared with user level + */ +struct pfarg_pmc { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* input: flags, return: reg error */ + __u64 reg_value; /* pmc value */ + __u64 reg_reserved2[4]; /* for future use */ +}; + +/* + * argument to pfm_write_pmds() and pfm_read_pmds() system calls. 
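For illustration only (not part of the patch): the per-register return flags above are how a caller locates the entry that made a vector command fail. A user-level sketch, assuming a libpfm-style pfm_write_pmcs() wrapper:

	static int write_pmcs_checked(int ctx_fd, struct pfarg_pmc *pmcs, int count)
	{
		int i, ret;

		ret = pfm_write_pmcs(ctx_fd, pmcs, count);
		if (ret) {
			for (i = 0; i < count; i++)
				if (PFM_REG_HAS_ERROR(pmcs[i].reg_flags))
					break;	/* pmcs[i] was rejected */
		}
		return ret;
	}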
+ * structure shared with user level + */ +struct pfarg_pmd { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* input: flags, return: reg error */ + __u64 reg_value; /* initial pmc/pmd value */ + __u64 reg_long_reset; /* value to reload after notification */ + __u64 reg_short_reset; /* reset after counter overflow */ + __u64 reg_last_reset_val; /* return: PMD last reset value */ + __u64 reg_ovfl_switch_cnt; /* #overflows before switch */ + __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */ + __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */ + __u64 reg_smpl_eventid; /* opaque event identifier */ + __u64 reg_random_mask; /* bitmask used to limit random value */ + __u32 reg_random_seed; /* seed for randomization (OBSOLETE) */ + __u32 reg_reserved2[7]; /* for future use */ +}; + +/* + * optional argument to pfm_start() system call. Pass NULL if not needed. + * structure shared with user level + */ +struct pfarg_start { + __u16 start_set; /* event set to start with */ + __u16 start_reserved1; /* for future use */ + __u32 start_reserved2; /* for future use */ + __u64 reserved3[3]; /* for future use */ +}; + +/* + * argument to pfm_load_context() system call. + * structure shared with user level + */ +struct pfarg_load { + __u32 load_pid; /* thread or CPU to attach to */ + __u16 load_set; /* set to load first */ + __u16 load_reserved1; /* for future use */ + __u64 load_reserved2[3]; /* for future use */ +}; + +/* + * argument to pfm_create_evtsets() and pfm_delete_evtsets() system calls. + * structure shared with user level. + */ +struct pfarg_setdesc { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* input: flags, return: err flag */ + __u64 set_timeout; /* req/eff switch timeout in nsecs */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * argument to pfm_getinfo_evtsets() system call. + * structure shared with user level + */ +struct pfarg_setinfo { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* out:flags or error */ + __u64 set_ovfl_pmds[PFM_HW_PMD_BV]; /* out: last ovfl PMDs */ + __u64 set_runs; /* out: #times the set was active */ + __u64 set_timeout; /* out: effective/leftover switch timeout in nsecs */ + __u64 set_act_duration; /* out: time set was active in nsecs */ + __u64 set_avail_pmcs[PFM_PMC_BV];/* unavailable PMCs */ + __u64 set_avail_pmds[PFM_PMD_BV];/* unavailable PMDs */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * default value for the user and group security parameters in + * /proc/sys/kernel/perfmon/sys_group + * /proc/sys/kernel/perfmon/task_group + */ +#define PFM_GROUP_PERM_ANY -1 /* any user/group */ + +/* + * overflow notification message. + * structure shared with user level + */ +struct pfarg_ovfl_msg { + __u32 msg_type; /* message type: PFM_MSG_OVFL */ + __u32 msg_ovfl_pid; /* process id */ + __u64 msg_ovfl_pmds[PFM_HW_PMD_BV];/* overflowed PMDs */ + __u16 msg_active_set; /* active set at overflow */ + __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */ + __u32 msg_ovfl_tid; /* thread id */ + __u64 msg_ovfl_ip; /* IP on PMU intr */ +}; + +#define PFM_MSG_OVFL 1 /* an overflow happened */ +#define PFM_MSG_END 2 /* task to which context was attached ended */ + +/* + * generic notification message (union). 
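For illustration only (not part of the patch): in the perfmon2 design, notifications are consumed from user level by reading the context file descriptor. A minimal sketch using the union defined just below:

	#include <unistd.h>

	static void drain_messages(int ctx_fd)
	{
		union pfarg_msg msg;

		while (read(ctx_fd, &msg, sizeof(msg)) == sizeof(msg)) {
			if (msg.type == PFM_MSG_OVFL)
				;	/* msg.pfm_ovfl_msg.msg_ovfl_pmds[] holds the overflowed PMDs */
			else if (msg.type == PFM_MSG_END)
				break;	/* monitored task has exited */
		}
	}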
+ * union shared with user level + */ +union pfarg_msg { + __u32 type; + struct pfarg_ovfl_msg pfm_ovfl_msg; +}; + +/* + * perfmon version number + */ +#define PFM_VERSION_MAJ 2U +#define PFM_VERSION_MIN 5U +#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\ + (PFM_VERSION_MIN & 0xffff)) +#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) +#define PFM_VERSION_MINOR(x) ((x) & 0xffff) + +/* + * This part of the header file is meant for kernel level code only including + * kernel modules + */ +#ifdef __KERNEL__ + +#include +#include +#include +#include + +/* + * perfmon context state + */ +#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ +#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ +#define PFM_CTX_MASKED 3 /* context is loaded, monitoring is masked */ +#define PFM_CTX_ZOMBIE 4 /* context lost owner but is still attached */ + +/* + * depth of message queue + */ +#define PFM_MSGS_ORDER 3 /* log2(number of messages) */ +#define PFM_MSGS_COUNT (1</proc/sys/kernel/printk_ratelimit + */ +#ifdef CONFIG_PERFMON_DEBUG +#define PFM_DBG(f, x...) \ + do { \ + if (unlikely(pfm_controls.debug >0 && printk_ratelimit())) { \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __FUNCTION__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + } \ + } while (0) + +#define PFM_DBG_ovfl(f, x...) \ + do { \ + if (unlikely(pfm_controls.debug_ovfl >0 && printk_ratelimit())) { \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __FUNCTION__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + } \ + } while (0) +#else +#define PFM_DBG(f, x...) do {} while(0) +#define PFM_DBG_ovfl(f, x...) do {} while(0) +#endif + +/* + * PMD information + */ +struct pfm_pmd { + u64 value; /* currnet 64-bit value */ + u64 lval; /* last reset value */ + u64 ovflsw_thres; /* #overflows left before switching */ + u64 long_reset; /* reset value on sampling overflow */ + u64 short_reset; /* reset value on overflow */ + u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */ + u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */ + u64 mask; /* mask for generator */ + u32 flags; /* notify/do not notify */ + u64 ovflsw_ref_thres; /* #overflows before switching to next set */ + u64 eventid; /* overflow event identifier */ +}; + +/* + * perfmon context: encapsulates all the state of a monitoring session + */ +struct pfm_event_set { + u16 id; + u16 id_next; /* which set to go to from this one */ + u32 flags; /* public set flags */ + u64 runs; /* number of activations */ + struct list_head list; /* next in the ordered list */ + struct pfm_event_set *sw_next; /* address of set to go to */ + u32 priv_flags; /* private flags */ + u32 npend_ovfls; /* number of pending PMD overflow */ + + u64 used_pmds[PFM_PMD_BV]; /* used PMDs */ + u64 povfl_pmds[PFM_HW_PMD_BV]; /* pending overflowed PMDs */ + u64 ovfl_pmds[PFM_HW_PMD_BV]; /* last overflowed PMDs */ + u64 reset_pmds[PFM_PMD_BV]; /* union of PMDs to reset */ + u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */ + u64 pmcs[PFM_MAX_PMCS]; /* PMC values */ + + u16 nused_pmds; /* max number of used PMDs */ + u16 nused_pmcs; /* max number of used PMCs */ + + struct pfm_pmd pmds[PFM_MAX_PMDS]; /* 64-bit SW PMDs */ + u64 timeout_sw_ref; /* switch timeout reference */ + u64 timeout_sw_left; /* timeout remaining */ + u64 timeout_sw_exp; /* timeout expiration jiffies */ + u64 duration_start; /* start ns */ + u64 duration; /* total active ns */ + u64 used_pmcs[PFM_PMC_BV]; /* used PMCs (keep for arbitration) */ +}; + +/* + * common private 
event set flags (priv_flags) + * + * upper 16 bits: for arch-specific use + * lower 16 bits: for common use + */ +#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */ +#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */ +#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */ +#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS | PFM_SETFL_PRIV_MOD_PMCS) + +/* + * context flags + */ +struct pfm_context_flags { + unsigned int block:1; /* task blocks on user notifications */ + unsigned int system:1; /* do system wide monitoring */ + unsigned int no_msg:1; /* no message sent on overflow */ + unsigned int can_restart:1; /* allowed to issue a PFM_RESTART */ + unsigned int switch_ovfl:1; /* switch set on counter ovfl */ + unsigned int switch_time:1; /* switch set on timeout */ + unsigned int started:1; /* pfm_start() issued */ + unsigned int work_type:2; /* type of work for pfm_handle_work */ + unsigned int mmap_nlock:1; /* no lock in pfm_release_buf_space */ + unsigned int reserved:20; /* for future use */ +}; + +/* + * values for work_type (TIF_PERFMON_WORK must be set) + */ +#define PFM_WORK_NONE 0 /* nothing to do */ +#define PFM_WORK_RESET 1 /* reset overflowed counters */ +#define PFM_WORK_BLOCK 2 /* block current thread */ +#define PFM_WORK_ZOMBIE 3 /* cleanup zombie context */ + +/* + * check_mask bitmask values for pfm_check_task_state() + */ +#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */ +#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */ +#define PFM_CMD_UNLOAD 0x04 /* command is unload */ + +#include +#include + +/* + * context: encapsulates all the state of a monitoring session + */ +struct pfm_context { + spinlock_t lock; /* context protection */ + + struct pfm_context_flags flags; /* flags */ + u32 state; /* state */ + struct task_struct *task; /* attached task */ + + struct completion restart_complete;/* block on notification */ + u64 last_act; /* last activation */ + u32 last_cpu; /* last CPU used (SMP only) */ + u32 cpu; /* cpu bound to context */ + + struct pfm_smpl_fmt *smpl_fmt; /* buffer format callbacks */ + void *smpl_addr; /* smpl buffer base */ + size_t smpl_size; + + wait_queue_head_t msgq_wait; /* used when flags.kapi=0 */ + union pfarg_msg msgq[PFM_MSGS_COUNT]; + int msgq_head; + int msgq_tail; + + struct fasync_struct *async_queue; + + u64 set_all_runs; /* total number of set activations */ + struct pfm_event_set *active_set; /* active set */ + struct list_head list; /* ordered list of sets */ + + /* + * save stack space by allocating temporary variables for + * pfm_overflow_handler() in pfm_context + */ + struct pfm_ovfl_arg ovfl_arg; + u64 ovfl_ovfl_notify[PFM_PMD_BV]; +}; + +static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c) +{ + return (struct pfm_arch_context *)(c+1); +} + +static inline void pfm_set_pmu_owner(struct task_struct *task, + struct pfm_context *ctx) +{ + BUG_ON(task && task->pid == 0); + __get_cpu_var(pmu_owner) = task; + __get_cpu_var(pmu_ctx) = ctx; +} + +static inline void pfm_inc_activation(void) +{ + __get_cpu_var(pmu_activation_number)++; +} + +static inline void pfm_set_activation(struct pfm_context *ctx) +{ + ctx->last_act = __get_cpu_var(pmu_activation_number); +} + +static inline void pfm_set_last_cpu(struct pfm_context *ctx, int cpu) +{ + ctx->last_cpu = cpu; +} + +static inline void pfm_retflag_set(u32 flags, u32 val) +{ + flags &= ~PFM_REG_RETFL_MASK; + flags |= (val); +} + +extern struct pfm_pmu_config *pfm_pmu_conf; +extern struct 
pfm_controls pfm_controls; +extern int perfmon_disabled; + +int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, + void **req, void **to_free); + +int pfm_get_task(struct pfm_context *ctx, pid_t pid, struct task_struct **task); +int pfm_get_smpl_arg(char __user *fmt_uname, void __user *uaddr, size_t usize, void **arg, + struct pfm_smpl_fmt **fmt); + +int pfm_alloc_fd(struct file **cfile); + +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count); +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat); +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count); +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, + struct task_struct *task); +int __pfm_unload_context(struct pfm_context *ctx, int *can_release); +int __pfm_stop(struct pfm_context *ctx); +int __pfm_restart(struct pfm_context *ctx, int *complete_needed); +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start); +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count); +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count); +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count); + +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx); + +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags); + +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, + int alloc); + +struct pfm_context *pfm_get_ctx(int fd); + +void pfm_context_free(struct pfm_context *ctx); +struct pfm_context *pfm_context_alloc(void); +int pfm_pmu_conf_get(int autoload); +void pfm_pmu_conf_put(void); + +int pfm_pmu_acquire(void); +void pfm_pmu_release(void); + +int pfm_reserve_session(int is_system, u32 cpu); +int pfm_release_session(int is_system, u32 cpu); + +int pfm_reserve_allcpus(void); +int pfm_release_allcpus(void); + +int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize); +int pfm_reserve_buf_space(size_t size); +void pfm_release_buf_space(struct pfm_context *ctx, size_t size); + +struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name); +void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt); + +int pfm_init_sysfs(void); +ssize_t pfm_sysfs_session_show(char *buf, size_t sz, int what); +int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu); +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt); +int pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt); + +int pfm_sysfs_add_cpu(int mycpu); +void pfm_sysfs_del_cpu(int mycpu); + +void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs); +void pfm_save_prev_context(struct pfm_context *ctxp); + +void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set, + int num_pmds, + int reset_mode); + +void __pfm_handle_switch_timeout(void); +int pfm_prepare_sets(struct pfm_context *ctx, struct pfm_event_set *act_set); +int pfm_sets_init(void); + +int pfm_mmap_set(struct pfm_context *ctx, struct vm_area_struct *vma, + size_t size); + +void pfm_free_sets(struct pfm_context *ctx); +void pfm_init_evtset(struct pfm_event_set *set); +void pfm_switch_sets_from_intr(struct pfm_context *ctx); +void pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart); + +union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx); +void pfm_save_pmds(struct pfm_context *ctx, 
struct pfm_event_set *set); +int pfm_notify_user(struct pfm_context *ctx); +int pfm_ovfl_notify_user(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip); +void pfm_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set); + +int pfm_init_fs(void); + +struct pfm_stats { + u64 ovfl_intr_replay_count; /* replayed ovfl interrupts */ + u64 ovfl_intr_regular_count; /* processed ovfl interrupts */ + u64 ovfl_intr_all_count; /* total ovfl interrupts */ + u64 ovfl_intr_ns; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase1; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase2; /* cycles in ovfl interrupts */ + u64 ovfl_intr_phase3; /* cycles in ovfl interrupts */ + u64 fmt_handler_calls; /* # calls smpl buffer handler */ + u64 fmt_handler_ns; /* cycle in smpl format handler */ + u64 set_switch_count; /* #set_switches on this CPU */ + u64 set_switch_ns; /* cycles for switching sets */ + u64 ctxsw_count; /* #context switches on this CPU */ + u64 ctxsw_ns; /* cycles for context switches */ + u64 handle_timeout_count; /* #of set timeouts handled */ + u64 ovfl_intr_nmi_count; /* number of NMI-base ovfl */ + u64 handle_work_count; /* calls to pfm_handle_work */ + u64 ovfl_notify_count; /* notification messages */ + u64 reset_pmds_count; /* calls to pfm_reset_pmds */ + u64 pfm_restart_count; /* calls to pfm_restart_count */ + u64 ccnt0; + u64 ccnt1; + u64 ccnt2; + u64 ccnt3; + u64 ccnt4; + u64 ccnt5; + u64 ccnt6; + struct kobject kobj; /* for sysfs internal use only */ +}; +#define to_stats(n) container_of(n, struct pfm_stats, kobj) + +/* + * include arch-specific kernel level only definitions + * (split with perfmon_api.h is necessary to avoid circular + * dependencies on certain data structures definitions) + */ +#include + +extern const struct file_operations pfm_file_ops; +/* + * max vector argument elements for local storage (no kmalloc/kfree) + * The PFM_ARCH_PM*_ARG should be defined in the arch specific perfmon.h + * file. If not, default (conservative) values are used + */ + +#ifndef PFM_ARCH_PMC_STK_ARG +#define PFM_ARCH_PMC_STK_ARG 1 +#endif + +#ifndef PFM_ARCH_PMD_STK_ARG +#define PFM_ARCH_PMD_STK_ARG 1 +#endif + +#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG +#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG + +#define PFM_BPL 64 +#define PFM_LBPL 6 /* log2(BPL) */ + +/* + * upper limit for count in calls that take vector arguments. This is used + * to prevent for multiplication overflow when we compute actual storage size + */ +#define PFM_MAX_ARG_COUNT(m) (INT_MAX/sizeof(*(m))) + +/* + * read a single PMD register. PMD register mapping is provided by PMU + * description module. Virtual PMD registers have special handler. 
+ */ +static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)) + return pfm_pmu_conf->pmd_sread(ctx, cnum); + + return pfm_arch_read_pmd(ctx, cnum); +} + +static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + /* + * PMD writes are ignored for read-only registers + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO) + return; + + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) { + pfm_pmu_conf->pmd_swrite(ctx, cnum, value); + return; + } + pfm_arch_write_pmd(ctx, cnum, value); +} + +#define cast_ulp(_x) ((unsigned long *)_x) + +#define PFM_NORMAL 0 +#define PFM_COMPAT 1 + +void __pfm_exit_thread(struct task_struct *task); +void __pfm_copy_thread(struct task_struct *task); +void pfm_ctxsw(struct task_struct *prev, struct task_struct *next); +void pfm_handle_work(struct pt_regs *regs); +void __pfm_handle_switch_timeout(void); +void __pfm_init_percpu (void *dummy); +void pfm_cpu_disable(void); + +static inline void pfm_exit_thread(struct task_struct *task) +{ + if (task->pfm_context) + __pfm_exit_thread(task); +} + +static inline void pfm_copy_thread(struct task_struct *task) +{ + /* + * context or perfmon TIF state is NEVER inherited + * in child task. Holds for per-thread and system-wide + */ + task->pfm_context = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + clear_tsk_thread_flag(task, TIF_PERFMON_WORK); +} + +static inline void pfm_handle_switch_timeout(void) +{ + unsigned long info; + info = __get_cpu_var(pfm_syst_info); + if (info & PFM_CPUINFO_TIME_SWITCH) + __pfm_handle_switch_timeout(); +} + +static inline void pfm_init_percpu(void) +{ + __pfm_init_percpu(NULL); +} + +#endif /* __KERNEL__ */ + +#else /* !CONFIG_PERFMON */ +#ifdef __KERNEL__ + +#define tsks_have_perfmon(p, n) (0) +#define pfm_cpu_disable() do { } while (0) +#define pfm_init_percpu() do { } while (0) +#define pfm_exit_thread(_t) do { } while (0) +#define pfm_handle_work(_t) do { } while (0) +#define pfm_copy_thread(_t) do { } while (0) +#define pfm_ctxsw(_p, _t) do { } while (0) +#define pfm_handle_switch_timeout() do { } while (0) +#define pfm_release_allcpus() do { } while (0) +#define pfm_reserve_allcpus() (0) +#ifdef __ia64__ +#define pfm_release_dbregs(_t) do { } while (0) +#define pfm_use_dbregs(_t) (0) +#endif + +#endif /* __KERNEL__ */ + +#endif /* CONFIG_PERFMON */ + +#endif /* __LINUX_PERFMON_H__ */ Index: linux-2.6/include/linux/perfmon_dfl_smpl.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_dfl_smpl.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new dfl sampling buffer format + * for perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_DFL_SMPL_H__ +#define __PERFMON_DFL_SMPL_H__ 1 + +/* + * format specific parameters (passed at context creation) + */ +struct pfm_dfl_smpl_arg { + __u64 buf_size; /* size of the buffer in bytes */ + __u32 buf_flags; /* buffer specific flags */ + __u32 reserved1; /* for future use */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * It is directly followed by the first record. + */ +struct pfm_dfl_smpl_hdr { + __u64 hdr_count; /* how many valid entries */ + __u64 hdr_cur_offs; /* current offset from top of buffer */ + __u64 hdr_overflows; /* #overflows for buffer */ + __u64 hdr_buf_size; /* bytes in the buffer */ + __u64 hdr_min_buf_space;/* minimal buffer size (internal use) */ + __u32 hdr_version; /* smpl format version */ + __u32 hdr_buf_flags; /* copy of buf_flags */ + __u64 hdr_reserved[10]; /* for future use */ +}; + +/* + * Entry header in the sampling buffer. The header is directly followed + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * on how the session was programmed. + * + * In the case where multiple counters overflow at the same time, multiple + * entries are written consecutively. + * + * last_reset_value member indicates the initial value of the overflowed PMD. + */ +struct pfm_dfl_smpl_entry { + __u32 pid; /* thread id (for NPTL, this is gettid()) */ + __u16 ovfl_pmd; /* index of overflowed PMD for this sample */ + __u16 reserved; /* for future use */ + __u64 last_reset_val; /* initial value of overflowed PMD */ + __u64 ip; /* where did the overflow interrupt happened */ + __u64 tstamp; /* overflow timetamp */ + __u16 cpu; /* cpu on which the overfow occurred */ + __u16 set; /* event set active when overflow ocurred */ + __u32 tgid; /* thread group id (for NPTL, this is getpid())*/ +}; + +#define PFM_DFL_SMPL_VERSION_MAJ 1U +#define PFM_DFL_SMPL_VERSION_MIN 0U +#define PFM_DFL_SMPL_VERSION (((PFM_DFL_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_DFL_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_DFL_SMPL_H__ */ Index: linux-2.6/include/linux/perfmon_fmt.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_fmt.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for custom sampling buffer format modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
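The default-format layout above (header, then entries, each followed by the sampled PMD values in increasing index order) can be walked from user level; a sketch, assuming buf points at the mapped buffer and n_smpl_pmds is the number of PMDs recorded per sample by this session:

/*
 * Illustrative sketch (not part of the patch): walk the default
 * sampling buffer. buf and n_smpl_pmds are caller-supplied
 * assumptions; the perfmon headers above are assumed included.
 */
#include <stdio.h>

void walk_dfl_buffer(void *buf, unsigned int n_smpl_pmds)
{
	struct pfm_dfl_smpl_hdr *hdr = buf;
	struct pfm_dfl_smpl_entry *ent;
	__u64 i, *pmds;

	ent = (struct pfm_dfl_smpl_entry *)(hdr + 1);
	for (i = 0; i < hdr->hdr_count; i++) {
		pmds = (__u64 *)(ent + 1);	/* PMD values follow the entry */
		printf("pid=%u pmd%u ip=0x%llx\n",
		       ent->pid, ent->ovfl_pmd,
		       (unsigned long long)ent->ip);
		/* the next entry starts right after this entry's PMD values */
		ent = (struct pfm_dfl_smpl_entry *)(pmds + n_smpl_pmds);
	}
}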
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_FMT_H__ +#define __PERFMON_FMT_H__ 1 + +#include + +struct pfm_ovfl_arg { + u16 ovfl_pmd; /* index of overflowed PMD */ + u16 active_set; /* set active at the time of the overflow */ + u32 ovfl_ctrl; /* control flags */ + u64 pmd_last_reset; /* last reset value of overflowed PMD */ + u64 smpl_pmds_values[PFM_MAX_PMDS]; /* values of other PMDs */ + u64 pmd_eventid; /* eventid associated with PMD */ + u16 num_smpl_pmds; /* number of PMDS in smpl_pmd_values */ +}; + +/* + * ovfl_ctrl bitmask of flags + */ +#define PFM_OVFL_CTRL_NOTIFY 0x1 /* notify user */ +#define PFM_OVFL_CTRL_RESET 0x2 /* reset overflowed pmds */ +#define PFM_OVFL_CTRL_MASK 0x4 /* mask monitoring */ + + +typedef int (*fmt_validate_t )(u32 flags, u16 npmds, void *arg); +typedef int (*fmt_getsize_t)(u32 flags, void *arg, size_t *size); +typedef int (*fmt_init_t)(struct pfm_context *ctx, void *buf, u32 flags, u16 nmpds, void *arg); +typedef int (*fmt_restart_t)(int is_active, u32 *ovfl_ctrl, void *buf); +typedef int (*fmt_exit_t)(void *buf); +typedef int (*fmt_handler_t)(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 stamp, void *data); + +struct pfm_smpl_fmt { + char *fmt_name; /* name of the format (required) */ + size_t fmt_arg_size; /* size of fmt args for ctx create */ + u32 fmt_flags; /* format specific flags */ + u32 fmt_version; /* format version number */ + + fmt_validate_t fmt_validate; /* validate context flags */ + fmt_getsize_t fmt_getsize; /* get size for sampling buffer */ + fmt_init_t fmt_init; /* initialize buffer area */ + fmt_handler_t fmt_handler; /* overflow handler (required) */ + fmt_restart_t fmt_restart; /* restart after notification */ + fmt_exit_t fmt_exit; /* context termination */ + + struct list_head fmt_list; /* internal use only */ + + struct kobject kobj; /* sysfs internal use only */ + struct module *owner; /* pointer to module owner */ + u32 fmt_qdepth; /* Max notify queue depth (required) */ +}; +#define to_smpl_fmt(n) container_of(n, struct pfm_smpl_fmt, kobj) + +#define PFM_FMTFL_IS_BUILTIN 0x1 /* fmt is compiled in */ +/* + * we need to know whether the format is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_FMT_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_FMT_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_fmt_register(struct pfm_smpl_fmt *fmt); +int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt); +void pfm_sysfs_builtin_fmt_add(void); + +#endif /* __PERFMON_FMT_H__ */ Index: linux-2.6/include/linux/perfmon_pmu.h =================================================================== --- /dev/null +++ linux-2.6/include/linux/perfmon_pmu.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for PMU description modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
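A sampling-format module built on the interface above needs little more than the required overflow handler and a call to pfm_fmt_register(); a minimal sketch, with placeholder "demo" names and only the mandatory callbacks filled in (include set assumed):

/*
 * Illustrative sketch (not part of the patch): skeleton of a custom
 * sampling-format module. All "demo" names are placeholders.
 */
#include <linux/module.h>
#include <linux/perfmon.h>

static int demo_fmt_handler(void *buf, struct pfm_ovfl_arg *arg,
			    unsigned long ip, u64 stamp, void *data)
{
	/* ask the core to reset the overflowed PMD and keep monitoring */
	arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
	return 0;
}

static struct pfm_smpl_fmt demo_fmt = {
	.fmt_name    = "demo",
	.fmt_version = 0x10000,
	.fmt_handler = demo_fmt_handler,	/* required */
	.fmt_qdepth  = 1,			/* required: notify queue depth */
	.owner       = THIS_MODULE,
};

static int __init demo_fmt_init(void)
{
	return pfm_fmt_register(&demo_fmt);
}

static void __exit demo_fmt_exit(void)
{
	pfm_fmt_unregister(&demo_fmt);
}

module_init(demo_fmt_init);
module_exit(demo_fmt_exit);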
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_PMU_H__ +#define __PERFMON_PMU_H__ 1 + +/* + * generic information about a PMC or PMD register + */ +struct pfm_regmap_desc { + u16 type; /* role of the register */ + u16 reserved1; /* for future use */ + u32 reserved2; /* for future use */ + u64 dfl_val; /* power-on default value (quiescent) */ + u64 rsvd_msk; /* reserved bits: 1 means reserved */ + u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */ + unsigned long hw_addr; /* HW register address or index */ + struct kobject kobj; /* for internal use only */ + char *desc; /* HW register description string */ +}; +#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj) + +/* + * pfm_reg_desc declaration help macros + */ +#define PMC_D(t,d,v,r,n, h) \ + { .type = t, \ + .desc = d, \ + .dfl_val = v, \ + .rsvd_msk = r, \ + .no_emul64_msk = n, \ + .hw_addr = h \ + } + +#define PMD_D(t,d, h) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMX_NA \ + { .type = PFM_REG_NA } + +/* + * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type + */ +#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */ +#define PFM_REG_I 0x01 /* implemented */ +#define PFM_REG_WC 0x02 /* has write_checker */ +#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */ +#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */ +#define PFM_REG_V 0x10 /* PMD: virtual reg (provided by PMU description) */ +#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */ + +/* + * define some shortcuts for common types + */ +#define PFM_REG_W (PFM_REG_WC|PFM_REG_I) +#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_C (PFM_REG_C64|PFM_REG_I) +#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I) + +typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req); + +typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmd *req); + + +typedef u64 (*pfm_pmd_sread_t)(struct pfm_context *ctx, unsigned int cnum); +typedef void (*pfm_pmd_swrite_t)(struct pfm_context *ctx, unsigned int cnum, u64 val); + +/* + * registers description + */ +struct pfm_regdesc { + u64 pmcs[PFM_PMC_BV]; /* available PMC */ + u64 pmds[PFM_PMD_BV]; /* available PMD */ + u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */ + u64 cnt_pmds[PFM_PMD_BV]; /* available counting PMD (counters) */ + u16 max_pmc; /* highest+1 avail PMC */ + u16 max_pmd; /* highest+1 avail PMD */ + u16 max_rw_pmd; /* highest+1 avail RW PMD */ + u16 first_cnt_pmd; /* first counting PMD */ + u16 max_cnt_pmd; /* highest+1 avail counter */ + u16 num_rw_pmd; /* number of avail RW PMD */ + u16 num_pmcs; /* logical PMCS */ + u16 num_pmds; /* logical PMDS */ + u16 num_counters; /* PMC/PMD counter pairs */ +}; + +/* + * structure used by pmu description modules + * + * probe_pmu() routine return value: + * - 1 means recognized PMU + * - 0 means not recognized PMU + */ +struct pfm_pmu_config { + char *pmu_name; /* PMU family name */ + char *version; /* config module version number */ + + int counter_width; /* width of hardware counter */ + + struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */ + struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */ + + 
pfm_pmc_check_t pmc_write_check;/* PMC write checker callback (optional) */ + pfm_pmd_check_t pmd_write_check;/* PMD write checker callback (optional) */ + pfm_pmd_check_t pmd_read_check; /* PMD read checker callback (optional) */ + + pfm_pmd_sread_t pmd_sread; /* PMD model specific read (optional) */ + pfm_pmd_swrite_t pmd_swrite; /* PMD model specific write (optional) */ + + int (*probe_pmu)(void);/* probe PMU routine */ + + u16 num_pmc_entries;/* number of entries in pmc_desc */ + u16 num_pmd_entries;/* number of entries in pmd_desc */ + + void *arch_info; /* arch-specific information */ + u32 flags; /* set of flags */ + + struct module *owner; /* pointer to module struct */ + + /* + * fields computed internally, do not set in module + */ + struct pfm_regdesc regs; /* registers currently available */ + struct pfm_regdesc full_regs; /* registers presented by module */ + + u64 ovfl_mask; /* overflow mask */ + struct kobject kobj; /* for internal use only */ +}; +#define to_pmu(n) container_of(n, struct pfm_pmu_config, kobj) + +/* + * pfm_pmu_config flags + */ +#define PFM_PMUFL_IS_BUILTIN 0x1 /* pmu config is compiled in */ + +/* + * we need to know whether the PMU description is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_PMU_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_PMU_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_pmu_register(struct pfm_pmu_config *cfg); +void pfm_pmu_unregister(struct pfm_pmu_config *cfg); + +#endif /* __PERFMON_PMU_H__ */ Index: linux-2.6/include/linux/sched.h =================================================================== --- linux-2.6.orig/include/linux/sched.h +++ linux-2.6/include/linux/sched.h @@ -89,6 +89,7 @@ struct sched_param { struct exec_domain; struct futex_pi_state; struct bio; +struct pfm_context; /* * List of flags we want to share for kernel threads, @@ -1076,6 +1077,9 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif +#ifdef CONFIG_PERFMON + struct pfm_context *pfm_context; +#endif }; static inline pid_t process_group(struct task_struct *tsk) Index: linux-2.6/include/linux/syscalls.h =================================================================== --- linux-2.6.orig/include/linux/syscalls.h +++ linux-2.6/include/linux/syscalls.h @@ -29,6 +29,13 @@ struct msqid_ds; struct new_utsname; struct nfsctl_arg; struct __old_kernel_stat; +struct pfarg_ctx; +struct pfarg_pmc; +struct pfarg_pmd; +struct pfarg_start; +struct pfarg_load; +struct pfarg_setinfo; +struct pfarg_setdesc; struct pollfd; struct rlimit; struct rusage; @@ -613,4 +620,27 @@ asmlinkage long sys_eventfd(unsigned int int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, + void __user *uarg, size_t smpl_size); +asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, + int count); +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_delete_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); +asmlinkage long 
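Putting the pieces of pfm_pmu_config together, a PMU description module reduces to a pair of register tables, a probe routine, and a call to pfm_pmu_register(); a minimal sketch, in which the register names, hardware addresses and counter width are all placeholders:

/*
 * Illustrative sketch (not part of the patch): skeleton of a PMU
 * description module with one PMC/PMD pair. Names, MSR addresses and
 * the counter width are placeholders; include set assumed.
 */
#include <linux/module.h>
#include <linux/perfmon.h>

static struct pfm_regmap_desc demo_pmc_desc[] = {
	PMC_D(PFM_REG_W, "DEMO_EVTSEL0", 0x0, ~0xffffULL, 0, 0x186),
};

static struct pfm_regmap_desc demo_pmd_desc[] = {
	PMD_D(PFM_REG_C, "DEMO_CTR0", 0xc1),
};

static int demo_probe_pmu(void)
{
	return 1;	/* 1 = PMU recognized, 0 = not recognized */
}

static struct pfm_pmu_config demo_pmu_conf = {
	.pmu_name        = "Demo PMU",
	.counter_width   = 40,		/* placeholder hardware counter width */
	.pmc_desc        = demo_pmc_desc,
	.num_pmc_entries = 1,
	.pmd_desc        = demo_pmd_desc,
	.num_pmd_entries = 1,
	.probe_pmu       = demo_probe_pmu,
	.flags           = PFM_PMU_BUILTIN_FLAG,
	.owner           = THIS_MODULE,
};

static int __init demo_pmu_init(void)
{
	return pfm_pmu_register(&demo_pmu_conf);
}

static void __exit demo_pmu_exit(void)
{
	pfm_pmu_unregister(&demo_pmu_conf);
}

module_init(demo_pmu_init);
module_exit(demo_pmu_exit);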
sys_pfm_create_evtsets(int fd, + struct pfarg_setdesc __user *ureq, + int count); +asmlinkage long sys_pfm_getinfo_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); + #endif Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c +++ linux-2.6/kernel/sched.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include Index: linux-2.6/kernel/sys_ni.c =================================================================== --- linux-2.6.orig/kernel/sys_ni.c +++ linux-2.6/kernel/sys_ni.c @@ -113,6 +113,19 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_pfm_create_context); +cond_syscall(sys_pfm_write_pmcs); +cond_syscall(sys_pfm_write_pmds); +cond_syscall(sys_pfm_read_pmds); +cond_syscall(sys_pfm_restart); +cond_syscall(sys_pfm_start); +cond_syscall(sys_pfm_stop); +cond_syscall(sys_pfm_load_context); +cond_syscall(sys_pfm_unload_context); +cond_syscall(sys_pfm_create_evtsets); +cond_syscall(sys_pfm_delete_evtsets); +cond_syscall(sys_pfm_getinfo_evtsets); + /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); Index: linux-2.6/perfmon/Makefile =================================================================== --- /dev/null +++ linux-2.6/perfmon/Makefile @@ -0,0 +1,8 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) = perfmon.o perfmon_rw.o perfmon_res.o perfmon_fmt.o \ + perfmon_pmu.o perfmon_sysfs.o perfmon_syscalls.o \ + perfmon_file.o perfmon_ctxsw.o perfmon_intr.o \ + perfmon_dfl_smpl.o perfmon_sets.o Index: linux-2.6/perfmon/perfmon.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon.c @@ -0,0 +1,1692 @@ +/* + * perfmon.c: perfmon2 core functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://www.hpl.hp.com/research/linux/perfmon + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * internal variables + */ +static struct kmem_cache *pfm_ctx_cachep; + +/* + * external variables + */ +DEFINE_PER_CPU(u32, pfm_syst_info); +DEFINE_PER_CPU(struct task_struct *, pmu_owner); +DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); +DEFINE_PER_CPU(u64, pmu_activation_number); +DEFINE_PER_CPU(struct pfm_stats, pfm_stats); +EXPORT_PER_CPU_SYMBOL(pmu_ctx); + +#define PFM_INVALID_ACTIVATION ((u64)~0) + +int perfmon_disabled; /* >0 if perfmon is disabled */ + +/* + * Reset PMD register flags + */ +#define PFM_PMD_RESET_NONE 0 /* do not reset (pfm_switch_set) */ +#define PFM_PMD_RESET_SHORT 1 /* use short reset value */ +#define PFM_PMD_RESET_LONG 2 /* use long reset value */ + +/* + * get a new message slot from the queue. If the queue is full NULL + * is returned and monitoring stops. + */ +union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx) +{ + int next; + + next = ctx->msgq_head & PFM_MSGQ_MASK; + + if ((ctx->msgq_head - ctx->msgq_tail) == PFM_MSGS_COUNT) + return NULL; + + /* + * move to next possible slot + */ + ctx->msgq_head++; + + PFM_DBG_ovfl("head=%d tail=%d msg=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + next); + + return ctx->msgq+next; +} +EXPORT_SYMBOL(pfm_get_new_msg); + +void pfm_context_free(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + + pfm_arch_context_free(ctx); + + fmt = ctx->smpl_fmt; + + pfm_free_sets(ctx); + + if (ctx->smpl_addr) { + PFM_DBG("freeing sampling buffer @%p size=%zu", + ctx->smpl_addr, + ctx->smpl_size); + + pfm_release_buf_space(ctx, ctx->smpl_size); + + if (fmt->fmt_exit) + (*fmt->fmt_exit)(ctx->smpl_addr); + + vfree(ctx->smpl_addr); + } + + PFM_DBG("free ctx @%p", ctx); + kmem_cache_free(pfm_ctx_cachep, ctx); + /* + * decrease refcount on: + * - PMU description table + * - sampling format + */ + pfm_pmu_conf_put(); + pfm_smpl_fmt_put(fmt); + pfm_pmu_release(); +} + +/* + * only called in for the current task + */ +static int pfm_setup_smpl_fmt(struct pfm_smpl_fmt *fmt, void *fmt_arg, + struct pfm_context *ctx, u32 ctx_flags, + int mode, struct file *filp) +{ + size_t size = 0; + int ret = 0; + + /* + * validate parameters + */ + if (fmt->fmt_validate) { + ret = (*fmt->fmt_validate)(ctx_flags, pfm_pmu_conf->regs.num_pmds, + fmt_arg); + PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret); + if (ret) + goto error; + } + + /* + * check if buffer format wants to use perfmon + * buffer allocation/mapping service + */ + size = 0; + if (fmt->fmt_getsize) { + ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size); + if (ret) { + PFM_DBG("cannot get size ret=%d", ret); + goto error; + } + } + + if (size) { + if (mode == PFM_COMPAT) + ret = pfm_smpl_buffer_alloc_compat(ctx, size, filp); + else + ret = pfm_smpl_buffer_alloc(ctx, size); + if (ret) + goto error; + + } + + if (fmt->fmt_init) { + ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags, + pfm_pmu_conf->regs.num_pmds, + fmt_arg); + if (ret) + goto error_buffer; + } + return 0; + +error_buffer: + pfm_release_buf_space(ctx, ctx->smpl_size); + /* + * we do not call fmt_exit, if init has failed + */ + vfree(ctx->smpl_addr); +error: + return ret; +} + +/* + * interrupts are masked when entering this function. 
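/*
 * Illustrative note (not in the original patch): msgq_head and
 * msgq_tail are free-running counters. With PFM_MSGS_ORDER = 3 the
 * queue has PFM_MSGS_COUNT = 8 slots, a counter maps to a slot via
 * (counter & PFM_MSGQ_MASK) with the mask presumably defined as
 * PFM_MSGS_COUNT - 1, and the queue is full when
 * (msgq_head - msgq_tail) == PFM_MSGS_COUNT, as tested in
 * pfm_get_new_msg() above.
 */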
+ * context must be in MASKED state when calling. + */ +static void pfm_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + if (ctx->state != PFM_CTX_MASKED) + return; + + PFM_DBG_ovfl("unmasking monitoring"); + + /* + * must be done before calling + * pfm_arch_unmask_monitoring() + */ + ctx->state = PFM_CTX_LOADED; + + /* + * we need to restore the PMDs because they + * may have been modified by user while MASKED in which + * case the actual registers were not updated + * + * XXX: could be avoided in system-wide mode + */ + pfm_arch_restore_pmds(ctx, set); + + pfm_arch_unmask_monitoring(ctx, set); + + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + /* + * reset set duration timer + */ + set->duration_start = sched_clock(); +} + +/* + * called from pfm_smpl_buffer_alloc_old() (IA64-COMPAT) + * and pfm_setup_smpl_fmt() + * + * interrupts are enabled, context is not locked. + */ +int pfm_smpl_buffer_alloc(struct pfm_context *ctx, size_t rsize) +{ + void *addr; + size_t size; + int ret; + + might_sleep(); + + /* + * align page boundary + */ + size = PAGE_ALIGN(rsize); + + PFM_DBG("buffer req_size=%zu actual_size=%zu before", rsize, size); + + ret = pfm_reserve_buf_space(size); + if (ret) + return ret; + + PFM_DBG("buffer req_size=%zu actual_size=%zu after", rsize, size); + /* + * vmalloc can sleep. we do not hold + * any spinlock and interrupts are enabled + */ + addr = vmalloc(size); + if (!addr) { + PFM_DBG("cannot allocate sampling buffer"); + goto unres; + } + + memset(addr, 0, size); + + ctx->smpl_addr = addr; + ctx->smpl_size = size; + + PFM_DBG("kernel smpl buffer @%p", addr); + + return 0; +unres: + PFM_DBG("buffer req_size=%zu actual_size=%zu error", rsize, size); + pfm_release_buf_space(ctx, size); + return -ENOMEM; +} + +void pfm_reset_pmds(struct pfm_context *ctx, + struct pfm_event_set *set, + int num_pmds, + int reset_mode) +{ + u64 val, mask, new_seed; + struct pfm_pmd *reg; + unsigned int i, not_masked; + + not_masked = ctx->state != PFM_CTX_MASKED; + + PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d", + reset_mode == PFM_PMD_RESET_LONG ? "long" : "short", + (unsigned long long)set->reset_pmds[0], + not_masked); + + __get_cpu_var(pfm_stats).reset_pmds_count++; + + for (i = 0; num_pmds; i++) { + if (test_bit(i, cast_ulp(set->reset_pmds))) { + num_pmds--; + + reg = set->pmds + i; + + val = reset_mode == PFM_PMD_RESET_LONG ? reg->long_reset : reg->short_reset; + + if (reg->flags & PFM_REGFL_RANDOM) { + mask = reg->mask; + new_seed = random32(); + + /* construct a full 64-bit random value: */ + if ((unlikely(mask >> 32) != 0)) + new_seed |= (u64)random32() << 32; + + /* counter values are negative numbers! */ + val -= (new_seed & mask); + } + + set->pmds[i].value = val; + reg->lval = val; + + /* + * not all PMD to reset are necessarily + * counters + */ + if (not_masked) + pfm_write_pmd(ctx, i, val); + + PFM_DBG_ovfl("set%u pmd%u sval=0x%llx", + set->id, + i, + (unsigned long long)val); + } + } + + /* + * done with reset + */ + bitmap_zero(cast_ulp(set->reset_pmds), i); + + /* + * make changes visible + */ + if (not_masked) + pfm_arch_serialize(); +} + +/* + * called from pfm_handle_work() and __pfm_restart() + * for system-wide and per-thread context to resume + * monitoring after a user level notification. + * + * In both cases, the context is locked and interrupts + * are disabled. 
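/*
 * Illustrative note (not in the original patch): counters are loaded
 * with negative values, so a sampling period P is programmed as -P
 * (modulo the counter width) and the counter overflows after P events.
 * With PFM_REGFL_RANDOM, pfm_reset_pmds() above subtracts a random
 * value bounded by reg_random_mask from the reset value, so the
 * effective period varies between P and P + mask events.
 */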
+ */ +static void pfm_resume_after_ovfl(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + u32 rst_ctrl; + struct pfm_event_set *set; + u64 *reset_pmds; + void *hdr; + int state, ret; + + hdr = ctx->smpl_addr; + fmt = ctx->smpl_fmt; + state = ctx->state; + set = ctx->active_set; + ret = 0; + + if (hdr) { + rst_ctrl = 0; + prefetch(hdr); + } else + rst_ctrl= PFM_OVFL_CTRL_RESET; + + /* + * if using a sampling buffer format and it has a restart callback, + * then invoke it. hdr may be NULL, it the format does not use a + * perfmon buffer + */ + if (fmt && fmt->fmt_restart) + ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED, &rst_ctrl, hdr); + + reset_pmds = set->reset_pmds; + + PFM_DBG("restart=%d set=%u r_pmds=0x%llx switch=%d ctx_state=%d", + ret, + set->id, + (unsigned long long)reset_pmds[0], + (set->priv_flags & PFM_SETFL_PRIV_SWITCH), + state); + + if (!ret) { + /* + * switch set if needed + */ + if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) { + set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH; + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0); + set = ctx->active_set; + } else if (rst_ctrl & PFM_OVFL_CTRL_RESET) { + int nn; + nn = bitmap_weight(cast_ulp(set->reset_pmds), + pfm_pmu_conf->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_LONG); + } + + if (!(rst_ctrl & PFM_OVFL_CTRL_MASK)) + pfm_unmask_monitoring(ctx, set); + else + PFM_DBG("stopping monitoring?"); + ctx->state = PFM_CTX_LOADED; + } + ctx->flags.can_restart = 0; +} + +/* + * This function is always called after pfm_stop has been issued + */ +static void pfm_flush_pmds(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_event_set *set; + u64 ovfl_mask; + u64 *ovfl_pmds; + u32 num_ovfls; + u16 i, first_cnt_pmd; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + + set = ctx->active_set; + + /* + * save active set + * UP: + * if not current task and due to lazy, state may + * still be live + * for system-wide, guaranteed to run on correct CPU + */ + if (__get_cpu_var(pmu_ctx) == ctx) { + /* + * pending overflows have been saved by pfm_stop() + */ + pfm_save_pmds(ctx, set); + pfm_set_pmu_owner(NULL, NULL); + PFM_DBG("released ownership"); + } + + /* + * cleanup each set + */ + list_for_each_entry(set, &ctx->list, list) { + if (!set->npend_ovfls) + continue; + + /* + * take care of overflow + * no format handler is called here + */ + ovfl_pmds = set->povfl_pmds; + num_ovfls = set->npend_ovfls; + + PFM_DBG("set%u first=%u novfls=%u", + set->id, first_cnt_pmd, num_ovfls); + /* + * only look up to the last counting PMD register + */ + for (i = first_cnt_pmd; num_ovfls; i++) { + if (test_bit(i, cast_ulp(ovfl_pmds))) { + set->pmds[i].value += 1 + ovfl_mask; + num_ovfls--; + PFM_DBG("pmd%u overflowed", i); + } + PFM_DBG("pmd%u set=%u val=0x%llx", + i, + set->id, + (unsigned long long)set->pmds[i].value); + } + } +} + +/* + * This function is called when we need to perform asynchronous + * work on a context. This function is called ONLY when about to + * return to user mode (very much like with signals handling). + * + * There are several reasons why we come here: + * + * - per-thread mode, not self-monitoring, to reset the counters + * after a pfm_restart() by the thread controlling the context + * + * - because we are zombie and we need to cleanup our state + * + * - because we need to block after an overflow notification + * on a context with the PFM_OVFL_NOTIFY_BLOCK flag + * + * This function is never called for a system-wide context. 
+ * + * pfm_handle_work() can be called with interrupts enabled + * (TIF_NEED_RESCHED) or disabled. The down_interruptible + * call may sleep, therefore we must re-enable interrupts + * to avoid deadlocks. It is safe to do so because this function + * is called ONLY when returning to user level, in which case + * there is no risk of kernel stack overflow due to deep + * interrupt nesting. + */ +void pfm_handle_work(struct pt_regs *regs) +{ + struct pfm_context *ctx; + unsigned long flags, dummy_flags; + int type, ret, can_release; + +#ifdef CONFIG_PPC + /* + * This is just a temporary fix. Obviously we'd like to fix the powerpc + * code to make that check before calling __pfm_handle_work() to + * prevent the function call overhead, but the call is made from assembly + * code, so it will take a little while to figure out how to perform the + * check correctly. + */ + if (!test_thread_flag(TIF_PERFMON_WORK)) + return; +#endif + + if (!user_mode(regs)) + return; + + //BUG_ON(!irqs_disabled()); + + clear_thread_flag(TIF_PERFMON_WORK); + + __get_cpu_var(pfm_stats).handle_work_count++; + + ctx = current->pfm_context; + if (ctx == NULL) { + PFM_ERR("handle_work [%d] has no ctx", current->pid); + return; + } + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + type = ctx->flags.work_type; + ctx->flags.work_type = PFM_WORK_NONE; + + /* + * must be done before we check for simple reset mode + */ + if (type == PFM_WORK_ZOMBIE) + goto do_zombie; + + if (type == PFM_WORK_RESET) { + PFM_DBG("counter reset"); + goto skip_blocking; + } + + /* + * restore interrupt mask to what it was on entry. + * Could be enabled/disabled. + */ + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * force interrupt enable because of down_interruptible() + */ + local_irq_enable(); + + PFM_DBG("before block sleeping"); + + /* + * may go through without blocking on SMP systems + * if restart has been received already by the time we call down() + */ + ret = wait_for_completion_interruptible(&ctx->restart_complete); + + PFM_DBG("after block sleeping ret=%d", ret); + + /* + * lock context and mask interrupts again + * We save flags into a dummy because we may have + * altered interrupts mask compared to entry in this + * function. + */ + spin_lock_irqsave(&ctx->lock, dummy_flags); + + if (ctx->state == PFM_CTX_ZOMBIE) + goto do_zombie; + + /* + * in case of interruption of down() we don't restart anything + */ + if (ret < 0) + goto nothing_to_do; + +skip_blocking: + pfm_resume_after_ovfl(ctx); + +nothing_to_do: + /* + * restore flags as they were upon entry + */ + spin_unlock_irqrestore(&ctx->lock, flags); + return; + +do_zombie: + PFM_DBG("context is zombie, bailing out"); + + __pfm_unload_context(ctx, &can_release); + + /* + * keep the spinlock check happy + */ + spin_unlock(&ctx->lock); + + /* + * enable interrupt for vfree() + */ + local_irq_enable(); + + /* + * actual context free + */ + pfm_context_free(ctx); + + /* + * restore interrupts as they were upon entry + */ + local_irq_restore(flags); + + /* always true */ + if (can_release) + pfm_release_session(0, 0); +} + +int pfm_notify_user(struct pfm_context *ctx) +{ + if (ctx->state == PFM_CTX_ZOMBIE) { + PFM_DBG("ignoring overflow notification, owner is zombie"); + return 0; + } + PFM_DBG("waking up somebody"); + + wake_up_interruptible(&ctx->msgq_wait); + + /* + * it is safe to call kill_fasync() from an interrupt + * handler. kill_fasync() grabs two RW locks (fasync_lock, + * tasklist_lock) in read mode. 
There is conflict only in + * case the PMU interrupt occurs during a write mode critical + * section. This cannot happen because for both locks, the + * write mode is always using interrupt masking (write_lock_irq). + */ + kill_fasync (&ctx->async_queue, SIGIO, POLL_IN); + + return 0; +} +EXPORT_SYMBOL(pfm_notify_user); + +/* + * send a counter overflow notification message to + * user. First appends the message to the queue, then + * wake up ay waiter on the file descriptor + * + * context is locked and interrupts are disabled (no preemption). + */ +int pfm_ovfl_notify_user(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip) +{ + union pfarg_msg *msg = NULL; + int max_cnt_pmd; + u64 *ovfl_pmds; + + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + if (!ctx->flags.no_msg) { + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + /* + * when message queue fills up it is because the user + * did not extract the message, yet issued + * pfm_restart(). At this point, we stop sending + * notification, thus the user will not be able to get + * new samples when using the default format. + */ + PFM_DBG_ovfl("no more notification msgs"); + return -1; + } + + msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; + msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid; + msg->pfm_ovfl_msg.msg_active_set = set->id; + + ovfl_pmds = msg->pfm_ovfl_msg.msg_ovfl_pmds; + + bitmap_copy(cast_ulp(ovfl_pmds), cast_ulp(set->ovfl_pmds), + max_cnt_pmd); + + msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id(); + msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid; + msg->pfm_ovfl_msg.msg_ovfl_ip = ip; + + __get_cpu_var(pfm_stats).ovfl_notify_count++; + } + + PFM_DBG("ovfl msg: ip=0x%lx o_pmds=0x%llx", + ip, + (unsigned long long)set->ovfl_pmds[0]); + + return pfm_notify_user(ctx); +} + +/* + * In per-thread mode, when not self-monitoring, perfmon + * send a 'end' notification message when the monitored + * thread where the context is attached is exiting. + * + * This helper message alleviate the need to track the activity + * of the thread/process when it is not directly related, i.e., + * was attached vs was forked/execd. + * + * This function appends the 'end' notification message to the + * queue. + * + * the context must be locked and interrupts disabled. 
+ */ +static int pfm_end_notify_user(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + PFM_ERR("%s no more msgs", __FUNCTION__); + return -1; + } + /* no leak */ + memset(msg, 0, sizeof(*msg)); + + msg->type = PFM_MSG_END; + + PFM_DBG("end msg: msg=%p no_msg=%d", + msg, + ctx->flags.no_msg); + + return pfm_notify_user(ctx); +} + +/* + * called only from exit_thread(): task == current + * we come here only if current has a context + * attached (loaded or masked or zombie) + */ +void __pfm_exit_thread(struct task_struct *task) +{ + struct pfm_context *ctx; + unsigned long flags; + int free_ok = 0, can_release = 0; + + ctx = task->pfm_context; + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + PFM_DBG("state=%d", ctx->state); + + /* + * __pfm_unload_context() cannot fail + * in the context states we are interested in + */ + switch (ctx->state) { + case PFM_CTX_LOADED: + case PFM_CTX_MASKED: + __pfm_unload_context(ctx, &can_release); + pfm_end_notify_user(ctx); + break; + case PFM_CTX_ZOMBIE: + __pfm_unload_context(ctx, &can_release); + free_ok = 1; + break; + default: + BUG_ON(ctx->state != PFM_CTX_LOADED); + break; + } + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(0, 0); + + /* + * All memory free operations (especially for vmalloc'ed memory) + * MUST be done with interrupts ENABLED. + */ + if (free_ok) + pfm_context_free(ctx); +} + +/* + * CPU hotplug event nofication callback + * + * We use the callback to do manage the sysfs interface. + * Note that the actual shutdown of monitoring on the CPU + * is done in pfm_cpu_disable(), see comments there for more + * information. + */ +static int pfm_cpu_notify(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int ret = NOTIFY_OK; + + pfm_pmu_conf_get(0); + + switch (action) { + case CPU_ONLINE: + pfm_sysfs_add_cpu(cpu); + PFM_INFO("CPU%d is online", cpu); + break; + case CPU_UP_PREPARE: + PFM_INFO("CPU%d prepare online", cpu); + break; + case CPU_UP_CANCELED: + pfm_sysfs_del_cpu(cpu); + PFM_INFO("CPU%d is up canceled", cpu); + break; + case CPU_DOWN_PREPARE: + PFM_INFO("CPU%d prepare offline", cpu); + break; + case CPU_DOWN_FAILED: + PFM_INFO("CPU%d is down failed", cpu); + break; + case CPU_DEAD: + pfm_sysfs_del_cpu(cpu); + PFM_INFO("CPU%d is offline", cpu); + break; + } + pfm_pmu_conf_put(); + return ret; +} + +static struct notifier_block pfm_cpu_notifier ={ + .notifier_call = pfm_cpu_notify +}; + +/* + * called from cpu_init() and pfm_pmu_register() + */ +void __pfm_init_percpu(void *dummy) +{ + pfm_arch_init_percpu(); +} + +/* + * global initialization routine, executed only once + */ +int __init pfm_init(void) +{ + int ret; + + PFM_LOG("version %u.%u, compiled: " __DATE__ ", " __TIME__, + PFM_VERSION_MAJ, PFM_VERSION_MIN); + + pfm_ctx_cachep = kmem_cache_create("pfm_context", + sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE, + SLAB_HWCACHE_ALIGN, 0, NULL, NULL); + if (pfm_ctx_cachep == NULL) { + PFM_ERR("cannot initialize context slab"); + goto error_disable; + } + ret = pfm_sets_init(); + if (ret) + goto error_disable; + + if (pfm_init_fs()) + goto error_disable; + + if (pfm_init_sysfs()) + goto error_disable; + + /* + * one time, arch-specific global initialization + */ + if (pfm_arch_init()) + goto error_disable; + + /* + * register CPU hotplug event notifier + */ + ret = register_cpu_notifier(&pfm_cpu_notifier); + if (!ret) + return 0; + +error_disable: + 
PFM_INFO("perfmon is disabled due to initialization error"); + perfmon_disabled = 1; + return -1; +} + +/* + * must use subsys_initcall() to ensure that the perfmon2 core + * is initialized before any PMU description module when they are + * compiled in. + */ +subsys_initcall(pfm_init); + +/* + * function used to start monitoring. When operating in per-thread + * mode and when not self-monitoring, the monitored thread must be + * stopped. + * + * The pfarg_start argument is optional and may be used to designate + * the initial event set to activate. Wehn missing, the last active + * set is used. For the first activation, set0 is used. + * + * On some architectures, e.g., IA-64, it may be possible to start monitoring + * without calling this function under certain conditions (per-thread and self + * monitoring). + * + * the context is locked and interrupts are disabled. + */ +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start) +{ + struct task_struct *task, *owner_task; + struct pfm_event_set *new_set, *old_set; + u64 now; + int is_self; + + task = ctx->task; + + /* + * context must be loaded. + * we do not support starting while in MASKED state + * (mostly because of set switching issues) + */ + if (ctx->state != PFM_CTX_LOADED) + return -EINVAL; + + old_set = new_set = ctx->active_set; + + /* + * always the case for system-wide + */ + if (task == NULL) + task = current; + + is_self = task == current; + + /* + * argument is provided? + */ + if (start) { + /* + * find the set to load first + */ + new_set = pfm_find_set(ctx, start->start_set, 0); + if (new_set == NULL) { + PFM_DBG("event set%u does not exist", + start->start_set); + return -EINVAL; + } + } + + PFM_DBG("cur_set=%u req_set=%u", + old_set->id, + new_set->id); + + /* + * if we need to change the active set we need + * to check if we can access the PMU + */ + if (new_set != old_set) { + owner_task = __get_cpu_var(pmu_owner); + /* + * system-wide: must run on the right CPU + * per-thread : must be the owner of the PMU context + * + * pfm_switch_sets() returns with monitoring stopped + */ + if (is_self) { + pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1); + } else { + /* + * In a UP kernel, the PMU may contain the state + * of the task we want to operate on, yet the task + * may be switched out (lazy save). We need to save + * current state (old_set), switch active_set and + * mark it for reload. + */ + if (owner_task == task) + pfm_save_pmds(ctx, old_set); + ctx->active_set = new_set; + new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + } + } + /* + * mark as started, must be done before calling + * pfm_arch_start() + */ + ctx->flags.started = 1; + + now = sched_clock(); + + pfm_arch_start(task, ctx, new_set); + + /* + * we check whether we had a pending ovfl before restarting. + * If so we need to regenerate the interrupt to make sure we + * keep recorded samples. For non-self monitoring this check + * is done in the pfm_ctxswin_thread() routine. + */ + if (is_self && new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + /* + * we restart total duration even if context was + * already started. In that case, counts are simply + * reset. + * + * For per-thread, if not self-monitoring, the statement + * below will have no effect because thread is stopped. + * The field is reset of ctxsw in. + */ + new_set->duration_start = now; + + return 0; +} + +/* + * function used to stop monitoring. 
When operating in per-thread + * mode and when not self-monitoring, the monitored thread must be + * stopped. + * + * the context is locked and interrupts are disabled. + */ +int __pfm_stop(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + struct task_struct *task; + u64 now; + int state; + + now = sched_clock(); + state = ctx->state; + set = ctx->active_set; + + /* + * context must be attached (zombie cannot happen) + */ + if (state == PFM_CTX_UNLOADED) + return -EINVAL; + + task = ctx->task; + + PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d", + task ? task->pid : -1, + state, + ctx->flags.system); + + /* + * this happens for system-wide context + */ + if (task == NULL) + task = current; + + /* + * compute elapsed time + * don't update set duration if masked + */ + if (task == current && state == PFM_CTX_LOADED) + set->duration += now - set->duration_start; + + pfm_arch_stop(task, ctx, set); + + ctx->flags.started = 0; + /* + * starting now, in-flight PMU interrupt for this context + * are treated as spurious + */ + return 0; +} + +/* + * function called from sys_pfm_restart(). It is used when overflow + * notification is requested. For each notification received, the user + * must call pfm_restart() to indicate to the kernel that it is done + * processing the notification. + * + * When the caller is doing user level sampling, this function resets + * the overflowed counters and resumes monitoring which is normally stopped + * during notification (always the consequence of a counter overflow). + * + * When using a sampling format, the format restart() callback is invoked, + * overflowed PMDS may be reset based upon decision from sampling format. + * + * When operating in per-thread mode, and when not self-monitoring, the + * monitored thread DOES NOT need to be stopped, unlike for many other calls. + * + * This means that the effect of the restart may not necessarily be observed + * right when returning from the call. For instance, counters may not already + * be reset in the other thread. + * + * When operating in system-wide, the caller must be running on the monitored + * CPU. + * + * The context is locked and interrupts are disabled. + * + */ +int __pfm_restart(struct pfm_context *ctx, int *complete_needed) +{ + int state; + + + state = ctx->state; + + PFM_DBG("state=%d", state); + + *complete_needed = 0; + + if (state != PFM_CTX_MASKED && state != PFM_CTX_LOADED) { + PFM_DBG("invalid state=%d", state); + return -EBUSY; + } + + __get_cpu_var(pfm_stats).pfm_restart_count++; + /* + * at this point, the context is either LOADED or MASKED + */ + + if (ctx->task == current || ctx->flags.system) { + pfm_resume_after_ovfl(ctx); + return 0; + } + + /* + * restart another task + */ + + /* + * When PFM_CTX_MASKED, we cannot issue a restart before the previous + * one is seen by the task. + */ + if (state == PFM_CTX_MASKED) { + if (!ctx->flags.can_restart) { + PFM_DBG("cannot restart can_restart=%d", + ctx->flags.can_restart); + return -EBUSY; + } + /* + * prevent subsequent restart before this one is + * seen by the task + */ + ctx->flags.can_restart = 0; + } + + /* + * if blocking, then post the semaphore is PFM_CTX_MASKED, i.e. + * the task is blocked or on its way to block. That's the normal + * restart path. If the monitoring is not masked, then the task + * can be actively monitoring and we cannot directly intervene. + * Therefore we use the trap mechanism to catch the task and + * force it to reset the buffer/reset PMDs. 
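On the user side, the restart protocol described above amounts to reading a notification message from the context file descriptor and then issuing pfm_restart(); a minimal sketch, assuming fd is the context file descriptor, overflow notifications are enabled, read() on the descriptor returns one pfarg_msg at a time, and pfm_restart() is a thin wrapper around the system call:

/*
 * Illustrative sketch (not part of the patch): user-level notification
 * loop. fd, the read() semantics of the context descriptor and the
 * pfm_restart() wrapper are assumptions.
 */
#include <unistd.h>

void notification_loop(int fd)
{
	union pfarg_msg msg;

	for (;;) {
		if (read(fd, &msg, sizeof(msg)) != sizeof(msg))
			break;
		if (msg.type == PFM_MSG_END)
			break;			/* monitored task exited */
		/* PFM_MSG_OVFL: process the sample(s) ... */
		pfm_restart(fd);		/* resume monitoring */
	}
}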
+ * + * if non-blocking, then we ensure that the task will go into + * pfm_handle_work() before returning to user mode. + * + * We cannot explicitly reset another task, it MUST always + * be done by the task itself. This works for system wide because + * the tool that is controlling the session is logically doing + * "self-monitoring". + */ + if (ctx->flags.block && state == PFM_CTX_MASKED) { + PFM_DBG("unblocking [%d]", ctx->task->pid); + /* + * It is not possible to call complete() with the context locked + * otherwise we have a potential deadlock with the PMU context + * switch code due to a lock inversion between task_rq_lock() + * and the context lock. + * Instead we mark whether or not we need to issue the complete + * and we invoke the function once the context lock is released + * in sys_pfm_restart() + */ + *complete_needed = 1; + } else { + PFM_DBG("[%d] armed exit trap", ctx->task->pid); + ctx->flags.work_type = PFM_WORK_RESET; + set_tsk_thread_flag(ctx->task, TIF_PERFMON_WORK); + } + return 0; +} + +/* + * function used to attach a context to either a CPU or a thread. + * In per-thread mode, and when not self-monitoring, the thread must be + * stopped. In system-wide mode, the cpu specified in the pfarg_load.load_tgt + * argument must be the current CPU. + * + * The function must be called with the context locked and interrupts disabled. + */ +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, + struct task_struct *task) +{ + struct pfm_event_set *set; + struct pfm_context *old; + int mycpu; + int ret; + + mycpu = smp_processor_id(); + + /* + * system-wide: check we are running on the desired CPU + */ + if (ctx->flags.system && req->load_pid != mycpu) { + PFM_DBG("running on wrong CPU: %u vs. %u", + mycpu, req->load_pid); + return -EINVAL; + } + + /* + * locate first set to activate + */ + set = pfm_find_set(ctx, req->load_set, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", req->load_set); + return -EINVAL; + } + + /* + * assess sanity of event sets, initialize set state + */ + ret = pfm_prepare_sets(ctx, set); + if (ret) { + PFM_DBG("invalid next field pointers in the sets"); + return -EINVAL; + } + + PFM_DBG("load_pid=%d set=%u set_flags=0x%x", + req->load_pid, + set->id, + set->flags); + + /* + * per-thread: + * - task to attach to is checked in sys_pfm_load_context() to avoid + * locking issues. if found, and not self, task refcount was incremented. + */ + if (ctx->flags.system) { + ctx->cpu = mycpu; + ctx->task = NULL; + task = current; + } else { + old = cmpxchg(&task->pfm_context, NULL, ctx); + if (old != NULL) { + PFM_DBG("load_pid=%d has a context " + "old=%p new=%p cur=%p", + req->load_pid, + old, + ctx, + task->pfm_context); + return -EEXIST; + } + ctx->task = task; + ctx->cpu = -1; + } + + /* + * perform any architecture specific actions + */ + ret = pfm_arch_load_context(ctx, set, ctx->task); + if (ret) + goto error_noload; + + /* + * now reserve the session, before we can proceed with + * actually accessing the PMU hardware + */ + ret = pfm_reserve_session(ctx->flags.system, ctx->cpu); + if (ret) + goto error; + + /* + * commit active set + */ + ctx->set_all_runs = 1; + ctx->active_set = set; + + set->runs++; + + /* + * self-monitoring (incl. system-wide) + */ + if (task == current) { +#ifndef CONFIG_SMP + /* + * in UP mode, because of lazy save/restore + * there may already be valid state on the PMU. 
+ * We need to push it out before we can load the + * next state + */ + struct pfm_context *ctxp; + ctxp = __get_cpu_var(pmu_ctx); + if (ctxp) + pfm_save_prev_context(ctxp); +#endif + pfm_set_last_cpu(ctx, mycpu); + pfm_inc_activation(); + pfm_set_activation(ctx); + + /* + * we activate switch timeout callbacks to pfm_handle_switch_timeout() + * even though the interface guarantees monitoring is inactive at + * this point. The reason is that on some architectures (e.g., IA-64) + * it is possible to start monitoring directly from user level without + * the kernel knowing. In that case, the kernel would not be able to + * active switch timeout when monitoring starts + */ + if (set->flags & PFM_SETFL_TIME_SWITCH) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + /* + * load PMD from set + * load PMC from set + */ + pfm_arch_restore_pmds(ctx, set); + pfm_arch_restore_pmcs(ctx, set); + + /* + * set new ownership + */ + pfm_set_pmu_owner(ctx->task, ctx); + } else { + /* force a full reload */ + ctx->last_act = PFM_INVALID_ACTIVATION; + pfm_set_last_cpu(ctx, -1); + set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + PFM_DBG("context loaded next ctxswin for [%d]", task->pid); + } + + if (!ctx->flags.system) { + set_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + PFM_DBG("[%d] set TIF", task->pid); + } + + ctx->flags.work_type = PFM_WORK_NONE; + + /* + * reset message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + + ctx->state = PFM_CTX_LOADED; + + return 0; + +error: + pfm_arch_unload_context(ctx, task); +error_noload: + /* + * detach context + */ + if (!ctx->flags.system) + task->pfm_context = NULL; + + return ret; +} + +/* + * Function used to detach a context from either a CPU or a thread. + * In the per-thread case and when not self-monitoring, the thread must be + * stopped. After the call, the context is detached and monitoring is stopped. + * + * The function must be called with the context locked and interrupts disabled. + */ +int __pfm_unload_context(struct pfm_context *ctx, int *can_release) +{ + struct task_struct *task; + struct pfm_event_set *set; + int ret, is_self; + + PFM_DBG("ctx_state=%d task [%d]", ctx->state, ctx->task ? ctx->task->pid : -1); + + *can_release = 0; + + /* + * unload only when necessary + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + task = ctx->task; + set = ctx->active_set; + is_self = ctx->flags.system || task == current; + + /* + * stop monitoring + */ + ret = __pfm_stop(ctx); + if (ret) + return ret; + + ctx->state = PFM_CTX_UNLOADED; + ctx->flags.can_restart = 0; + + /* + * clear any leftover in pfm_syst_info. + * + * for non-self monitoring, + * this is done in pfm_ctxswout_thread. + */ + if (is_self) + __get_cpu_var(pfm_syst_info) = 0; + + /* + * save PMD registers + * release ownership + */ + pfm_flush_pmds(task, ctx); + + /* + * arch-specific unload operations + */ + pfm_arch_unload_context(ctx, task); + + /* + * per-thread: disconnect from monitored task + * syswide : keep ctx->cpu has it may be used after unload + * to release the session + */ + if (task) { + task->pfm_context = NULL; + ctx->task = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + } + + *can_release = 1; + + return 0; +} + +static inline int pfm_ctx_flags_sane(u32 ctx_flags) +{ + if (ctx_flags & PFM_FL_SYSTEM_WIDE) { + if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { + PFM_DBG("cannot use blocking mode in syswide mode"); + return -EINVAL; + } + } + return 0; +} + +/* + * check for permissions to create a context. 
+ * + * A sysadmin may decide to restrict creation of per-thread + * and/or system-wide context to a group of users using the + * group id via /sys/kernel/perfmon/task_group and + * /sys/kernel/perfmon/sys_group. + * + * Once we identify a user level package which can be used + * to grant/revoke Linux capabilites at login via PAM, we will + * be able to use capabilities. We would also need to increase + * the size of cap_t to support more than 32 capabilities (it + * is currently defined as u32 and 32 capabilities are alrady + * defined). + */ +static inline int pfm_ctx_permissions(u32 ctx_flags) +{ + if ( (ctx_flags & PFM_FL_SYSTEM_WIDE) + && pfm_controls.sys_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.sys_group)) { + PFM_DBG("user group not allowed to create a syswide ctx"); + return -EPERM; + } else if (pfm_controls.task_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.task_group)) { + PFM_DBG("user group not allowed to create a task context"); + return -EPERM; + } + return 0; +} + +/* + * function used to allocate a new context. A context is allocated along + * with the default event set. If a sampling format is used, the buffer + * may be allocated and initialized. + * + * The file descriptor identifying the context is allocated and returned + * to caller. + * + * This function operates with no locks and interrupts are enabled. + * return: + * >=0: the file descriptor to identify the context + * <0 : the error code + */ +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx) +{ + struct pfm_context *ctx; + struct pfm_event_set *set; + struct file *filp = NULL; + u32 ctx_flags; + int fd = 0, ret; + + ctx_flags = req->ctx_flags; + + /* Increase refcount on PMU description */ + ret = pfm_pmu_conf_get(1); + if (ret < 0) + goto error_conf; + + ret = pfm_ctx_flags_sane(ctx_flags); + if (ret < 0) + goto error_alloc; + + ret = pfm_ctx_permissions(ctx_flags); + if (ret < 0) + goto error_alloc; + + /* + * we can use GFP_KERNEL and potentially sleep because we do + * not hold any lock at this point. + */ + might_sleep(); + ret = -ENOMEM; + ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto error_alloc; + + /* + * needs to init event set list otherwise + * we could fail in pfm_free_sets + */ + INIT_LIST_HEAD(&ctx->list); + + ret = pfm_pmu_acquire(); + if (ret) + goto error_file; + /* + * link to format, must be done first for correct + * error handling in pfm_context_free() + */ + ctx->smpl_fmt = fmt; + + ret = -ENFILE; + fd = pfm_alloc_fd(&filp); + if (fd < 0) + goto error_file; + + /* + * context is unloaded + */ + ctx->state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + * must be done before pfm_find_set() + */ + ctx->flags.block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->flags.system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->flags.no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + + INIT_LIST_HEAD(&ctx->list); + + /* + * initialize arch-specific section + * must be done before fmt_init() + * + * XXX: fix dependency with fmt_init() + */ + ret = pfm_arch_context_create(ctx, ctx_flags); + if (ret) + goto error_set; + + ret = -ENOMEM; + /* + * create initial set + */ + if (pfm_find_set(ctx, 0, 1) == NULL) + goto error_set; + + set = list_entry(ctx->list.next, struct pfm_event_set, list); + + pfm_init_evtset(set); + + /* + * does the user want to sample? 
+ */ + if (fmt) { + ret = pfm_setup_smpl_fmt(fmt, fmt_arg, ctx, ctx_flags, + mode, filp); + if (ret) + goto error_set; + } + + filp->private_data = ctx; + + spin_lock_init(&ctx->lock); + init_completion(&ctx->restart_complete); + + ctx->last_act = PFM_INVALID_ACTIVATION; + pfm_set_last_cpu(ctx, -1); + + /* + * initialize notification message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + init_waitqueue_head(&ctx->msgq_wait); + + PFM_DBG("ctx=%p flags=0x%x system=%d notify_block=%d no_msg=%d" + " use_fmt=%d ctx_fd=%d mode=%d", + ctx, + ctx_flags, + ctx->flags.system, + ctx->flags.block, + ctx->flags.no_msg, + fmt != NULL, + fd, mode); + + *new_ctx = ctx; + + /* + * we defer the fd_install until we are certain the call succeeded + * to ensure we do not have to undo its effect. Neither put_filp() + * nor put_unused_fd() undoes the effect of fd_install(). + */ + fd_install(fd, filp); + + return fd; + +error_set: + put_filp(filp); + put_unused_fd(fd); +error_file: + /* + * calls the right *_put() functions + * calls pfm_release_pmu() + */ + pfm_context_free(ctx); + return ret; +error_alloc: + pfm_pmu_conf_put(); +error_conf: + pfm_smpl_fmt_put(fmt); + return ret; +} + +/* + * called from cpu_disable() to detach the perfmon context + * from the CPU going down. + * + * We cannot use the cpu hotplug notifier because we MUST run + * on the CPU that is going down to save the PMU state + */ +void pfm_cpu_disable(void) +{ + struct pfm_context *ctx; + unsigned long flags; + int is_system, can_release = 0; + u32 cpu; + + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) + return; + + is_system = ctx->flags.system; + cpu = ctx->cpu; + + /* + * context is LOADED or MASKED + * + * we unload from CPU. That stops monitoring and does + * all the bookeeping of saving values and updating duration + */ + spin_lock_irqsave(&ctx->lock, flags); + if (is_system) + __pfm_unload_context(ctx, &can_release); + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(is_system, cpu); +} Index: linux-2.6/perfmon/perfmon_ctxsw.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_ctxsw.c @@ -0,0 +1,333 @@ +/* + * perfmon_cxtsw.c: perfmon2 context switch code + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +/* + * used only in UP mode + */ +void pfm_save_prev_context(struct pfm_context *ctxp) +{ + struct pfm_event_set *set; + + /* + * in UP per-thread, due to lazy save + * there could be a context from another + * task. We need to push it first before + * installing our new state + */ + set = ctxp->active_set; + pfm_save_pmds(ctxp, set); + /* + * do not clear ownership because we rewrite + * right away + */ +} + +void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 val, ovfl_mask; + u64 *used_mask, *cnt_mask; + u16 i, num; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + num = set->nused_pmds; + cnt_mask = pfm_pmu_conf->regs.cnt_pmds; + used_mask = set->used_pmds; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + val = pfm_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) + val = (set->pmds[i].value & ~ovfl_mask) | + (val & ovfl_mask); + set->pmds[i].value = val; + num--; + } + } +} + +/* + * interrupts are disabled (no preemption) + */ +static void __pfm_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + u64 cur_act; + struct pfm_event_set *set; + int reload_pmcs, reload_pmds; + int mycpu; + + mycpu = smp_processor_id(); + + /* + * we need to lock context because it could be accessed + * from another CPU + */ + spin_lock(&ctx->lock); + + cur_act = __get_cpu_var(pmu_activation_number); + + set = ctx->active_set; + + /* + * in case of zombie, we do not complete ctxswin of the + * PMU, and we force a call to pfm_handle_work() to finish + * cleanup, i.e., free context + smpl_buff. The reason for + * deferring to pfm_handle_work() is that it is not possible + * to vfree() with interrupts disabled. + */ + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) { + ctx->flags.work_type = PFM_WORK_ZOMBIE; + set_tsk_thread_flag(task, TIF_PERFMON_WORK); + spin_unlock(&ctx->lock); + return; + } + + if (set->flags & PFM_SETFL_TIME_SWITCH) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + /* + * if we were the last user of the PMU on that CPU, + * then nothing to do except restore psr + */ + if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) { + /* + * check for forced reload conditions + */ + reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS; + reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS; + } else { +#ifndef CONFIG_SMP + struct pfm_context *ctxp; + ctxp = __get_cpu_var(pmu_ctx); + if (ctxp) + pfm_save_prev_context(ctxp); +#endif + reload_pmcs = 1; + reload_pmds = 1; + } + /* consumed */ + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + if (reload_pmds) + pfm_arch_restore_pmds(ctx, set); + + /* + * need to check if we had an in-flight interrupt in + * pfm_ctxswout_thread(). If at least one bit is set, then we must replay + * the interrupt to avoid losing some important performance data. + * + * npend_ovfls is cleared in interrupt handler + */ + if (set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + if (reload_pmcs) + pfm_arch_restore_pmcs(ctx, set); + + /* + * record current activation for this context + */ + pfm_inc_activation(); + pfm_set_last_cpu(ctx, mycpu); + pfm_set_activation(ctx); + + /* + * establish new ownership. 
+ */ + pfm_set_pmu_owner(task, ctx); + + pfm_arch_ctxswin_thread(task, ctx, set); + /* + * set->duration does not count when context in MASKED state. + * set->duration_start is reset in unmask_monitoring() + */ + set->duration_start = now; + + spin_unlock(&ctx->lock); +} + +/* + * interrupts are masked, runqueue lock is held. + * + * In UP. we simply stop monitoring and leave the state + * in place, i.e., lazy save + */ +static void __pfm_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + struct pfm_event_set *set; + int need_save_pmds; + + /* + * we need to lock context because it could be accessed + * from another CPU + */ + spin_lock(&ctx->lock); + + set = ctx->active_set; + + /* + * stop monitoring and + * collect pending overflow information + * needed on ctxswin. We cannot afford to lose + * a PMU interrupt. + */ + need_save_pmds = pfm_arch_ctxswout_thread(task, ctx, set); + + /* + * accumulate only when set is actively monitoring, + */ + if (ctx->state == PFM_CTX_LOADED) + set->duration += now - set->duration_start; + +#ifdef CONFIG_SMP + /* + * in SMP, release ownership of this PMU. + * PMU interrupts are masked, so nothing + * can happen. + */ + pfm_set_pmu_owner(NULL, NULL); + + /* + * On some architectures, it is necessary to read the + * PMD registers to check for pending overflow in + * pfm_arch_ctxswout_thread(). In that case, saving of + * the PMDs may be done there and not here. + */ + if (need_save_pmds) + pfm_save_pmds(ctx, set); +#endif + /* + * clear cpuinfo, cpuinfo is used in + * per task mode with the set time switch flag. + */ + __get_cpu_var(pfm_syst_info) = 0; + + spin_unlock(&ctx->lock); +} + +/* + * no need to lock the context. To operate on a system-wide + * context, the task has to run on the monitored CPU. In the + * case of close issued on another CPU, an IPI is sent but + * this routine runs with interrupts masked, so we are + * protected + * + * On some architectures, such as IA-64, it may be necessary + * to intervene during the context even in system-wide mode + * to modify some machine state. + */ +static void __pfm_ctxsw_sys(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctx; + struct pfm_event_set *set; + + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) { + pr_info("prev=%d tif=%d ctx=%p next=%d tif=%d ctx=%p\n", + prev->pid, + test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW), + prev->pfm_context, + next->pid, + test_tsk_thread_flag(next, TIF_PERFMON_CTXSW), + next->pfm_context); + BUG_ON(!ctx); + } + + set = ctx->active_set; + + /* + * propagate TIF_PERFMON_CTXSW to ensure that: + * - previous task has TIF_PERFMON_CTXSW cleared, in case it is + * scheduled onto another CPU where there is syswide monitoring + * - next task has TIF_PERFMON_CTXSW set to ensure it will come back + * here when context switched out + */ + clear_tsk_thread_flag(prev, TIF_PERFMON_CTXSW); + set_tsk_thread_flag(next, TIF_PERFMON_CTXSW); + + /* + * nothing to do until actually started + * XXX: assumes no mean to start from user level + */ + if (!ctx->flags.started) + return; + + pfm_arch_ctxswout_sys(prev, ctx, set); + pfm_arch_ctxswin_sys(next, ctx, set); +} + +/* + * come here when either prev or next has TIF_PERFMON_CTXSW flag set + * Note that this is not because a task has TIF_PERFMON_CTXSW set that + * it has a context attached, e.g., in system-wide on certain arch. 
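+ * In system-wide mode, __pfm_ctxsw_sys() keeps propagating the flag from prev to next on every switch, so the incoming task also traps into this path even though it owns no per-thread context.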
+ */ +void pfm_ctxsw(struct task_struct *prev, struct task_struct *next) +{ + struct pfm_context *ctxp, *ctxn; + u64 now; + + now = sched_clock(); + + ctxp = prev->pfm_context; + ctxn = next->pfm_context; + + if (ctxp) + __pfm_ctxswout_thread(prev, ctxp, now); + + if (ctxn) + __pfm_ctxswin_thread(next, ctxn, now); + + /* + * given that prev and next can never be the same, this + * test is checking that ctxp == ctxn == NULL which is + * an indication we have an active system-wide session on + * this CPU that needs ctxsw intervention. Not all processors + * needs this, IA64 is one. + */ + if (ctxp == ctxn) + __pfm_ctxsw_sys(prev, next); + + __get_cpu_var(pfm_stats).ctxsw_count++; + __get_cpu_var(pfm_stats).ctxsw_ns += sched_clock() - now; +} Index: linux-2.6/perfmon/perfmon_dfl_smpl.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_dfl_smpl.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new default sampling buffer format + * for the perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("new perfmon default sampling format"); +MODULE_LICENSE("GPL"); + +static int pfm_dfl_fmt_validate(u32 ctx_flags, u16 npmds, void *data) +{ + struct pfm_dfl_smpl_arg *arg = data; + u64 min_buf_size; + + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * sanity check in case size_t is smaller then u64 + */ +#if BITS_PER_LONG == 4 +#define MAX_SIZE_T (1ULL<<(sizeof(size_t)<<3)) + if (sizeof(size_t) < sizeof(arg->buf_size)) { + if (arg->buf_size >= MAX_SIZE_T) + return -ETOOBIG; + } +#endif + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. + */ + min_buf_size = sizeof(struct pfm_dfl_smpl_hdr) + + (sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64))); + + PFM_DBG("validate ctx_flags=0x%x flags=0x%x npmds=%u " + "min_buf_size=%llu buf_size=%llu\n", + ctx_flags, + arg->buf_flags, + npmds, + (unsigned long long)min_buf_size, + (unsigned long long)arg->buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_dfl_fmt_get_size(u32 flags, void *data, size_t *size) +{ + struct pfm_dfl_smpl_arg *arg = data; + + /* + * size has been validated in default_validate + * we can never loose bits from buf_size. 
+ */ + *size = (size_t)arg->buf_size; + + return 0; +} + +static int pfm_dfl_fmt_init(struct pfm_context *ctx, void *buf, u32 ctx_flags, + u16 npmds, void *data) +{ + struct pfm_dfl_smpl_hdr *hdr; + struct pfm_dfl_smpl_arg *arg = data; + + hdr = buf; + + hdr->hdr_version = PFM_DFL_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_buf_flags = arg->buf_flags; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0; + hdr->hdr_count = 0; + hdr->hdr_min_buf_space = sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64)); + + PFM_DBG("buffer=%p buf_size=%llu hdr_size=%zu hdr_version=%u.%u " + "min_space=%llu npmds=%u", + buf, + (unsigned long long)hdr->hdr_buf_size, + sizeof(*hdr), + PFM_VERSION_MAJOR(hdr->hdr_version), + PFM_VERSION_MINOR(hdr->hdr_version), + (unsigned long long)hdr->hdr_min_buf_space, + npmds); + + return 0; +} + +/* + * called from pfm_overflow_handler() to record a new sample + * + * context is locked, interrupts are disabled (no preemption) + */ +static int pfm_dfl_fmt_handler(void *buf, struct pfm_ovfl_arg *arg, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_dfl_smpl_hdr *hdr; + struct pfm_dfl_smpl_entry *ent; + void *cur, *last; + u64 *e; + size_t entry_size, min_size; + u16 npmds, i; + u16 ovfl_pmd; + + hdr = buf; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + min_size = hdr->hdr_min_buf_space; + + /* + * precheck for sanity + */ + if ((last - cur) < min_size) + goto full; + + npmds = arg->num_smpl_pmds; + + ent = (struct pfm_dfl_smpl_entry *)cur; + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (u64 *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%llu cur=%p last=%p free_bytes=%zu ovfl_pmd=%d " + "npmds=%u", + (unsigned long long)hdr->hdr_count, + cur, last, + (last-cur), + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is usually the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for(i=0; i < npmds; i++) { + *e++ = *val++; + } + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < min_size) + goto full; + + /* reset before returning from interrupt handler */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +full: + PFM_DBG_ovfl("sampling buffer full free=%zu, count=%llu", + last-cur, + (unsigned long long)hdr->hdr_count); + + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY| PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_dfl_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_dfl_smpl_hdr *hdr; + + hdr = buf; + + hdr->hdr_count = 0; + hdr->hdr_cur_offs = sizeof(*hdr); + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_dfl_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt dfl_fmt={ + .fmt_name = "default", + .fmt_version = 0x10000, + .fmt_arg_size = sizeof(struct pfm_dfl_smpl_arg), + .fmt_validate = pfm_dfl_fmt_validate, + .fmt_getsize = pfm_dfl_fmt_get_size, + .fmt_init = pfm_dfl_fmt_init, + .fmt_handler = pfm_dfl_fmt_handler, + .fmt_restart = pfm_dfl_fmt_restart, + .fmt_exit = pfm_dfl_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int pfm_dfl_fmt_init_module(void) +{ + return pfm_fmt_register(&dfl_fmt); +} + +static void pfm_dfl_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&dfl_fmt); +} + +module_init(pfm_dfl_fmt_init_module); +module_exit(pfm_dfl_fmt_cleanup_module); Index: linux-2.6/perfmon/perfmon_file.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_file.c @@ -0,0 +1,791 @@ +/* + * perfmon_file.c: perfmon2 file input/output functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ + +static inline int pfm_msgq_is_empty(struct pfm_context *ctx) +{ + return ctx->msgq_head == ctx->msgq_tail; +} + +static int pfmfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations pfmfs_dentry_operations = { + .d_delete = pfmfs_delete_dentry, +}; + +static union pfarg_msg *pfm_get_next_msg(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + PFM_DBG("in head=%d tail=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK); + + if (pfm_msgq_is_empty(ctx)) + return NULL; + + /* + * get oldest message + */ + msg = ctx->msgq + (ctx->msgq_tail & PFM_MSGQ_MASK); + + /* + * move tail forward + */ + ctx->msgq_tail++; + + PFM_DBG("out head=%d tail=%d type=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + msg->type); + + return msg; +} + +static struct page *pfm_buf_map_pagefault(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + void *kaddr; + struct pfm_context *ctx; + struct page *page; + size_t size; + + ctx = vma->vm_private_data; + if (ctx == NULL) { + PFM_DBG("no ctx"); + return NOPAGE_SIGBUS; + } + size = ctx->smpl_size; + + if ( (address < (unsigned long) vma->vm_start) || + (address >= (unsigned long) (vma->vm_start + size)) ) + return NOPAGE_SIGBUS; + + kaddr = ctx->smpl_addr + (address - vma->vm_start); + + if (type) + *type = VM_FAULT_MINOR; + + page = vmalloc_to_page(kaddr); + get_page(page); + + PFM_DBG("[%d] start=%p ref_count=%d", + current->pid, + kaddr, page_count(page)); + + return page; +} +/* + * we need to determine whether or not we are closing the last reference + * to the file and thus are going to end up in pfm_close() which eventually + * calls pfm_release_buf_space(). In that function, we update the accounting + * for locked_vm given that we are actually freeing the sampling buffer. The + * issue is that there are multiple paths leading to pfm_release_buf_space(), + * from exit(), munmap(), close(). The path coming from munmap() is problematic + * because do_munmap() grabs mmap_sem in write-mode which is also what + * pfm_release_buf_space does. To avoid deadlock, we need to determine where + * we are calling from and skip the locking. The vm_ops->close() callback + * is invoked for each remove_vma() independently of the number of references + * left on the file descriptor, therefore a simple reference counter does not + * work. We need to determine if this is the last call, and then set a flag + * to skip the locking. + */ +static void pfm_buf_map_close(struct vm_area_struct *vma) +{ + struct file *file; + struct pfm_context *ctx; + + file = vma->vm_file; + ctx = vma->vm_private_data; + + /* + * if file is going to close, then pfm_close() will + * be called, do not lock in pfm_release_buf + */ + if (atomic_read(&file->f_count) == 1) + ctx->flags.mmap_nlock = 1; +} + +/* + * we do not have a close callback because the locked + * memory accounting must be done when the actual buffer + * is freed. Munmap does not free the pages backing the vma + * because they may still be in use by the PMU interrupt handler. 
+ */ +struct vm_operations_struct pfm_buf_map_vm_ops = { + .nopage = pfm_buf_map_pagefault, + .close = pfm_buf_map_close +}; + +static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma, + size_t size) +{ + if (ctx->smpl_addr == NULL) { + PFM_DBG("no sampling buffer to map"); + return -EINVAL; + } + + if (size > ctx->smpl_size) { + PFM_DBG("mmap size=%zu >= actual buf size=%zu", + size, + ctx->smpl_size); + return -EINVAL; + } + + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_private_data = ctx; + + return 0; +} + +static int pfm_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size; + struct pfm_context *ctx; + unsigned long flags; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = file->private_data; + size = (vma->vm_end - vma->vm_start); + + if (ctx == NULL) + return -EINVAL; + + ret = -EINVAL; + + spin_lock_irqsave(&ctx->lock, flags); + + if (vma->vm_flags & VM_WRITE) { + PFM_DBG("cannot map buffer for writing"); + goto done; + } + + PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx", + vma->vm_pgoff, + size, + vma->vm_start); + + ret = pfm_mmap_buffer(ctx, vma, size); + if (ret == 0) + vma->vm_flags |= VM_RESERVED; + + PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu", + ret, + vma->vm_flags, + vma->vm_start, + vma->vm_end-vma->vm_start); +done: + spin_unlock_irqrestore(&ctx->lock, flags); + + return ret; +} + +/* + * Extract one message from queue. + * + * return: + * -EAGAIN: when non-blocking and nothing is* in the queue. + * -ERESTARTSYS: when blocking and signal is pending + * Otherwise returns size of message (sizeof(pfarg_msg)) + */ +ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block) +{ + union pfarg_msg *msg; + ssize_t ret = 0; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + + /* + * we must masks interrupts to avoid a race condition + * with the PMU interrupt handler. + */ + spin_lock_irqsave(&ctx->lock, flags); + + while (pfm_msgq_is_empty(ctx)) { + + /* + * handle non-blocking reads + * return -EAGAIN + */ + ret = -EAGAIN; + if (non_block) + break; + + add_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock_irqrestore(&ctx->lock, flags); + + schedule(); + + /* + * during this window, another thread may call + * pfm_read() and steal our message + */ + + spin_lock_irqsave(&ctx->lock, flags); + + remove_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_RUNNING); + + /* + * check for pending signals + * return -ERESTARTSYS + */ + ret = -ERESTARTSYS; + if(signal_pending(current)) + break; + + /* + * we may have a message + */ + ret = 0; + } + + /* + * extract message + */ + if (ret == 0) { + msg = pfm_get_next_msg(ctx); + BUG_ON(msg == NULL); + + /* + * we must make a local copy before we unlock + * to ensure that the message queue cannot fill + * (overwriting our message) up before + * we do copy_to_user() which cannot be done + * with interrupts masked. + */ + *msg_buf = *msg; + + ret = sizeof(*msg); + + PFM_DBG("extracted type=%d", msg->type); + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("blocking=%d ret=%zd", non_block, ret); + + return ret; +} + +static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, + loff_t *ppos) +{ + struct pfm_context *ctx; + union pfarg_msg msg_buf; + int non_block, ret; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx for pfm_read"); + return -EINVAL; + } + + /* + * cannot extract partial messages. 
+ * check even when there is no message + * + * cannot extract more than one message per call. Bytes + * above sizeof(msg) are ignored. + */ + if (size < sizeof(msg_buf)) { + PFM_DBG("message is too small size=%zu must be >=%zu)", + size, + sizeof(msg_buf)); + return -EINVAL; + } + + non_block = filp->f_flags & O_NONBLOCK; + + ret = __pfm_read(ctx, &msg_buf, non_block); + if (ret > 0) { + if(copy_to_user(buf, &msg_buf, sizeof(msg_buf))) + ret = -EFAULT; + } + return ret; +} + +static ssize_t pfm_write(struct file *file, const char __user *ubuf, + size_t size, loff_t *ppos) +{ + PFM_DBG("pfm_write called"); + return -EINVAL; +} + +static unsigned int pfm_poll(struct file *filp, poll_table *wait) +{ + struct pfm_context *ctx; + unsigned long flags; + unsigned int mask = 0; + + PFM_DBG("pfm_file_ops"); + + if (filp->f_op != &pfm_file_ops) { + PFM_ERR("pfm_poll bad magic"); + return 0; + } + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_poll no ctx"); + return 0; + } + + PFM_DBG("before poll_wait"); + + poll_wait(filp, &ctx->msgq_wait, wait); + + /* + * pfm_msgq_is_empty() is non-atomic + * + * filp is protected by fget() at upper level + * context cannot be closed by another thread. + * + * There may be a race with a PMU interrupt adding + * messages to the queue. But we are interested in + * queue not empty, so adding more messages should + * not really be a problem. + * + * There may be a race with another thread issuing + * a read() and stealing messages from the queue thus + * may return the wrong answer. This could potentially + * lead to a blocking read, because nothing is + * available in the queue + */ + spin_lock_irqsave(&ctx->lock, flags); + + if (!pfm_msgq_is_empty(ctx)) + mask = POLLIN | POLLRDNORM; + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("after poll_wait mask=0x%x", mask); + + return mask; +} + +static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + PFM_DBG("pfm_ioctl called"); + return -EINVAL; +} + +/* + * interrupt cannot be masked when entering this function + */ +static inline int __pfm_fasync(int fd, struct file *filp, + struct pfm_context *ctx, int on) +{ + int ret; + + ret = fasync_helper (fd, filp, on, &ctx->async_queue); + + PFM_DBG("fd=%d on=%d async_q=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +static int pfm_fasync(int fd, struct file *filp, int on) +{ + struct pfm_context *ctx; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_fasync no ctx"); + return -EBADF; + } + + /* + * we cannot mask interrupts during this call because this + * may go to sleep if memory is not readily available. + * + * We are protected from the context disappearing by the + * get_fd()/put_fd() done in caller. Serialization of this function + * is ensured by caller. + */ + ret = __pfm_fasync(fd, filp, ctx, on); + + PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +#ifdef CONFIG_SMP +static void __pfm_close_remote_cpu(void *info) +{ + struct pfm_context *ctx = info; + int can_release; + + BUG_ON(ctx != __get_cpu_var(pmu_ctx)); + + /* + * we are in IPI interrupt handler which always has higher + * priority than PMU interrupt, therefore we do not need to + * mask interrupts. context locking is not needed because we + * are in close(), no more user references. 
+ * + * can_release is ignored, release done on calling CPU + */ + __pfm_unload_context(ctx, &can_release); + + /* + * we cannot free context here because we are in_interrupt(). + * we free on the calling CPU + */ +} + +static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx) +{ + BUG_ON(irqs_disabled()); + return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 0, 1); +} +#endif /* CONFIG_SMP */ + +/* + * called either on explicit close() or from exit_files(). + * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. + */ +int __pfm_close(struct pfm_context *ctx, struct file *filp) +{ + unsigned long flags; + int state; + int can_free = 1, can_unload = 1; + int is_system, can_release = 0; + u32 cpu; + + spin_lock_irqsave(&ctx->lock, flags); + + state = ctx->state; + is_system = ctx->flags.system; + cpu = ctx->cpu; + + PFM_DBG("state=%d", state); + + /* + * check if unload is needed + */ + if (state == PFM_CTX_UNLOADED) + goto doit; + +#ifdef CONFIG_SMP + /* + * we need to release the resource on the ORIGINAL cpu. + * we need to release the context lock to avoid deadlocks + * on the original CPU, especially in the context switch + * routines. It is safe to unlock because we are in close(), + * in other words, there is no more access from user level. + * we can also unmask interrupts on this CPU because the + * context is running on the original CPU. Context will be + * unloaded and the session will be released on the original + * CPU. Upon return, the caller is guaranteed that the context + * is gone from original CPU. + */ + if (is_system && cpu != smp_processor_id()) { + spin_unlock_irqrestore(&ctx->lock, flags); + pfm_close_remote_cpu(cpu, ctx); + can_release = 1; + goto free_it; + } + + if (!is_system && ctx->task != current) { + /* + * switch context to zombie state + */ + ctx->state = PFM_CTX_ZOMBIE; + + PFM_DBG("zombie ctx for [%d]", ctx->task->pid); + /* + * must check if other thread is using block overflow + * notification mode. If so make sure it will not block + * because there will not be any pfm_restart() issued. 
+ * When the thread notices the ZOMBIE state, it will clean + * up what is left of the context + */ + if (state == PFM_CTX_MASKED && ctx->flags.block) { + /* + * force task to wake up from MASKED state + */ + PFM_DBG("waking up [%d]", ctx->task->pid); + + complete(&ctx->restart_complete); + } + /* + * PMU session will be release by monitored task when it notices + * ZOMBIE state as part of pfm_unload_context() + */ + can_unload = can_free = 0; + } +#endif + if (can_unload) + __pfm_unload_context(ctx, &can_release); +doit: + spin_unlock_irqrestore(&ctx->lock, flags); + +#ifdef CONFIG_SMP +free_it: +#endif + if (can_release) + pfm_release_session(is_system, cpu); + + if (can_free) + pfm_context_free(ctx); + + return 0; +} + +static int pfm_close(struct inode *inode, struct file *filp) +{ + struct pfm_context *ctx; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx"); + return -EBADF; + } + return __pfm_close(ctx, filp); +} + +static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) +{ + PFM_DBG("pfm_file_ops"); + + return -ENXIO; +} + +/* + * pfm_flush() is called from filp_close() on every call to + * close(). pfm_close() is only invoked when the last user + * calls close(). pfm_close() is never invoked without + * pfm_flush() being invoked first. + * + * Partially free resources: + * - remove from fasync queue + */ +static int pfm_flush(struct file *filp, fl_owner_t id) +{ + struct pfm_context *ctx; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_flush no ctx"); + return -EBADF; + } + + /* + * remove our file from the async queue, if we use this mode. + * This can be done without the context being protected. We come + * here when the context has become unreacheable by other tasks. + * + * We may still have active monitoring at this point and we may + * end up in pfm_overflow_handler(). However, fasync_helper() + * operates with interrupts disabled and it cleans up the + * queue. If the PMU handler is called prior to entering + * fasync_helper() then it will send a signal. If it is + * invoked after, it will find an empty queue and no + * signal will be sent. In both case, we are safe + */ + if (filp->f_flags & FASYNC) { + PFM_DBG("cleaning up async_queue=%p", ctx->async_queue); + __pfm_fasync (-1, filp, ctx, 0); + } + return 0; +} + +const struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .poll = pfm_poll, + .ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open to disallow open via /proc */ + .fasync = pfm_fasync, + .release = pfm_close, + .flush= pfm_flush, + .mmap = pfm_mmap +}; + +static int pfmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); +} + +static struct file_system_type pfm_fs_type = { + .name = "pfmfs", + .get_sb = pfmfs_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * pfmfs should _never_ be mounted by userland - too much of security hassle, + * no real gain from having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pfm: will go nicely and kill the special-casing in procfs. 
+ */ +static struct vfsmount *pfmfs_mnt; + +int __init pfm_init_fs(void) +{ + int err = register_filesystem(&pfm_fs_type); + if (!err) { + pfmfs_mnt = kern_mount(&pfm_fs_type); + err = PTR_ERR(pfmfs_mnt); + if (IS_ERR(pfmfs_mnt)) + unregister_filesystem(&pfm_fs_type); + else + err = 0; + } + return err; +} + +static void __exit exit_pfm_fs(void) +{ + unregister_filesystem(&pfm_fs_type); + mntput(pfmfs_mnt); +} + +int pfm_alloc_fd(struct file **cfile) +{ + int fd, ret = 0; + struct file *file = NULL; + struct inode * inode; + char name[32]; + struct qstr this; + + fd = get_unused_fd(); + if (fd < 0) + return -ENFILE; + + ret = -ENFILE; + + file = get_empty_filp(); + if (!file) + goto out; + + /* + * allocate a new inode + */ + inode = new_inode(pfmfs_mnt->mnt_sb); + if (!inode) + goto out; + + PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode); + + inode->i_sb = pfmfs_mnt->mnt_sb; + inode->i_mode = S_IFCHR|S_IRUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.hash = inode->i_ino; + this.len = strlen(name); + + ret = -ENOMEM; + + /* + * allocate a new dcache entry + */ + file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) + goto out; + + file->f_dentry->d_op = &pfmfs_dentry_operations; + + d_add(file->f_dentry, inode); + file->f_vfsmnt = mntget(pfmfs_mnt); + file->f_mapping = inode->i_mapping; + + file->f_op = &pfm_file_ops; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->f_pos = 0; + + *cfile = file; + + return fd; +out: + if (file) + put_filp(file); + put_unused_fd(fd); + return ret; +} Index: linux-2.6/perfmon/perfmon_fmt.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_fmt.c @@ -0,0 +1,218 @@ +/* + * perfmon_fmt.c: perfmon2 sampling buffer format management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_smpl_fmt_lock); +static LIST_HEAD(pfm_smpl_fmt_list); + +static inline int fmt_is_mod(struct pfm_smpl_fmt *f) +{ + return !(f->fmt_flags & PFM_FMTFL_IS_BUILTIN); +} + +static struct pfm_smpl_fmt *pfm_find_fmt(char *name) +{ + struct pfm_smpl_fmt * entry; + + list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { + if (!strcmp(entry->fmt_name, name)) + return entry; + } + return NULL; +} +/* + * find a buffer format based on its name + */ +struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name) +{ + struct pfm_smpl_fmt * fmt; + + spin_lock(&pfm_smpl_fmt_lock); + + fmt = pfm_find_fmt(name); + + /* + * increase module refcount + */ + if (fmt && fmt_is_mod(fmt) && !try_module_get(fmt->owner)) + fmt = NULL; + + spin_unlock(&pfm_smpl_fmt_lock); + + return fmt; +} + +void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt) +{ + if (fmt == NULL || !fmt_is_mod(fmt)) + return; + BUG_ON(fmt->owner == NULL); + + spin_lock(&pfm_smpl_fmt_lock); + module_put(fmt->owner); + spin_unlock(&pfm_smpl_fmt_lock); +} + +int pfm_fmt_register(struct pfm_smpl_fmt *fmt) +{ + int ret = 0; + + if (perfmon_disabled) { + PFM_INFO("perfmon disabled, cannot add sampling format"); + return -ENOSYS; + } + + /* some sanity checks */ + if (fmt == NULL) { + PFM_INFO("perfmon: NULL format for register"); + return -EINVAL; + } + + if (fmt->fmt_name == NULL) { + PFM_INFO("perfmon: format has no name"); + return -EINVAL; + } + + if (fmt->fmt_qdepth > PFM_MSGS_COUNT) { + PFM_INFO("perfmon: format %s requires %u msg queue depth (max %d)", + fmt->fmt_name, + fmt->fmt_qdepth, + PFM_MSGS_COUNT); + return -EINVAL; + } + + /* + * fmt is missing the initialization of .owner = THIS_MODULE + * this is only valid when format is compiled as a module + */ + if (fmt->owner == NULL && fmt_is_mod(fmt)) { + PFM_INFO("format %s has no module owner", fmt->fmt_name); + return -EINVAL; + } + /* + * we need at least a handler + */ + if (fmt->fmt_handler == NULL) { + PFM_INFO("format %s has no handler", fmt->fmt_name); + return -EINVAL; + } + + /* + * format argument size cannot be bigger than PAGE_SIZE + */ + if (fmt->fmt_arg_size > PAGE_SIZE) { + PFM_INFO("format %s arguments too big", fmt->fmt_name); + return -EINVAL; + } + + spin_lock(&pfm_smpl_fmt_lock); + + /* + * because of sysfs, we cannot have two formats with the same name + */ + if (pfm_find_fmt(fmt->fmt_name)) { + PFM_INFO("format %s already registered", fmt->fmt_name); + ret = -EBUSY; + goto out; + } + + ret = pfm_sysfs_add_fmt(fmt); + if (ret) { + PFM_INFO("sysfs cannot add format entry for %s", fmt->fmt_name); + goto out; + } + + list_add(&fmt->fmt_list, &pfm_smpl_fmt_list); + + PFM_INFO("added sampling format %s", fmt->fmt_name); +out: + spin_unlock(&pfm_smpl_fmt_lock); + + return ret; +} +EXPORT_SYMBOL(pfm_fmt_register); + +int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt) +{ + struct pfm_smpl_fmt *fmt2; + int ret = 0; + + if (!fmt || !fmt->fmt_name) { + PFM_DBG("invalid fmt"); + return -EINVAL; + } + + spin_lock(&pfm_smpl_fmt_lock); + + fmt2 = pfm_find_fmt(fmt->fmt_name); + if (!fmt2) { + PFM_INFO("unregister failed, format not registered"); + ret = -EINVAL; + goto out; + } + list_del_init(&fmt->fmt_list); + + pfm_sysfs_remove_fmt(fmt); + + PFM_INFO("removed sampling format: %s", fmt->fmt_name); + +out: + 
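+ /* common exit: drop the format list lock whether or not the format was found */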
spin_unlock(&pfm_smpl_fmt_lock); + return ret; + +} +EXPORT_SYMBOL(pfm_fmt_unregister); + +/* + * we defer adding the builtin formats to /sys/kernel/perfmon/formats + * until after the pfm sysfs subsystem is initialized. This function + * is called from pfm_sysfs_init() + */ +void pfm_sysfs_builtin_fmt_add(void) +{ + struct pfm_smpl_fmt * entry; + + /* + * locking not needed, kernel not fully booted + * when called + */ + list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { + pfm_sysfs_add_fmt(entry); + } +} Index: linux-2.6/perfmon/perfmon_intr.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_intr.c @@ -0,0 +1,565 @@ +/* + * perfmon_intr.c: perfmon2 interrupt handling + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +void pfm_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 now; + + PFM_DBG_ovfl("masking monitoring"); + + now = sched_clock(); + + /* + * we save the PMD values such that we can read them while + * MASKED without having to have the thread stopped which + * is uncessary because monitoring is stopped + * + * XXX: could be avoided in system-wide + */ + pfm_save_pmds(ctx, set); + pfm_arch_mask_monitoring(ctx, set); + /* + * accumulate the set duration up to this point + */ + set->duration += now - set->duration_start; +} +EXPORT_SYMBOL(pfm_mask_monitoring); + +/* + * main overflow processing routine. + * + * set->num_ovfl_pmds is 0 when returning from this function even though + * set->ovfl_pmds[] may have bits set. When leaving set->num_ovfl_pmds + * must never be used to determine if there was a pending overflow. 
+ */ +static void pfm_overflow_handler(struct pfm_context *ctx, struct pfm_event_set *set, + unsigned long ip, + struct pt_regs *regs) +{ + struct pfm_ovfl_arg *ovfl_arg; + struct pfm_event_set *set_orig; + void *hdr; + u64 old_val, ovfl_mask, new_val, ovfl_thres; + u64 *ovfl_notify, *ovfl_pmds, *pend_ovfls; + u64 *smpl_pmds, *reset_pmds; + u64 now, t0, t1; + u32 ovfl_ctrl, num_ovfl, num_ovfl_orig; + u16 i, max_pmd, max_cnt_pmd, first_cnt_pmd; + u8 must_switch, has_64b_ovfl; + u8 ctx_block, has_notify, has_ovfl_sw; + + now = t0 = sched_clock(); + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + max_pmd = pfm_pmu_conf->regs.max_pmd; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + ovfl_pmds = set->ovfl_pmds; + num_ovfl = num_ovfl_orig = set->npend_ovfls; + pend_ovfls = set->povfl_pmds; + has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH; + set_orig = set; + + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) + goto stop_monitoring; + + must_switch = has_64b_ovfl = 0; + + hdr = ctx->smpl_addr; + + PFM_DBG_ovfl("ovfl_pmds=0x%llx npend=%u ip=%p, blocking=%d " + "u_pmds=0x%llx use_fmt=%u", + (unsigned long long)pend_ovfls[0], + num_ovfl, + (void *)ip, + ctx->flags.block, + (unsigned long long)set->used_pmds[0], + ctx->smpl_fmt != NULL); + + /* + * initialize temporary bitvectors + * we allocate bitvectors in the context + * rather than on the stack to minimize stack + * space consumption. PMU interrupt is very high + * which implies possible deep nesting of interrupt + * hence limited kernel stack space. + * + * This is safe because a context can only be in the + * overflow handler once at a time + */ + reset_pmds = set->reset_pmds; + ovfl_notify = ctx->ovfl_ovfl_notify; + + bitmap_zero(cast_ulp(reset_pmds), max_pmd); + + /* + * first we update the virtual counters + * + * we leverage num_ovfl to minimize number of + * iterations of the loop. + * + * The i < max_cnt_pmd is just a sanity check + */ + for (i = first_cnt_pmd; num_ovfl && i < max_cnt_pmd; i++) { + /* + * skip pmd which did not overflow + */ + if (!test_bit(i, cast_ulp(pend_ovfls))) + continue; + + num_ovfl--; + + /* + * Note that the pmd is not necessarily 0 at this point as + * qualified events may have happened before the PMU was + * frozen. 
The residual count is not taken into consideration + * here but will be with any read of the pmd + */ + old_val = new_val = set->pmds[i].value; + ovfl_thres = set->pmds[i].ovflsw_thres; + new_val += 1 + ovfl_mask; + set->pmds[i].value = new_val; + + /* + * check for overflow condition + */ + if (likely(old_val > new_val)) { + PFM_DBG_ovfl("64 bit overflow of PMD%d", i); + has_64b_ovfl = 1; + if (has_ovfl_sw && ovfl_thres > 0) { + if (ovfl_thres == 1) + must_switch = 1; + set->pmds[i].ovflsw_thres = ovfl_thres - 1; + } + + /* + * what to reset because of this overflow + */ + __set_bit(i, cast_ulp(reset_pmds)); + + bitmap_or(cast_ulp(reset_pmds), + cast_ulp(reset_pmds), + cast_ulp(set->pmds[i].reset_pmds), + max_pmd); + + } else { + PFM_DBG_ovfl("Hardware counter overflow of PMD%d=0x%04Lx", + i, + (unsigned long long)new_val); + /* only keep track of 64-bit overflows */ + __clear_bit(i, cast_ulp(pend_ovfls)); + /* + * on some PMU, it may be necessary to re-arm the PMD + */ + pfm_arch_ovfl_reset_pmd(ctx, i); + } + + PFM_DBG_ovfl("pmd%u=0x%llx old_val=0x%llx " + "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u " + "o_thres=%llu o_thres_ref=%llu", + i, + (unsigned long long)new_val, + (unsigned long long)old_val, + (unsigned long long)pfm_read_pmd(ctx, i), + (unsigned long long)ovfl_pmds[0], + must_switch, + (unsigned long long)set->pmds[i].ovflsw_thres, + (unsigned long long)set->pmds[i].ovflsw_ref_thres); + } + + /* + * mark the overflow as consumed + */ + set->npend_ovfls = 0; + + ctx_block = ctx->flags.block; + + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt0 += t1 - t0; + t0 = t1; + + /* + * there was no 64-bit overflow, nothing else to do + */ + if (!has_64b_ovfl) + return; + + /* + * copy pending_ovfls to ovfl_pmd. It is used in + * the notification message or getinfo_evtsets(). + * + * pend_ovfls modified to reflect only 64-bit overflows + */ + bitmap_copy(cast_ulp(ovfl_pmds), + cast_ulp(pend_ovfls), + max_cnt_pmd); + + /* + * build ovfl_notify bitmask from ovfl_pmds + */ + bitmap_and(cast_ulp(ovfl_notify), + cast_ulp(pend_ovfls), + cast_ulp(set->ovfl_notify), + max_cnt_pmd); + + has_notify = !bitmap_empty(cast_ulp(ovfl_notify), max_cnt_pmd); + /* + * must reset for next set of overflows + */ + bitmap_zero(cast_ulp(pend_ovfls), max_cnt_pmd); + + /* + * check for format + */ + if (likely(ctx->smpl_fmt)) { + u64 start_cycles, end_cycles; + u64 *cnt_pmds; + int j, k, ret = 0; + + ovfl_ctrl = 0; + num_ovfl = num_ovfl_orig; + ovfl_arg = &ctx->ovfl_arg; + cnt_pmds = pfm_pmu_conf->regs.cnt_pmds; + + ovfl_arg->active_set = set->id; + + for (i = first_cnt_pmd; num_ovfl && !ret; i++) { + + if (!test_bit(i, cast_ulp(ovfl_pmds))) + continue; + + num_ovfl--; + + ovfl_arg->ovfl_pmd = i; + ovfl_arg->ovfl_ctrl = 0; + + ovfl_arg->pmd_last_reset = set->pmds[i].lval; + ovfl_arg->pmd_eventid = set->pmds[i].eventid; + + /* + * copy values of pmds of interest. 
+ * Sampling format may use them + * We do not initialize the unused smpl_pmds_values + */ + k = 0; + smpl_pmds = set->pmds[i].smpl_pmds; + if (!bitmap_empty(cast_ulp(smpl_pmds), max_pmd)) { + + for (j = 0; j < max_pmd; j++) { + + if (!test_bit(j, cast_ulp(smpl_pmds))) + continue; + + new_val = pfm_read_pmd(ctx, j); + + /* for counters, build 64-bit value */ + if (test_bit(j, cast_ulp(cnt_pmds))) { + new_val = (set->pmds[j].value & ~ovfl_mask) + | (new_val & ovfl_mask); + } + ovfl_arg->smpl_pmds_values[k++] = new_val; + + PFM_DBG_ovfl("s_pmd_val[%u]=" + "pmd%u=0x%llx", + k, j, + (unsigned long long)new_val); + } + } + ovfl_arg->num_smpl_pmds = k; + + __get_cpu_var(pfm_stats).fmt_handler_calls++; + + start_cycles = sched_clock(); + + /* + * call custom buffer format record (handler) routine + */ + ret = (*ctx->smpl_fmt->fmt_handler)(hdr, + ovfl_arg, + ip, + now, + regs); + + end_cycles = sched_clock(); + + /* + * for PFM_OVFL_CTRL_MASK and PFM_OVFL_CTRL_NOTIFY + * we take the union + * + * The reset_pmds mask is constructed automatically + * on overflow. When the actual reset takes place + * depends on the masking, switch and notification + * status. It may be deferred until pfm_restart(). + */ + ovfl_ctrl |= ovfl_arg->ovfl_ctrl; + + __get_cpu_var(pfm_stats).fmt_handler_ns += end_cycles + - start_cycles; + } + /* + * when the format cannot handle the rest of the overflow, + * we abort right here + */ + if (ret) { + PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", + i, ret); + } + } else { + /* + * When no sampling format is used, the default + * is: + * - mask monitoring + * - notify user if requested + * + * If notification is not requested, monitoring is masked + * and overflowed counters are not reset (saturation). + * This mimics the behavior of the default sampling format. + */ + ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY; + + if (!must_switch || has_notify) + ovfl_ctrl |= PFM_OVFL_CTRL_MASK; + } + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt1 += t1 - t0; + t0 = t1; + + PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx " + "r_pmds=0x%llx ovfl_ctrl=0x%x", + set->id, + (unsigned long long)ovfl_notify[0], + (unsigned long long)ovfl_pmds[0], + (unsigned long long)reset_pmds[0], + ovfl_ctrl); + + /* + * we only reset (short reset) when we are not masking. Otherwise + * the reset is postponed until restart. 
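+ *
+ * Recap of how ovfl_ctrl is acted upon below (a summary of the code that
+ * follows, not an additional rule):
+ *
+ *   !CTRL_MASK && must_switch  -> pfm_switch_sets_from_intr()
+ *   !CTRL_MASK && CTRL_RESET   -> short reset of reset_pmds now
+ *    CTRL_MASK                 -> pfm_mask_monitoring(), state = MASKED,
+ *                                 reset deferred to pfm_restart()
+ *    CTRL_NOTIFY && has_notify -> notification queued last (blocking
+ *                                 only if the context requested it)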
+ */ + if (likely(!(ovfl_ctrl & PFM_OVFL_CTRL_MASK))) { + if (must_switch) { + /* + * pfm_switch_sets() takes care + * of resetting new set if needed + */ + pfm_switch_sets_from_intr(ctx); + + /* + * update our view of the active set + */ + set = ctx->active_set; + + must_switch = 0; + } else if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) { + u16 nn; + t0 = sched_clock(); + nn = bitmap_weight(cast_ulp(reset_pmds), max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT); + __get_cpu_var(pfm_stats).ccnt5 += sched_clock() - t0; + } + /* + * do not block if not masked + */ + ctx_block = 0; + } else { + pfm_mask_monitoring(ctx, set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + } + /* + * if we have not switched here, then remember for the + * time monitoring is restarted + */ + if (must_switch) + set->priv_flags |= PFM_SETFL_PRIV_SWITCH; + + /* + * block only if CTRL_NOTIFY+CTRL_MASK and requested by user + * + * Defer notification until last operation in the handler + * to avoid spinlock contention + */ + if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) { + if (ctx_block) { + ctx->flags.work_type = PFM_WORK_BLOCK; + set_thread_flag(TIF_PERFMON_WORK); + } + /* + * if message queue is full, then mask monitoring + * and wait for pfm_restart() + */ + if (pfm_ovfl_notify_user(ctx, set_orig, ip)) { + pfm_mask_monitoring(ctx, set); + ctx->state = PFM_CTX_MASKED; + ctx->flags.can_restart = 1; + } + } + + t1 = sched_clock(); + __get_cpu_var(pfm_stats).ccnt2 += t1 - t0; + + return; + +stop_monitoring: + /* + * Does not happen for a system-wide context nor for a + * self-monitored context. We cannot attach to kernel-only + * thread, thus it is safe to set TIF bits, i.e., the thread + * will eventually leave the kernel or die and either we will + * catch the context and clean it up in pfm_handler_work() or + * pfm_exit_thread(). + */ + PFM_DBG_ovfl("ctx is zombie, converted to spurious"); + + __pfm_stop(ctx); + ctx->flags.work_type = PFM_WORK_ZOMBIE; + set_thread_flag(TIF_PERFMON_WORK); +} + +/* + * interrupts are masked + * + * Context locking necessary to avoid concurrent accesses from other CPUs + * - For per-thread, we must prevent pfm_restart() which works when + * context is LOADED or MASKED + */ +static void __pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs) +{ + struct task_struct *task; + struct pfm_context *ctx; + struct pfm_event_set *set; + u64 t0; + + t0 = sched_clock(); + __get_cpu_var(pfm_stats).ovfl_intr_all_count++; + + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + if (unlikely(ctx == NULL)) { + PFM_DBG_ovfl("no ctx"); + goto spurious; + } + + spin_lock(&ctx->lock); + + set = ctx->active_set; + + /* + * For SMP per-thread, it is not possible to have + * owner != NULL && task != current. + * + * For UP per-thread, because of lazy save, it + * is possible to receive an interrupt in another task + * which is not using the PMU. This means + * that the interrupt was in-flight at the + * time of pfm_ctxswout_thread(). In that + * case it will be replayed when the task + * is scheduled again. Hence we convert to spurious. + * + * The basic rule is that an overflow is always + * processed in the context of the task that + * generated it for all per-thread contexts. 
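+ *
+ * Concrete UP example of the case handled below: task A owns the PMU, an
+ * overflow interrupt is still in flight when A is switched out (lazy save
+ * leaves the PMU state live) and task B becomes current; the handler then
+ * sees current->pfm_context != ctx, treats the interrupt as spurious and
+ * leaves it to be replayed when A runs again.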
+ * + * for system-wide, task is always NULL + */ +#ifndef CONFIG_SMP + if (unlikely((task && current->pfm_context != ctx))) { + PFM_DBG_ovfl("spurious: not owned by current task"); + goto spurious; + } +#endif + if (unlikely(!pfm_arch_is_active(ctx))) { + PFM_DBG_ovfl("spurious: monitoring non active"); + goto spurious; + } + + /* + * freeze PMU and collect overflowed PMD registers + * into set->povfl_pmds. Number of overflowed PMDs reported + * in set->npend_ovfls + */ + pfm_arch_intr_freeze_pmu(ctx, set); + if (unlikely(!set->npend_ovfls)) { + PFM_DBG_ovfl("no npend_ovfls"); + goto spurious; + } + + __get_cpu_var(pfm_stats).ovfl_intr_regular_count++; + + __get_cpu_var(pfm_stats).ccnt3 += sched_clock() - t0; + + pfm_overflow_handler(ctx, set, iip, regs); + + pfm_arch_intr_unfreeze_pmu(ctx); + + __get_cpu_var(pfm_stats).ccnt4 += sched_clock() - t0; + + spin_unlock(&ctx->lock); + + return; + +spurious: + /* ctx may be NULL */ + pfm_arch_intr_unfreeze_pmu(ctx); + if (ctx) + spin_unlock(&ctx->lock); +} + +void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs) +{ + u64 start; + + BUG_ON(!irqs_disabled()); + + start = sched_clock(); + + __pfm_interrupt_handler(iip, regs); + + __get_cpu_var(pfm_stats).ovfl_intr_ns += sched_clock() - start; +} +EXPORT_SYMBOL(pfm_interrupt_handler); + Index: linux-2.6/perfmon/perfmon_pmu.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_pmu.c @@ -0,0 +1,567 @@ +/* + * perfmon_pmu.c: perfmon2 PMU configuration management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +#ifndef CONFIG_MODULE_UNLOAD +#define module_refcount(n) 1 +#endif + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock); +static __cacheline_aligned_in_smp int request_mod_in_progress; +/* + * perfmon core must acces PMU information ONLY through pfm_pmu_conf + * if pfm_pmu_conf is NULL, then no description is registered + */ +struct pfm_pmu_config *pfm_pmu_conf; +EXPORT_SYMBOL(pfm_pmu_conf); + +static inline int pmu_is_module(struct pfm_pmu_config *c) +{ + return !(c->flags & PFM_PMUFL_IS_BUILTIN); +} + +/* + * compute the following: + * - max_pmc, num_pmcs + * - max_pmd, num_pmds, first_cnt_pmd, max_rw_pmd + * based on existing regdesc and valid pmc_desc and pmd_desc + */ +static void pfm_pmu_regdesc_calc_limits(struct pfm_regdesc *d) +{ + u16 n, n2, n_counters, i; + int max1, max2, max3, first_cnt, first_i; + + n = 0; + max1 = max2 = -1; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++) { + if (!test_bit(i, cast_ulp(d->pmcs))) + continue; + max1 = i; + n++; + } + d->max_pmc = max1 + 1; + d->num_pmcs = n; + + n = n_counters = n2 = 0; + max1 = max2 = max3 = first_cnt = first_i = -1; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++) { + if (!test_bit(i, cast_ulp(d->pmds))) + continue; + + if (first_i == -1) + first_i = i; + + max1 = i; + n++; + + /* + * read-write registers + */ + if (!(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) { + max3 = i; + n2++; + } + + /* + * counters registers + */ + if (pfm_pmu_conf->pmd_desc[i].type & PFM_REG_C64) { + max2 = i; + n_counters++; + if (first_cnt == -1) + first_cnt = i; + } + } + d->max_pmd = max1 + 1; + d->first_cnt_pmd = first_cnt == -1 ? first_i : first_cnt; + + /* guaranteed to be <= PFM_MAX_HW_PMDS */ + d->max_cnt_pmd = max2 + 1; + + d->num_counters = n_counters; + d->num_pmds = n; + d->max_rw_pmd = max3 + 1; + d->num_rw_pmd = n2; +} + +static int pfm_regdesc_init(struct pfm_regdesc *d, struct pfm_pmu_config *cfg) +{ + u16 n, n2, n_counters, i; + int max1, max2, max3, first_cnt, first_i; + + memset(d, 0 , sizeof(*d)); + /* + * compute the number of implemented PMC from the + * description tables + * + * We separate actual PMC registers from virtual + * PMC registers. Needed for PMC save/restore routines. 
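+ *
+ * Worked example of the limits computed below (register indices are made
+ * up for illustration): with PFM_REG_I set for PMDs {0, 1, 3, 17} and
+ * PFM_REG_C64 set only for {0, 1, 3}:
+ *
+ *   num_pmds = 4, max_pmd = 18 (highest implemented index + 1)
+ *   num_counters = 3, max_cnt_pmd = 4, first_cnt_pmd = 0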
+ */ + n = 0; + max1 = max2 = -1; + for (i = 0; i < cfg->num_pmc_entries; i++) { + if (!(cfg->pmc_desc[i].type & PFM_REG_I)) + continue; + + __set_bit(i, cast_ulp(d->pmcs)); + + max1 = i; + n++; + } + + if (!n) { + PFM_INFO("%s PMU description has no PMC registers", + cfg->pmu_name); + return -EINVAL; + } + + d->max_pmc = max1 + 1; + d->num_pmcs = n; + + n = n_counters = n2 = 0; + max1 = max2 = max3 = first_cnt = first_i = -1; + for (i = 0; i < cfg->num_pmd_entries; i++) { + if (!(cfg->pmd_desc[i].type & PFM_REG_I)) + continue; + + if (first_i == -1) + first_i = i; + + __set_bit(i, cast_ulp(d->pmds)); + max1 = i; + n++; + + /* + * read-write registers + */ + if (!(cfg->pmd_desc[i].type & PFM_REG_RO)) { + __set_bit(i, cast_ulp(d->rw_pmds)); + max3 = i; + n2++; + } + + /* + * counters registers + */ + if (cfg->pmd_desc[i].type & PFM_REG_C64) { + __set_bit(i, cast_ulp(d->cnt_pmds)); + max2 = i; + n_counters++; + if (first_cnt == -1) + first_cnt = i; + } + } + + if (!n) { + PFM_INFO("%s PMU description has no PMD registers", + cfg->pmu_name); + return -EINVAL; + } + + d->max_pmd = max1 + 1; + d->first_cnt_pmd = first_cnt == -1 ? first_i : first_cnt; + + /* guaranteed to be <= PFM_MAX_HW_PMDS */ + d->max_cnt_pmd = max2 + 1; + + d->num_counters = n_counters; + d->num_pmds = n; + d->max_rw_pmd = max3 + 1; + d->num_rw_pmd = n2; + + return 0; +} + +/* + * initialize PMU configuration from PMU config descriptor + */ +static int pfm_pmu_config_init(struct pfm_pmu_config *cfg) +{ + int ret; + + /* we build the register description using the full mapping + * table as defined by the module. For the first user, we update + * the current description (regs) based on local constraints, + * such as some register used by other subsystems + */ + ret = pfm_regdesc_init(&cfg->full_regs, cfg); + if (ret) + return ret; + + if (!cfg->version) + cfg->version = "0.0"; + + pfm_pmu_conf = cfg; + pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) -1; + + PFM_INFO("%s PMU detected, %u PMCs, %u PMDs, %u counters (%u bits)", + pfm_pmu_conf->pmu_name, + pfm_pmu_conf->full_regs.num_pmcs, + pfm_pmu_conf->full_regs.num_pmds, + pfm_pmu_conf->full_regs.num_counters, + pfm_pmu_conf->counter_width); + + return 0; +} + +int pfm_pmu_register(struct pfm_pmu_config *cfg) +{ + u16 i, nspec, nspec_ro, num_pmcs, num_pmds, num_wc = 0; + int type, ret = -EBUSY; + + if (perfmon_disabled) { + PFM_INFO("perfmon disabled, cannot add PMU description"); + return -ENOSYS; + } + + nspec = nspec_ro = num_pmds = num_pmcs = 0; + + /* some sanity checks */ + if (cfg == NULL || cfg->pmu_name == NULL) { + PFM_INFO("PMU config descriptor is invalid"); + return -EINVAL; + } + + /* must have a probe */ + if (cfg->probe_pmu == NULL) { + PFM_INFO("PMU config has no probe routine"); + return -EINVAL; + } + + /* + * execute probe routine before anything else as it + * may update configuration tables + */ + if ((*cfg->probe_pmu)() == -1) { + PFM_INFO("%s PMU detection failed", cfg->pmu_name); + return -EINVAL; + } + + if (!(cfg->flags & PFM_PMUFL_IS_BUILTIN) && cfg->owner == NULL) { + PFM_INFO("PMU config %s is missing owner", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmd_entries) { + PFM_INFO("%s needs to define num_pmd_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmc_entries) { + PFM_INFO("%s needs to define num_pmc_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->counter_width) { + PFM_INFO("PMU config %s, zero width counters", cfg->pmu_name); + return -EINVAL; + } + + /* + * REG_RO, REG_V not supported on 
PMC registers + */ + for (i = 0; i < cfg->num_pmc_entries; i++) { + + type = cfg->pmc_desc[i].type; + + if (type & PFM_REG_I) + num_pmcs++; + + if (type & PFM_REG_WC) + num_wc++; + + if (type & PFM_REG_V) { + PFM_INFO("PFM_REG_V is not supported on " + "PMCs (PMC%d)", i); + return -EINVAL; + } + if (type & PFM_REG_RO) { + PFM_INFO("PFM_REG_RO meaningless on " + "PMCs (PMC%u)", i); + return -EINVAL; + } + } + + if (num_wc && cfg->pmc_write_check == NULL) { + PFM_INFO("PMC have write-checker but no callback provided\n"); + return -EINVAL; + } + + /* + * check virtual PMD registers + */ + num_wc= 0; + for (i = 0; i < cfg->num_pmd_entries; i++) { + + type = cfg->pmd_desc[i].type; + + if (type & PFM_REG_I) + num_pmds++; + + if (type & PFM_REG_V) { + nspec++; + if (type & PFM_REG_RO) + nspec_ro++; + } + + if (type & PFM_REG_WC) + num_wc++; + + /* + * only up to HW_PMD can overflow, SW PMD cannot + */ + if (type & PFM_REG_C64 && i >= PFM_ARCH_MAX_HW_PMDS) { + PFM_INFO("overflowing PMD counters must be < %d", + PFM_ARCH_MAX_HW_PMDS); + return -EINVAL; + } + } + + if (num_wc && cfg->pmd_write_check == NULL) { + PFM_INFO("PMD have write-checker but no callback provided\n"); + return -EINVAL; + } + + if (nspec && cfg->pmd_sread == NULL) { + PFM_INFO("PMU config is missing pmd_sread()"); + return -EINVAL; + } + + nspec = nspec - nspec_ro; + if (nspec && cfg->pmd_swrite == NULL) { + PFM_INFO("PMU config is missing pmd_swrite()"); + return -EINVAL; + } + + if (num_pmcs >= PFM_MAX_PMCS) { + PFM_INFO("%s PMCS registers exceed name space [0-%u]", + cfg->pmu_name, + PFM_MAX_PMCS); + return -EINVAL; + } + + spin_lock(&pfm_pmu_conf_lock); + + if (pfm_pmu_conf) + goto unlock; + + ret = pfm_pmu_config_init(cfg); + if (ret) + goto unlock; + + ret = pfm_arch_pmu_config_init(pfm_pmu_conf); + if (ret) + goto unlock; + + ret = pfm_sysfs_add_pmu(pfm_pmu_conf); + if (ret) { + pfm_arch_pmu_config_remove(); + pfm_pmu_conf = NULL; + } + +unlock: + spin_unlock(&pfm_pmu_conf_lock); + + if (ret) { + PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret); + } else { + PFM_INFO("%s PMU installed", cfg->pmu_name); + /* + * (re)initialize PMU on each PMU now that we have a description + */ + on_each_cpu(__pfm_init_percpu, cfg, 0, 0); + } + return ret; +} +EXPORT_SYMBOL(pfm_pmu_register); + +/* + * remove PMU description. Caller must pass address of current + * configuration. This is mostly for sanity checking as only + * one config can exist at any time. + * + * We are using the module refcount mechanism to protect against + * removal while the configuration is being used. As long as there is + * one context, a PMU configuration cannot be removed. The protection is + * managed in module logic. 
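+ *
+ * The pairing that provides this protection looks like (caller-side
+ * sketch; the actual call sites are in the context code elsewhere in
+ * this patch):
+ *
+ *   pfm_pmu_conf_get(autoload);  /* try_module_get() on the description  */
+ *   ... context uses pfm_pmu_conf ...
+ *   pfm_pmu_conf_put();          /* module_put(), unregister possible    */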
+ */ +void pfm_pmu_unregister(struct pfm_pmu_config *cfg) +{ + if (!(cfg ||pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + + BUG_ON(module_refcount(pfm_pmu_conf->owner)); + + if (cfg->owner == pfm_pmu_conf->owner) { + pfm_arch_pmu_config_remove(); + pfm_sysfs_remove_pmu(pfm_pmu_conf); + pfm_pmu_conf = NULL; + } + + spin_unlock(&pfm_pmu_conf_lock); +} +EXPORT_SYMBOL(pfm_pmu_unregister); + +static int pfm_pmu_request_module(void) +{ + char *mod_name; + int ret; + + mod_name = pfm_arch_get_pmu_module_name(); + if (mod_name == NULL) + return -ENOSYS; + + ret = request_module(mod_name); + + PFM_DBG("mod=%s ret=%d\n", mod_name, ret); + return ret; +} + +/* + * autoload: + * 0 : do not try to autoload the PMU description module + * not 0 : try to autoload the PMU description module + */ +int pfm_pmu_conf_get(int autoload) +{ + int ret; + + spin_lock(&pfm_pmu_conf_lock); + + if (request_mod_in_progress) { + ret = -ENOSYS; + goto skip; + } + + if (autoload && pfm_pmu_conf == NULL) { + + request_mod_in_progress = 1; + + spin_unlock(&pfm_pmu_conf_lock); + + pfm_pmu_request_module(); + + spin_lock(&pfm_pmu_conf_lock); + + request_mod_in_progress = 0; + + /* + * request_module() may succeed but the module + * may not have registered properly so we need + * to check + */ + } + + ret = pfm_pmu_conf == NULL ? -ENOSYS : 0; + if (!ret && pmu_is_module(pfm_pmu_conf) + && !try_module_get(pfm_pmu_conf->owner)) + ret = -ENOSYS; +skip: + spin_unlock(&pfm_pmu_conf_lock); + + return ret; +} + +void pfm_pmu_conf_put(void) +{ + if (pfm_pmu_conf == NULL || !pmu_is_module(pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + module_put(pfm_pmu_conf->owner); + spin_unlock(&pfm_pmu_conf_lock); +} + +static __cacheline_aligned_in_smp atomic_t pfm_pmu_acquired; + +int pfm_pmu_acquire(void) +{ + int ret; + + PFM_DBG("before pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + + if (atomic_inc_return(&pfm_pmu_acquired) > 1) + return 0; + + PFM_DBG("after pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + /* + * copy full description and then check if arch-specific + * layer needs some adjustments + */ + pfm_pmu_conf->regs = pfm_pmu_conf->full_regs; + + ret = pfm_arch_pmu_acquire(); + if (ret) { + atomic_dec(&pfm_pmu_acquired); + return ret; + } + + /* + * calculate new limits (num, max) + */ + pfm_pmu_regdesc_calc_limits(&pfm_pmu_conf->regs); + + /* available PMU ressources */ + PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters", + pfm_pmu_conf->regs.num_pmcs, + pfm_pmu_conf->regs.num_pmds, + pfm_pmu_conf->regs.num_counters); + + if (!(pfm_pmu_conf->regs.num_pmcs && pfm_pmu_conf->regs.num_pmcs)) { + PFM_DBG("no usable PMU registers"); + pfm_arch_pmu_release(); + atomic_dec(&pfm_pmu_acquired); + return -EBUSY; + } + return 0; +} + +void pfm_pmu_release(void) +{ + PFM_DBG("pmu_acquired=%d", atomic_read(&pfm_pmu_acquired)); + + if (!atomic_dec_and_test(&pfm_pmu_acquired)) + return; + + pfm_arch_pmu_release(); + memset(&pfm_pmu_conf->regs, 0, sizeof(pfm_pmu_conf->regs)); + PFM_DBG("PMU released"); +} Index: linux-2.6/perfmon/perfmon_res.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_res.c @@ -0,0 +1,419 @@ +/* + * perfmon_res.c: perfmon2 resource allocations + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. 
+ * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * global information about all sessions + * mostly used to synchronize between system wide and per-process + */ +struct pfm_sessions { + u32 pfs_task_sessions;/* #num loaded per-thread sessions */ + size_t pfs_smpl_buffer_mem_cur; /* current smpl buf mem usage */ + cpumask_t pfs_sys_cpumask; /* bitmask of used cpus (system-wide) */ +}; + +static struct pfm_sessions pfm_sessions; + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_sessions_lock); + +/* + * sampling buffer allocated by perfmon must be + * checked against max usage thresholds for security + * reasons. + * + * The first level check is against the system wide limit + * as indicated by the system administrator in /proc/sys/kernel/perfmon + * + * The second level check is on a per-process basis using + * RLIMIT_MEMLOCK limit. + * + * Operating on the current task only. + */ +int pfm_reserve_buf_space(size_t size) +{ + struct mm_struct *mm; + unsigned long locked; + unsigned long buf_mem, buf_mem_max; + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + /* + * check against global buffer limit + */ + buf_mem_max = pfm_controls.smpl_buffer_mem_max; + buf_mem = pfm_sessions.pfs_smpl_buffer_mem_cur + size; + + if (buf_mem <= buf_mem_max) { + pfm_sessions.pfs_smpl_buffer_mem_cur = buf_mem; + + PFM_DBG("buf_mem_max=%lu current_buf_mem=%lu", + buf_mem_max, + buf_mem); + } + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + if (buf_mem > buf_mem_max) { + PFM_DBG("smpl buffer memory threshold reached"); + return -ENOMEM; + } + + /* + * check against RLIMIT_MEMLOCK + */ + mm = get_task_mm(current); + + down_write(&mm->mmap_sem); + + locked = mm->locked_vm << PAGE_SHIFT; + locked += size; + + if (locked > current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) { + + PFM_DBG("RLIMIT_MEMLOCK reached ask_locked=%lu rlim_cur=%lu", + locked, + current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur); + + up_write(&mm->mmap_sem); + mmput(mm); + goto unres; + } + + mm->locked_vm = locked >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + + mmput(mm); + + return 0; + +unres: + /* + * remove global buffer memory allocation + */ + spin_lock_irqsave(&pfm_sessions_lock, flags); + + pfm_sessions.pfs_smpl_buffer_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return -ENOMEM; +} +/* + *There exist multiple paths leading to this function. 
We need to + * be very careful withlokcing on the mmap_sem as it may already be + * held by the time we come here. + * The following paths exist: + * + * exit path: + * sys_exit_group + * do_group_exit + * do_exit + * exit_mm + * mmput + * exit_mmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * munmap path: + * sys_munmap + * do_munmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * close path: + * sys_close + * filp_close + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * The issue is that on the munmap() path, the mmap_sem is already held + * in write-mode by the time we come here. To avoid the deadlock, we need + * to know where we are coming from and skip down_write(). If is fairly + * difficult to know this because of the lack of good hooks and + * the fact that, there may not have been any mmap() of the sampling buffer + * (i.e. create_context() followed by close() or exit()). + * + * We use a set flag ctx->flags.mmap_nlock which is toggle in the vm_ops + * callback in remove_vma() which is called systematically for the call, so + * on all but the pure close() path. The exit path does not already hold + * the lock but this is exit so there is no task->mm by the time we come here. + * + * The mmap_nlock is set only when unmapping and this is the LAST reference + * to the file (i.e., close() followed by munmap()). + */ +void pfm_release_buf_space(struct pfm_context *ctx, size_t size) +{ + unsigned long flags; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (mm) { + if (ctx->flags.mmap_nlock == 0) { + PFM_DBG("doing down_write"); + down_write(&mm->mmap_sem); + } + + mm->locked_vm -= size >> PAGE_SHIFT; + + PFM_DBG("locked_vm=%lu size=%zu", mm->locked_vm, size); + + if (ctx->flags.mmap_nlock == 0) + up_write(&mm->mmap_sem); + + mmput(mm); + } + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + pfm_sessions.pfs_smpl_buffer_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); +} + +int pfm_reserve_session(int is_system, u32 cpu) +{ + unsigned long flags; + u32 nsys_cpus; + int ret = 0; + + /* + * validy checks on cpu_mask have been done upstream + */ + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions, + is_system, + cpu); + + if (is_system) { + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_sessions.pfs_task_sessions > 0) { + PFM_DBG("%u conflicting task_sessions", + pfm_sessions.pfs_task_sessions); + ret = -EBUSY; + goto abort; + } + + if (cpu_isset(cpu, pfm_sessions.pfs_sys_cpumask)) { + PFM_DBG("conflicting session on CPU%u", cpu); + ret = -EBUSY; + goto abort; + } + + PFM_DBG("reserved session on CPU%u", cpu); + + cpu_set(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus++; + } else { + if (nsys_cpus) { + ret = -EBUSY; + goto abort; + } + pfm_sessions.pfs_task_sessions++; + } + + PFM_DBG("out sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions, + is_system, + cpu); + +abort: + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return ret; +} + +/* + * called from __pfm_unload_context() + */ +int pfm_release_session(int is_system, u32 cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + PFM_DBG("in sys_sessions=%u task_sessions=%u 
syswide=%d cpu=%u", + cpus_weight(pfm_sessions.pfs_sys_cpumask), + pfm_sessions.pfs_task_sessions, + is_system, cpu); + + if (is_system) + cpu_clear(cpu, pfm_sessions.pfs_sys_cpumask); + else + pfm_sessions.pfs_task_sessions--; + + PFM_DBG("out sys_sessions=%u task_sessions=%u syswide=%d cpu=%u", + cpus_weight(pfm_sessions.pfs_sys_cpumask), + pfm_sessions.pfs_task_sessions, + is_system, cpu); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + return 0; +} + +int pfm_reserve_allcpus(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + if (nsys_cpus) { + PFM_DBG("already some system-wide sessions"); + goto abort; + } + + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_sessions.pfs_task_sessions) { + PFM_DBG("%u conflicting task_sessions", + pfm_sessions.pfs_task_sessions); + goto abort; + } + + for_each_online_cpu(cpu) { + cpu_set(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus++; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return 0; + +abort: + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return -EBUSY; +} +EXPORT_SYMBOL(pfm_reserve_allcpus); + +int pfm_release_allcpus(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + nsys_cpus = cpus_weight(pfm_sessions.pfs_sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + /* + * XXX: could use __cpus_clear() with nbits + */ + for_each_online_cpu(cpu) { + cpu_clear(cpu, pfm_sessions.pfs_sys_cpumask); + nsys_cpus--; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_sessions.pfs_task_sessions); + + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + + return 0; +} +EXPORT_SYMBOL(pfm_release_allcpus); + +/* + * called from perfmon_sysfs.c: + * what=0 : pfs_task_sessions + * what=1 : cpus_weight(pfs_sys_cpumask) + * what=2 : smpl_buffer_mem_cur + * what=3 : pmu model name + * + * return number of bytes written into buf (up to sz) + */ +ssize_t pfm_sysfs_session_show(char *buf, size_t sz, int what) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_sessions_lock, flags); + + switch (what) { + case 0: snprintf(buf, sz, "%u\n", pfm_sessions.pfs_task_sessions); + break; + case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_sessions.pfs_sys_cpumask)); + break; + case 2: snprintf(buf, sz, "%zu\n", pfm_sessions.pfs_smpl_buffer_mem_cur); + break; + case 3: + snprintf(buf, sz, "%s\n", + pfm_pmu_conf ? pfm_pmu_conf->pmu_name + : "unknown\n"); + } + spin_unlock_irqrestore(&pfm_sessions_lock, flags); + return strlen(buf); +} Index: linux-2.6/perfmon/perfmon_rw.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_rw.c @@ -0,0 +1,602 @@ +/* + * perfmon.c: perfmon2 PMC/PMD read/write system calls + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. 
+ * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +#define PFM_REGFL_PMC_ALL (PFM_REGFL_NO_EMUL64|PFM_REG_RETFL_MASK) +#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM | \ + PFM_REGFL_OVFL_NOTIFY| \ + PFM_REG_RETFL_MASK) +/* + * function called from sys_pfm_write_pmds() to write the + * requested PMD registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * compat: is used only on IA-64 to maintain backward compatibility with v2.0 + * + * The context is locked and interrupts are disabled. + */ +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat) +{ + struct pfm_event_set *set, *active_set; + u64 value, ovfl_mask; + u64 *smpl_pmds, *reset_pmds, *impl_pmds, *impl_rw_pmds; + u32 req_flags, flags; + u16 cnum, pmd_type, max_pmd, max_pmc; + u16 set_id, prev_set_id; + int i, can_access_pmu; + int is_counter; + int ret, error_code; + pfm_pmd_check_t wr_func; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + active_set = ctx->active_set; + max_pmd = pfm_pmu_conf->regs.max_pmd; + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmds = pfm_pmu_conf->regs.pmds; + impl_rw_pmds = pfm_pmu_conf->regs.rw_pmds; + wr_func = pfm_pmu_conf->pmd_write_check; + set = NULL; + + prev_set_id = 0; + can_access_pmu = 0; + + /* + * we cannot access the actual PMD registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + error_code = PFM_REG_RETFL_EINVAL; + ret = -EINVAL; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + req_flags = req->reg_flags; + smpl_pmds = req->reg_smpl_pmds; + reset_pmds = req->reg_reset_pmds; + flags = 0; + + if (unlikely(cnum >= max_pmd || !test_bit(cnum, cast_ulp(impl_pmds)))) { + PFM_DBG("pmd%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + is_counter = pmd_type & PFM_REG_C64; + + if (likely(!compat && is_counter)) { + /* + * ensure only valid flags are set + */ + if (req_flags & ~(PFM_REGFL_PMD_ALL)) { + PFM_DBG("pmd%u: invalid flags=0x%x", + cnum, req_flags); + goto error; + } + + if (req_flags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + if (req_flags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + /* + * verify validity of smpl_pmds + */ + if (unlikely(!bitmap_subset(cast_ulp(smpl_pmds), + cast_ulp(impl_pmds), + max_pmd))) { + PFM_DBG("invalid smpl_pmds=0x%llx " + "for pmd%u", + (unsigned long long)smpl_pmds[0], + cnum); + goto error; + } + /* 
+ * verify validity of reset_pmds + * check against impl_rw_pmds because it is not + * possible to reset read-only PMDs + */ + if (unlikely(!bitmap_subset(cast_ulp(reset_pmds), + cast_ulp(impl_rw_pmds), + max_pmd))) { + PFM_DBG("invalid reset_pmds=0x%llx " + "for pmd%u", + (unsigned long long)reset_pmds[0], + cnum); + goto error; + } + + } + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + + /* + * execute write checker, if any + */ + if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) { + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + + } + + value = req->reg_value; + + /* + * now commit changes to software state + */ + + if (likely(is_counter)) { + if (likely(!compat)) { + set->pmds[cnum].flags = flags; + + /* + * copy reset and sampling bitvectors + */ + bitmap_copy(cast_ulp(set->pmds[cnum].reset_pmds), + cast_ulp(reset_pmds), + max_pmd); + + bitmap_copy(cast_ulp(set->pmds[cnum].smpl_pmds), + cast_ulp(smpl_pmds), + max_pmd); + + set->pmds[cnum].eventid = req->reg_smpl_eventid; + + /* + * Mark reset/smpl PMDS as used. + * + * We do not keep track of PMC because we have to + * systematically restore ALL of them. + */ + bitmap_or(cast_ulp(set->used_pmds), + cast_ulp(set->used_pmds), + cast_ulp(reset_pmds), max_pmd); + + bitmap_or(cast_ulp(set->used_pmds), + cast_ulp(set->used_pmds), + cast_ulp(smpl_pmds), max_pmd); + + /* + * we reprogrammed the PMD hence, clear any pending + * ovfl, switch based on the old value + * for restart we have already established new values + */ + if (test_bit(cnum, cast_ulp(set->povfl_pmds))) { + set->npend_ovfls--; + __clear_bit(cnum, cast_ulp(set->povfl_pmds)); + } + __clear_bit(cnum, cast_ulp(set->ovfl_pmds)); + + /* + * update ovfl_notify + */ + if (flags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, cast_ulp(set->ovfl_notify)); + else + __clear_bit(cnum, cast_ulp(set->ovfl_notify)); + } + /* + * reset last value to new value + */ + set->pmds[cnum].lval = value; + + /* + * establish new switch count + */ + set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt; + set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt; + } + + /* + * update reset values (not just for counters) + */ + set->pmds[cnum].long_reset = req->reg_long_reset; + set->pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization mask + */ + set->pmds[cnum].mask = req->reg_random_mask; + + /* + * update set values + */ + set->pmds[cnum].value = value; + + __set_bit(cnum, cast_ulp(set->used_pmds)); + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS; + if (can_access_pmu) + pfm_write_pmd(ctx, cnum, value); + } + + /* + * update number of used PMD registers + */ + set->nused_pmds = bitmap_weight(cast_ulp(set->used_pmds), max_pmd); + + pfm_retflag_set(req->reg_flags, 0); + + prev_set_id = set_id; + + PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d " + "ctx_pmd=0x%llx s_reset=0x%llx " + "l_reset=0x%llx u_pmds=0x%llx nu_pmds=%u " + "s_pmds=0x%llx r_pmds=0x%llx o_pmds=0x%llx " + "o_thres=%llu compat=%d eventid=%llx", + set->id, + cnum, + (unsigned long long)value, + set->pmds[cnum].flags, + can_access_pmu, + (unsigned long long)set->pmds[cnum].value, + (unsigned long long)set->pmds[cnum].short_reset, + (unsigned long long)set->pmds[cnum].long_reset, + (unsigned long long)set->used_pmds[0], + set->nused_pmds, + (unsigned long 
long)set->pmds[cnum].smpl_pmds[0], + (unsigned long long)set->pmds[cnum].reset_pmds[0], + (unsigned long long)set->ovfl_pmds[0], + (unsigned long long)set->pmds[cnum].ovflsw_thres, + compat, + (unsigned long long)set->pmds[cnum].eventid); + } + + /* + * make changes visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return 0; + +error: + /* + * for now, we have only one possibility for error + */ + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmd%u error=%d", set_id, cnum, error_code); + return ret; +} + +/* + * function called from sys_pfm_write_pmcs() to write the + * requested PMC registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * The context is locked and interrupts are disabled. + */ +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count) +{ + struct pfm_event_set *set, *active_set; + u64 value, dfl_val, rsvd_msk; + u64 *impl_pmcs; + int i, can_access_pmu; + int ret, error_code; + u16 set_id, prev_set_id; + u16 cnum, pmc_type, max_pmc; + u32 flags; + pfm_pmc_check_t wr_func; + + active_set = ctx->active_set; + + wr_func = pfm_pmu_conf->pmc_write_check; + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + set = NULL; + prev_set_id = 0; + can_access_pmu = 0; + + /* + * we cannot access the actual PMC registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + error_code = PFM_REG_RETFL_EINVAL; + + for (i = 0; i < count; i++, req++) { + + ret = -EINVAL; + cnum = req->reg_num; + set_id = req->reg_set; + value = req->reg_value; + flags = req->reg_flags; + + /* + * no access to unimplemented PMC register + */ + if (unlikely(cnum >= max_pmc + || !test_bit(cnum, cast_ulp(impl_pmcs)))) { + PFM_DBG("pmc%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmc_type = pfm_pmu_conf->pmc_desc[cnum].type; + dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk; + + /* + * ensure only valid flags are set + */ + if (flags & ~PFM_REGFL_PMC_ALL) { + PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags); + goto error; + } + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + + /* + * set reserved bits to default values + * (reserved bits must be 1 in rsvd_msk) + */ + value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk); + + if (flags & PFM_REGFL_NO_EMUL64) { + if (!(pmc_type & PFM_REG_NO64)) { + PFM_DBG("pmc%u no support for " + "PFM_REGFL_NO_EMUL64", cnum); + goto error; + } + value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk; + } + + /* + * execute write checker, if any + */ + if (likely(wr_func && (pmc_type & PFM_REG_WC))) { + req->reg_value = value; + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + value = req->reg_value; + } + + /* + * Now we commit the changes + */ + + /* + * mark PMC register as used + * We do not track associated PMC register based on + * the fact that they will likely need to be written + * in order to become useful at which point the statement + * below will catch that. 
+ * + * The used_pmcs bitmask is only useful on architectures where + * the PMC needs to be modified for particular bits, especially + * on overflow or to stop/start. + */ + if (!test_bit(cnum, cast_ulp(set->used_pmcs))) { + __set_bit(cnum, cast_ulp(set->used_pmcs)); + set->nused_pmcs++; + } + + set->pmcs[cnum] = value; + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + if (can_access_pmu) + pfm_arch_write_pmc(ctx, cnum, value); + } + + pfm_retflag_set(req->reg_flags, 0); + + prev_set_id = set_id; + + PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d " + "u_pmcs=0x%llx nu_pmcs=%u", + set->id, + cnum, + (unsigned long long)value, + can_access_pmu, + (unsigned long long)set->used_pmcs[0], + set->nused_pmcs); + } + /* + * make sure the changes are visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return 0; +error: + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmc%u error=0x%08x", set_id, cnum, error_code); + return ret; +} + +/* + * function called from sys_pfm_read_pmds() to read the 64-bit value of + * requested PMD registers. The function succeeds whether the context is + * attached or not. When attached to another thread, that thread must be + * stopped. + * + * The context is locked and interrupts are disabled. + */ +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count) +{ + u64 val = 0, lval, ovfl_mask, hw_val; + u64 sw_cnt; + u64 *impl_pmds; + struct pfm_event_set *set, *active_set; + int i, can_access_pmu = 0; + int error_code; + u16 cnum, pmd_type, set_id, prev_set_id, max_pmd; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = pfm_pmu_conf->regs.pmds; + max_pmd = pfm_pmu_conf->regs.max_pmd; + active_set = ctx->active_set; + set = NULL; + prev_set_id = 0; + + if (likely(ctx->state == PFM_CTX_LOADED)) { + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + if (can_access_pmu) + pfm_arch_serialize(); + } + error_code = PFM_REG_RETFL_EINVAL; + + /* + * on both UP and SMP, we can only read the PMD from the hardware + * register when the task is the owner of the local PMU. + */ + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + + if (unlikely(cnum >= max_pmd + || !test_bit(cnum, cast_ulp(impl_pmds)))) { + PFM_DBG("pmd%u is not implemented/unaccessible", cnum); + error_code = PFM_REG_RETFL_NOTAVAIL; + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + + /* + * locate event set + */ + if (i == 0 || set_id != prev_set_id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + error_code = PFM_REG_RETFL_NOSET; + goto error; + } + } + /* + * it is not possible to read a PMD which was not requested: + * - explicitly written via pfm_write_pmds() + * - provided as a reg_smpl_pmds[] to another PMD during + * pfm_write_pmds() + * + * This is motivated by security and for optimization purposes: + * - on context switch restore, we can restore only what we + * use (except when regs directly readable at user level, + * e.g., IA-64 self-monitoring, I386 RDPMC). 
+ * - do not need to maintain PMC -> PMD dependencies + */ + if (unlikely(!test_bit(cnum, cast_ulp(set->used_pmds)))) { + PFM_DBG("pmd%u cannot be read, because never " + "requested", cnum); + goto error; + } + + val = set->pmds[cnum].value; + lval = set->pmds[cnum].lval; + + /* + * extract remaining ovfl to switch + */ + sw_cnt = set->pmds[cnum].ovflsw_thres; + + /* + * If the task is not the current one, then we check if the + * PMU state is still in the local live register due to lazy + * ctxsw. If true, then we read directly from the registers. + */ + if (set == active_set && can_access_pmu) { + hw_val = pfm_read_pmd(ctx, cnum); + if (pmd_type & PFM_REG_C64) + val = (val & ~ovfl_mask) | (hw_val & ovfl_mask); + else + val = hw_val; + } + + PFM_DBG("set%u pmd%u=0x%llx sw_thr=%llu lval=0x%llx", + set->id, + cnum, + (unsigned long long)val, + (unsigned long long)sw_cnt, + (unsigned long long)lval); + + pfm_retflag_set(req->reg_flags, 0); + + req->reg_value = val; + req->reg_last_reset_val = lval; + req->reg_ovfl_switch_cnt = sw_cnt; + + prev_set_id = set_id; + } + return 0; +error: + pfm_retflag_set(req->reg_flags, error_code); + PFM_DBG("set%u pmd%u error=%d", set_id, cnum, error_code); + return -EINVAL; +} Index: linux-2.6/perfmon/perfmon_sets.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_sets.c @@ -0,0 +1,833 @@ +/* + * perfmon_sets.c: perfmon2 event sets and multiplexing functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +static struct kmem_cache *pfm_set_cachep; + +/* + * reload reference overflow switch thresholds + */ +static void pfm_reload_switch_thresholds(struct pfm_event_set *set) +{ + u64 *mask; + u16 i, max_cnt_pmd, first_cnt_pmd; + + mask = set->used_pmds; + first_cnt_pmd = pfm_pmu_conf->regs.first_cnt_pmd; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + for (i = first_cnt_pmd; i< max_cnt_pmd; i++) { + if (test_bit(i, cast_ulp(mask))) { + set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres; + PFM_DBG("set%u pmd%u ovflsw_thres=%llu", + set->id, + i, + (unsigned long long)set->pmds[i].ovflsw_thres); + } + } +} + +/* + * ensures that all id_next sets exists such that the round-robin + * will work correctly, i.e., next dangling references. + */ +int pfm_prepare_sets(struct pfm_context *ctx, struct pfm_event_set *act_set) +{ + struct pfm_event_set *set1, *set2; + u16 max_cnt_pmd; +#define is_last_set(s, c) ((s)->list.next == &(c)->list) + + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + list_for_each_entry(set1, &ctx->list, list) { + + if (is_last_set(set1, ctx)) + set2 = list_entry(ctx->list.next, + struct pfm_event_set, list); + else + set2 = list_entry(set1->list.next, + struct pfm_event_set, list); + /* + * update field used during actual switching + */ + set1->sw_next = set2; + + PFM_DBG("set%u sw_next=%u", set1->id, set2->id); + + /* + * cleanup bitvectors + */ + bitmap_zero(cast_ulp(set1->ovfl_pmds), max_cnt_pmd); + bitmap_zero(cast_ulp(set1->povfl_pmds), max_cnt_pmd); + set1->npend_ovfls = 0; + /* + * we cannot just use plain clear because of arch-specific flags + */ + set1->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH); + + /* + * reset activation and elapsed ns + */ + set1->duration = 0; + + set1->runs = 0; + } + /* + * setting PFM_CPUINFO_TIME_SWITCH, triggers + * further checking if __pfm_handle_switch_timeout(). + * switch timeout is effectively decremented only when + * monitoring has been activated via pfm_start() or + * any user level equivalent. + */ + if (act_set->flags & PFM_SETFL_TIME_SWITCH) { + + act_set->timeout_sw_left = act_set->timeout_sw_ref; + PFM_DBG("arming timeout for set%u", act_set->id); + + if (ctx->flags.system) + __get_cpu_var(pfm_syst_info) = PFM_CPUINFO_TIME_SWITCH; + + } else if (act_set->flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(act_set); + + return 0; +} + +/* + * called from *_timer_interrupt(). task == current + */ +void __pfm_handle_switch_timeout(void) +{ + struct pfm_event_set *set; + struct pfm_context *ctx; + unsigned long flags; + + /* + * The timer tick check is operating on each + * CPU. Not all CPUs have time switching enabled + * hence we need to check. + */ + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + set = ctx->active_set; + BUG_ON(set == NULL); + + /* + * we decrement only when attached and not masked or zombie + */ + if (ctx->state != PFM_CTX_LOADED) + goto done; + + /* + * do not decrement timeout unless monitoring is active. 
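+ *
+ * Example (HZ = 250 is only an assumed value for illustration): with
+ * timeout_sw_ref = 25, timeout_sw_left reaches zero after 25 timer ticks,
+ * i.e. the active set is switched via pfm_switch_sets() roughly every
+ * 100ms while monitoring is active.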
+ */ + if (!pfm_arch_is_active(ctx)) + goto done; + + set->timeout_sw_left--; + + __get_cpu_var(pfm_stats).handle_timeout_count++; + + if (!set->timeout_sw_left) + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0); +done: + spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * + * always operating on the current task + * interrupts are masked + * + * input: + * - new_set: new set to switch to, if NULL follow normal chain + */ +void pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart) +{ + struct pfm_event_set *set; + u64 switch_count; + u64 now, end; + unsigned long info = 0; + u32 new_flags; + int is_system, state, is_active, nn; + + now = sched_clock(); + set = ctx->active_set; + is_active = pfm_arch_is_active(ctx); + + /* + * if no set is explicitly requested, + * use the set_switch_next field + */ + if (new_set == NULL) { + /* + * we use round-robin unless the user specified + * a particular set to go to. + */ + new_set = set->sw_next; + BUG_ON(new_set == NULL); + } + + PFM_DBG("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx", + ctx->state, + is_active, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + reset_mode, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + state = ctx->state; + new_flags = new_set->flags; + switch_count = __get_cpu_var(pfm_stats).set_switch_count; + + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + + if (is_active) { + pfm_arch_stop(current, ctx, set); + + pfm_save_pmds(ctx, set); + + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + /* + * must be done after pfm_arch_stop() + */ + if (is_system) + info = __get_cpu_var(pfm_syst_info); + } + pfm_arch_restore_pmds(ctx, new_set); + + /* + * if masked, we must restore the pmcs such that they + * do not capture anything. + */ + pfm_arch_restore_pmcs(ctx, new_set); + + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + +skip_same_set: + switch_count++; + new_set->runs++; + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(new_set); + else if (new_flags & PFM_SETFL_TIME_SWITCH) + new_set->timeout_sw_left = new_set->timeout_sw_ref; + + /* + * reset overflowed PMD registers + */ + nn = bitmap_weight(cast_ulp(new_set->reset_pmds), + pfm_pmu_conf->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, new_set, nn, reset_mode); + /* + * this is needed when coming from pfm_start() + */ + if (no_restart) + goto skip_restart; + + /* + * reactivate monitoring + */ + if (is_system) { + info &= ~PFM_CPUINFO_TIME_SWITCH; + + if (new_flags & PFM_SETFL_TIME_SWITCH) + info |= PFM_CPUINFO_TIME_SWITCH; + + __get_cpu_var(pfm_syst_info) = info; + + PFM_DBG("new_set=%u info=0x%lx flags=0x%x", + new_set->id, + info, + new_flags); + } + + if (is_active) { + pfm_arch_start(current, ctx, new_set); + new_set->duration_start = now; + } + +skip_restart: + ctx->active_set = new_set; + ctx->set_all_runs++; + + end = sched_clock(); + + __get_cpu_var(pfm_stats).set_switch_count = switch_count; + __get_cpu_var(pfm_stats).set_switch_ns += end - now; +} + +/* + * called from __pfm_overflow_handler() to switch event sets. 
+ * monitoring is stopped, task is current, interrupts are masked. + * compared to pfm_switch_sets(), this version is simplified because + * it know about the call path. There is no need to stop monitoring + * because it is already frozen by PMU handler. + */ +void pfm_switch_sets_from_intr(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *new_set; + u64 switch_count; + u64 now, end; + u32 new_flags; + unsigned long info = 0; + int is_system, state, n; + + now = sched_clock(); + set = ctx->active_set; + new_set = set->sw_next; + + PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d new_r_pmds=%llx", + ctx->state, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + state = ctx->state; + new_flags = new_set->flags; + switch_count = __get_cpu_var(pfm_stats).set_switch_count; + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + /* + * when called from PMU intr handler, monitoring + * is already stopped + * + * save current PMD registers, we use a special + * form for performance reason. On some architectures, + * such as x86, the pmds are already saved when entering + * the PMU interrupt handler via pfm-arch_intr_freeze() + * so we don't need to save them again. On the contrary, + * on IA-64, they are not saved by freeze, thus we have to + * to it here. + */ + pfm_arch_save_pmds_from_intr(ctx, set); + + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + + if (is_system) + info = __get_cpu_var(pfm_syst_info); + + pfm_arch_restore_pmds(ctx, new_set); + __get_cpu_var(pfm_stats).ccnt6 += sched_clock() - now; + /* + * must not be restored active as we are still executing in the + * PMU interrupt handler. activation is deferred to unfreeze PMU + */ + pfm_arch_restore_pmcs(ctx, new_set); + + /* + * check for pending interrupt on incoming set. 
+ * interrupts are masked so handler call deferred + */ + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(); + __get_cpu_var(pfm_stats).ovfl_intr_replay_count++; + } + /* + * no need to restore anything, that is already done + */ + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + /* + * reset duration counter + */ + new_set->duration_start = now; + +skip_same_set: + switch_count++; + new_set->runs++; + + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(new_set); + else if (new_flags & PFM_SETFL_TIME_SWITCH) + new_set->timeout_sw_left = new_set->timeout_sw_ref; + + /* + * reset overflowed PMD registers + */ + n = bitmap_weight(cast_ulp(new_set->reset_pmds), pfm_pmu_conf->regs.max_pmd); + if (n) + pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT); + + /* + * reactivate monitoring + */ + if (is_system) { + info &= ~PFM_CPUINFO_TIME_SWITCH; + + if (new_flags & PFM_SETFL_TIME_SWITCH) + info |= PFM_CPUINFO_TIME_SWITCH; + + __get_cpu_var(pfm_syst_info) = info; + + PFM_DBG("new_set=%u info=0x%lx flags=0x%x", + new_set->id, + info, + new_flags); + } + + ctx->active_set = new_set; + ctx->set_all_runs++; + + end = sched_clock(); + + __get_cpu_var(pfm_stats).set_switch_count = switch_count; + __get_cpu_var(pfm_stats).set_switch_ns += end - now; +} + + +static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) + int ret; + + ret = pfm_arch_setfl_sane(ctx, flags); + if (ret) + return ret; + + if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) { + PFM_DBG("both switch ovfl and switch time are set"); + return -EINVAL; + } + return 0; +} + +/* + * it is never possible to change the identification of an existing set + */ +static int __pfm_change_evtset(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_setdesc *req) +{ + u32 flags; + u16 set_id; + struct timespec tv; + unsigned long ji; + int ret; + + BUG_ON(ctx->state == PFM_CTX_LOADED); + + set_id = req->set_id; + flags = req->set_flags; + + ret = pfm_setfl_sane(ctx, flags); + if (ret) { + PFM_DBG("invalid flags 0x%x set %u", flags, set_id); + return -EINVAL; + } + + /* + * commit changes + * + * note that we defer checking the validity of set_id_next until the + * context is actually attached. 
This is the only moment where we can + * safely assess the sanity of the sets because sets cannot be changed + * or deleted once the context is attached + */ + set->id = set_id; + set->flags = flags; + set->priv_flags = 0; + + /* + * reset pointer to next set + */ + set->sw_next = NULL; + + tv.tv_sec = 0; + tv.tv_nsec = req->set_timeout; + ji = timespec_to_jiffies(&tv); + + /* + * verify that timeout is not 0 + */ + if (!ji && (flags & PFM_SETFL_TIME_SWITCH) != 0) { + PFM_DBG("invalid timeout=0"); + return -EINVAL; + } + + set->timeout_sw_ref = set->timeout_sw_left = ji; + + PFM_DBG("set%u flags=0x%x req_nsec=%llu" + " jiffies=%lu HZ=%u TICK_NSEC=%lu eff_nsec=%lu", + set_id, + flags, + (unsigned long long)req->set_timeout, + ji, + HZ, TICK_NSEC, + ji * TICK_NSEC); + + /* + * return actual timeout in nsecs + */ + jiffies_to_timespec(ji, &tv); + req->set_timeout = tv.tv_sec * 1000000000 + tv.tv_nsec; + + return 0; +} + +/* + * this function does not modify the next field + */ +void pfm_init_evtset(struct pfm_event_set *set) +{ + u64 *impl_pmcs; + u16 i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + impl_pmcs = pfm_pmu_conf->regs.pmcs; + + /* + * install default values for all PMC registers + */ + for (i=0; i < max_pmc; i++) { + if (test_bit(i, cast_ulp(impl_pmcs))) { + set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val; + PFM_DBG("set%u pmc%u=0x%llx", + set->id, + i, + (unsigned long long)set->pmcs[i]); + } + } + + /* + * PMD registers are set to 0 when the event set is allocated, + * hence we do not need to explicitly initialize them. + * + * For virtual PMD registers (i.e., those tied to a SW resource) + * their value becomes meaningful once the context is attached. + */ +} + +/* + * look for an event set using its identification. If the set does not + * exist: + * - if alloc == 0 then return error + * - if alloc == 1 then allocate set + * + * alloc is one ONLY when coming from pfm_create_evtsets() which can only + * be called when the context is detached, i.e. monitoring is stopped. + */ +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, + int alloc) +{ + struct pfm_event_set *set, *new_set, *prev; + + PFM_DBG("looking for set=%u", set_id); + + /* + * shortcut for set0: always exist, cannot be removed + */ + if (!(set_id || alloc)) + return list_entry(ctx->list.next, struct pfm_event_set, list); + + prev = NULL; + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + return set; + if (set->id > set_id) + break; + prev = set; + } + + if (!alloc) + return NULL; + + /* + * when alloc == 1, context is detached, monitoring is stopped and + * interrupts are enabled but context is locked + */ + + /* + * we are holding the context spinlock and interrupts + * are unmasked. We must use GFP_ATOMIC as we cannot + * sleep while holding a spin lock. + */ + new_set = kmem_cache_zalloc(pfm_set_cachep, GFP_ATOMIC); + if (!new_set) + return NULL; + + new_set->id = set_id; + + INIT_LIST_HEAD(&new_set->list); + + if (prev == NULL) { + list_add(&(new_set->list), &ctx->list); + } else { + PFM_DBG("add after set=%u", prev->id); + list_add(&(new_set->list), &prev->list); + } + return new_set; +} + +/* + * context is unloaded for this command. 
Interrupts are enabled + */ +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count) +{ + struct pfm_event_set *set; + u16 set_id; + int i, ret; + + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + PFM_DBG("set_id=%u", set_id); + + set = pfm_find_set(ctx, set_id, 1); + if (set == NULL) + goto error_mem; + + ret = __pfm_change_evtset(ctx, set, req); + if (ret) + goto error_params; + + pfm_init_evtset(set); + } + return 0; +error_mem: + PFM_DBG("cannot allocate set %u", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -ENOMEM; +error_params: + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return ret; +} + +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count) +{ + struct pfm_event_set *set; + int i, is_system, is_loaded; + u16 set_id; + int max_cnt_pmd; + u64 end; + + end = sched_clock(); + is_system = ctx->flags.system; + is_loaded = ctx->state == PFM_CTX_LOADED; + max_cnt_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + + for (i = 0; i < count; i++, req++) { + + set_id = req->set_id; + + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } +found: + /* + * compute leftover timeout + */ + req->set_flags = set->flags; + req->set_timeout = set->timeout_sw_left * TICK_NSEC; + req->set_runs = set->runs; + req->set_act_duration = set->duration; + + /* + * adjust for active set if needed + */ + if (is_system && is_loaded && ctx->flags.started + && set == ctx->active_set) + req->set_act_duration += end - set->duration_start; + + /* + * copy the list of pmds which last overflowed for + * the set + */ + bitmap_copy(cast_ulp(req->set_ovfl_pmds), + cast_ulp(set->ovfl_pmds), + max_cnt_pmd); + + /* + * copy bitmask of available PMU registers + */ + bitmap_copy(cast_ulp(req->set_avail_pmcs), + cast_ulp(pfm_pmu_conf->regs.pmcs), + pfm_pmu_conf->regs.max_pmc); + + bitmap_copy(cast_ulp(req->set_avail_pmds), + cast_ulp(pfm_pmu_conf->regs.pmds), + pfm_pmu_conf->regs.max_pmd); + + pfm_retflag_set(req->set_flags, 0); + + PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu " + "a_pmcs=0x%llx a_pmds=0x%llx", + set_id, + set->flags, + (unsigned long long)req->set_timeout, + (unsigned long long)set->runs, + (unsigned long long)pfm_pmu_conf->regs.pmcs[0], + (unsigned long long)pfm_pmu_conf->regs.pmds[0]); + } + return 0; +error: + PFM_DBG("set%u not found", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -EINVAL; +} + +/* + * context is unloaded for this command. Interrupts are enabled + */ +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_setdesc *req = arg; + struct pfm_event_set *set; + u16 set_id; + int i; + + /* delete operation only works when context is detached */ + BUG_ON(ctx->state != PFM_CTX_UNLOADED); + + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + /* + * cannot remove set 0 + */ + if (!set_id) + goto error; + + list_for_each_entry(set, &ctx->list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } + goto error; +found: + /* + * clear active set if necessary. 
+ * will be updated when context is loaded + */ + if (set == ctx->active_set) + ctx->active_set = NULL; + + list_del(&set->list); + + kmem_cache_free(pfm_set_cachep, set); + + pfm_retflag_set(req->set_flags, 0); + + PFM_DBG("set%u deleted", set_id); + } + return 0; +error: + PFM_DBG("set%u not found or invalid", set_id); + pfm_retflag_set(req->set_flags, PFM_REG_RETFL_EINVAL); + return -EINVAL; +} + +/* + * called from pfm_context_free() to free all sets + */ +void pfm_free_sets(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *tmp; + + list_for_each_entry_safe(set, tmp, &ctx->list, list) { + list_del(&set->list); + kmem_cache_free(pfm_set_cachep, set); + } +} + +int pfm_sets_init(void) +{ + + pfm_set_cachep = kmem_cache_create("pfm_event_set", + sizeof(struct pfm_event_set), + SLAB_HWCACHE_ALIGN, 0, NULL, NULL); + if (pfm_set_cachep == NULL) { + PFM_ERR("cannot initialize event set slab"); + return -ENOMEM; + } + return 0; +} Index: linux-2.6/perfmon/perfmon_syscalls.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_syscalls.c @@ -0,0 +1,1007 @@ +/* + * perfmon_syscalls.c: perfmon2 system call interface + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +/* + * Context locking rules: + * --------------------- + * - any thread with access to the file descriptor of a context can + * potentially issue perfmon calls + * + * - calls must be serialized to guarantee correctness + * + * - as soon as a context is attached to a thread or CPU, it may be + * actively monitoring. On some architectures, such as IA-64, this + * is true even though the pfm_start() call has not been made. This + * comes from the fact that on some architectures, it is possible to + * start/stop monitoring from userland. + * + * - If monitoring is active, then there can PMU interrupts. Because + * context accesses must be serialized, the perfmon system calls + * must mask interrupts as soon as the context is attached. + * + * - perfmon system calls that operate with the context unloaded cannot + * assume it is actually unloaded when they are called. They first need + * to check and for that they need interrupts masked. 
Then if the context + * is actually unloaded, they can unmask interrupts. + * + * - interrupt masking holds true for other internal perfmon functions as + * well. Except for PMU interrupt handler because those interrupts cannot + * be nested. + * + * - we mask ALL interrupts instead of just the PMU interrupt because we + * also need to protect against timer interrupts which could trigger + * a set switch. + */ + +/* + * cannot attach if : + * - kernel task + * - task not owned by caller (checked by ptrace_may_attach()) + * - task is dead or zombie + * - cannot use blocking notification when self-monitoring + */ +static int pfm_task_incompatible(struct pfm_context *ctx, struct task_struct *task) +{ + /* + * cannot attach to a kernel thread + */ + if (!task->mm) { + PFM_DBG("cannot attach to kernel thread [%d]", task->pid); + return -EPERM; + } + + /* + * cannot use block on notification when + * self-monitoring. + */ + if (ctx->flags.block && task == current) { + PFM_DBG("cannot use block on notification when self-monitoring" + "[%d]", task->pid); + return -EINVAL; + } + /* + * cannot attach to a zombie task + */ + if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) { + PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid); + return -EBUSY; + } + return 0; +} + +/* + * This function is used in per-thread mode only AND when not + * self-monitoring. It finds the task to monitor and checks + * that the caller has persmissions to attach. It also checks + * that the task is stopped via ptrace so that we can safely + * modify its state. + * + * task refcount is increment when succesful. + * This function is not declared static because it is used by the + * IA-64 compatiblity module arch/ia64/perfmon/perfmon_comapt.c + */ +int pfm_get_task(struct pfm_context *ctx, pid_t pid, struct task_struct **task) +{ + struct task_struct *p; + int ret = 0, ret1 = 0; + + /* + * When attaching to another thread we must ensure + * that the thread is actually stopped. Just like with + * perfmon system calls, we enforce that the thread + * be ptraced and STOPPED by using ptrace_check_attach(). + * + * As a consequence, only the ptracing parent can actually + * attach a context to a thread. Obviously, this constraint + * does not exist for self-monitoring threads. + * + * We use ptrace_may_attach() to check for permission. + * No permission checking is needed for self monitoring. 
+ */ + read_lock(&tasklist_lock); + + p = find_task_by_pid(pid); + if (p) + get_task_struct(p); + + read_unlock(&tasklist_lock); + + if (p == NULL) + return -ESRCH; + + ret = -EPERM; + + /* + * returns 0 if cannot attach + */ + ret1 = ptrace_may_attach(p); + if (ret1) + ret = ptrace_check_attach(p, 0); + + PFM_DBG("may_attach=%d check_attach=%d", ret1, ret); + + if (ret || !ret1) + goto error; + + ret = pfm_task_incompatible(ctx, p); + if (ret) + goto error; + + *task = p; + + return 0; +error: + if (!(ret1 || ret)) + ret = -EPERM; + + put_task_struct(p); + + return ret; +} + +/* + * context must be locked when calling this function + */ +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags) +{ + struct task_struct *task; + unsigned long local_flags, new_flags; + int state, ret; + +recheck: + /* + * task is NULL for system-wide context + */ + task = ctx->task; + state = ctx->state; + local_flags = *flags; + + PFM_DBG("state=%d check_mask=0x%x", state, check_mask); + /* + * if the context is detached, then we do not touch + * hardware, therefore there is not restriction on when we can + * access it. + */ + if (state == PFM_CTX_UNLOADED) + return 0; + /* + * no command can operate on a zombie context. + * A context becomes zombie when the file that identifies + * it is closed while the context is still attached to the + * thread it monitors. + */ + if (state == PFM_CTX_ZOMBIE) + return -EINVAL; + + /* + * at this point, state is PFM_CTX_LOADED or PFM_CTX_MASKED + */ + + /* + * some commands require the context to be unloaded to operate + */ + if (check_mask & PFM_CMD_UNLOADED) { + PFM_DBG("state=%d, cmd needs context unloaded", state); + return -EBUSY; + } + + /* + * self-monitoring always ok. + */ + if (task == current) + return 0; + + /* + * for syswide, the calling thread must be running on the cpu + * the context is bound to. There cannot be preemption as we + * check with interrupts disabled. + */ + if (ctx->flags.system) { + if (ctx->cpu != smp_processor_id()) + return -EBUSY; + return 0; + } + + /* + * at this point, monitoring another thread + */ + + /* + * the pfm_unload_context() command is allowed on masked context + */ + if (state == PFM_CTX_MASKED && !(check_mask & PFM_CMD_UNLOAD)) + return 0; + + /* + * When we operate on another thread, we must wait for it to be + * stopped and completely off any CPU as we need to access the + * PMU state (or machine state). + * + * A thread can be put in the STOPPED state in various ways + * including PTRACE_ATTACH, or when it receives a SIGSTOP signal. + * We enforce that the thread must be ptraced, so it is stopped + * AND it CANNOT wake up while we operate on it because this + * would require an action from the ptracing parent which is the + * thread that is calling this function. + * + * The dependency on ptrace, imposes that only the ptracing + * parent can issue command on a thread. This is unfortunate + * but we do not know of a better way of doing this. 
+ */ + if (check_mask & PFM_CMD_STOPPED) { + + spin_unlock_irqrestore(&ctx->lock, local_flags); + + /* + * check that the thread is ptraced AND STOPPED + */ + ret = ptrace_check_attach(task, 0); + + spin_lock_irqsave(&ctx->lock, new_flags); + + /* + * flags may be different than when we released the lock + */ + *flags = new_flags; + + if (ret) + return ret; + /* + * we must recheck to verify if state has changed + */ + if (ctx->state != state) { + PFM_DBG("old_state=%d new_state=%d", + state, + ctx->state); + goto recheck; + } + } + return 0; +} + +/* + * both req and ptr_free are kmalloc'ed, thus they need a kfree by caller + */ +int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, + void **req, void **ptr_free) +{ + void *addr; + + /* + * check if we can get by with stack buffer + */ + if (sz <= lsz) { + *req = laddr; + *ptr_free = NULL; + return copy_from_user(laddr, ureq, sz) ? -EFAULT : 0; + } + + if (unlikely(sz > pfm_controls.arg_mem_max)) { + PFM_DBG("argument too big %zu max=%zu", + sz, + pfm_controls.arg_mem_max); + return -E2BIG; + } + + addr = kmalloc(sz, GFP_KERNEL); + if (unlikely(addr == NULL)) + return -ENOMEM; + + if (copy_from_user(addr, ureq, sz)) { + kfree(addr); + return -EFAULT; + } + *req = *ptr_free = addr; + + return 0; +} + +/* + * arg is kmalloc'ed, thus it needs a kfree by caller + */ +int pfm_get_smpl_arg(char __user *fmt_uname, void __user *fmt_uarg, size_t usize, void **arg, + struct pfm_smpl_fmt **fmt) +{ + struct pfm_smpl_fmt *f; + char *fmt_name; + void *addr = NULL; + size_t sz; + int ret; + + fmt_name = getname(fmt_uname); + if (!fmt_name) { + PFM_DBG("getname failed"); + return -ENOMEM; + } + + /* + * find fmt and increase refcount + */ + f = pfm_smpl_fmt_get(fmt_name); + + putname(fmt_name); + + if (f == NULL) { + PFM_DBG("buffer format not found"); + return -EINVAL; + } + + /* + * expected format argument size + */ + sz = f->fmt_arg_size; + + /* + * check user size matches expected size + * usize = -1 is for IA-64 backward compatibility + */ + ret = -EINVAL; + if (sz != usize && usize != -1) { + PFM_DBG("invalid arg size %zu, format expects %zu", + usize, sz); + goto error; + } + + if (sz) { + ret = -ENOMEM; + addr = kmalloc(sz, GFP_KERNEL); + if (addr == NULL) + goto error; + + ret = -EFAULT; + if (copy_from_user(addr, fmt_uarg, sz)) + goto error; + } + *arg = addr; + *fmt = f; + return 0; + +error: + kfree(addr); + pfm_smpl_fmt_put(f); + return ret; +} + +/* + * unlike the other perfmon system calls, this one return a file descriptor + * or a value < 0 in case of error, very much like open() or socket() + */ +asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, + char __user *fmt_name, + void __user *fmt_uarg, size_t fmt_size) +{ + struct pfarg_ctx req; + struct pfm_context *new_ctx; + struct pfm_smpl_fmt *fmt = NULL; + void *fmt_arg = NULL; + int ret; + + if (perfmon_disabled) + return -ENOSYS; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + if (fmt_name) { + ret = pfm_get_smpl_arg(fmt_name, fmt_uarg, fmt_size, &fmt_arg, &fmt); + if (ret) + goto abort; + } + + ret = __pfm_create_context(&req, fmt, fmt_arg, PFM_NORMAL, &new_ctx); + + kfree(fmt_arg); +abort: + return ret; +} + +asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmc pmcs[PFM_PMC_STK_ARG]; + struct pfarg_pmc *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= 
PFM_MAX_ARG_COUNT(ureq)) { + PFM_DBG("invalid arg count %d", count); + return -EINVAL; + } + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmcs), pmcs, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_write_pmcs(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + /* + * This function may be on the critical path. + * We want to avoid the branch if unecessary. + */ + if (fptr) + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_write_pmds(ctx, req, count, 0); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (fptr) + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_read_pmds(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (fptr) + kfree(req); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_restart(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed, complete_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if 
(unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, 0, &flags); + if (!ret) + ret = __pfm_restart(ctx, &complete_needed); + + spin_unlock_irqrestore(&ctx->lock, flags); + /* + * In per-thread mode with blocking notification, i.e. + * ctx->flags.blocking=1, we need to defer issuing the + * complete to unblock the blocked monitored thread. + * Otherwise we have a potential deadlock due to a lock + * inversion between the context lock and the task_rq_lock() + * which can happen if one thread is in this call and the other + * (the monitored thread) is in the context switch code. + * + * It is safe to access the context outside the critical section + * because: + * - we are protected by the fget_light(), so the context cannot + * disappear. + * - we are protected against another thread issuing a extraneous + * pfm_restart() because the ctx->flags.can-restart flag has + * already been cleared + * - the restart_complete field is only touched by the context init + * code (happens only once) or by wait_for_completion_interruptible + * in __pfm_handle_work(), so this is already serialized + */ + if (complete_needed) + complete(&ctx->restart_complete); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_stop(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_stop(ctx); + + spin_unlock_irqrestore(&ctx->lock, flags); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_start req; + unsigned long flags; + int ret, fput_needed; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * the one argument is actually optional + */ + if (ureq && copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags); + if (!ret) + ret = __pfm_start(ctx, ureq ? 
&req : NULL); + + spin_unlock_irqrestore(&ctx->lock, flags); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + unsigned long flags; + struct pfarg_load req; + int ret, fput_needed; + + if (copy_from_user(&req, ureq, sizeof(req))) + return -EFAULT; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + task = NULL; + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + /* + * in per-thread mode (not self-monitoring), get a reference + * on task to monitor. This must be done with interrupts enabled + * Upon succesful return, refcount on task is increased. + * + * fget_light() is protecting the context. + */ + if (!ctx->flags.system) { + if (req.load_pid != current->pid) { + ret = pfm_get_task(ctx, req.load_pid, &task); + if (ret) + goto error; + } else + task = current; + } + + /* + * irqsave is required to avoid race in case context is already + * loaded or with switch timeout in the case of self-monitoring + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_load_context(ctx, &req, task); + + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * in per-thread mode (not self-monitoring), we need + * to decrease refcount on task to monitor: + * - load successful: we have a reference to the task in ctx->task + * - load failed : undo the effect of pfm_get_task() + */ + if (task && task != current) + put_task_struct(task); +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_unload_context(int fd) +{ + struct pfm_context *ctx; + struct file *filp; + unsigned long flags; + int ret, fput_needed; + int is_system, can_release = 0; + u32 cpu; + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + is_system = ctx->flags.system; + cpu = ctx->cpu; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, &flags); + if (!ret) + ret = __pfm_unload_context(ctx, &can_release); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (can_release) + pfm_release_session(is_system, cpu); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_create_evtsets(int fd, struct pfarg_setdesc __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setdesc *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * must mask interrupts because we do not know the state of context, + * could be 
attached and we could be getting PMU interrupts. So + * we mask and lock context and we check and possibly relax masking + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_create_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} + +asmlinkage long sys_pfm_getinfo_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setinfo *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * this command operate even when context is loaded, so we need + * to keep interrupts masked to avoid a race with PMU interrupt + * which may switch active set + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, 0, &flags); + if (!ret) + ret = __pfm_getinfo_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + if (ret) + PFM_DBG("failed: errno=%d", -ret); + return ret; +} + +asmlinkage long sys_pfm_delete_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct file *filp; + struct pfarg_setinfo *req; + void *fptr; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * must mask interrupts because we do not know the state of context, + * could be attached and we could be getting PMU interrupts + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags); + if (!ret) + ret = __pfm_delete_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} Index: linux-2.6/perfmon/perfmon_sysfs.c =================================================================== --- /dev/null +++ linux-2.6/perfmon/perfmon_sysfs.c @@ -0,0 +1,937 @@ +/* + * perfmon_sysfs.c: perfmon2 sysfs interface + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. 
+ * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct pfm_attribute { + struct attribute attr; + ssize_t (*show)(void *, char *); + ssize_t (*store)(void *, const char *, size_t); +}; +#define to_attr(n) container_of(n, struct pfm_attribute, attr); + +#define PFM_RO_ATTR(_name) \ +struct pfm_attribute attr_##_name = __ATTR_RO(_name) + +#define PFM_RW_ATTR(_name,_mode,_show,_store) \ +struct pfm_attribute attr_##_name = __ATTR(_name,_mode,_show,_store); + +static int pfm_sysfs_init_done; /* true when pfm_sysfs_init() completed */ + +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +struct pfm_controls pfm_controls = { + .sys_group = PFM_GROUP_PERM_ANY, + .task_group = PFM_GROUP_PERM_ANY, + .arg_mem_max = PAGE_SIZE, + .smpl_buffer_mem_max = ~0, +}; +EXPORT_SYMBOL(pfm_controls); + +DECLARE_PER_CPU(struct pfm_stats, pfm_stats); + +static struct kobject pfm_kernel_kobj, pfm_kernel_fmt_kobj; + +static void pfm_reset_stats(int cpu) +{ + struct pfm_stats *st; + unsigned long flags; + + st = &per_cpu(pfm_stats, cpu); + + local_irq_save(flags); + + /* + * cannot use memset because of kobj member + */ + st->ovfl_intr_replay_count = 0; + st->ovfl_intr_regular_count = 0; + st->ovfl_intr_all_count = 0; + st->ovfl_intr_ns = 0; + st->ovfl_intr_phase1 = 0; + st->ovfl_intr_phase2 = 0; + st->ovfl_intr_phase3 = 0; + st->ovfl_intr_nmi_count = 0; + st->handle_work_count = 0; + st->ovfl_notify_count = 0; + st->reset_pmds_count = 0; + st->pfm_restart_count = 0; + st->ccnt0 = 0; + st->ccnt1 = 0; + st->ccnt2 = 0; + st->ccnt3 = 0; + st->ccnt4 = 0; + st->ccnt5 = 0; + st->ccnt6 = 0; + st->fmt_handler_calls = 0; + st->fmt_handler_ns = 0; + st->set_switch_count = 0; + st->set_switch_ns = 0; + st->ctxsw_count = 0; + st->ctxsw_ns = 0; + st->handle_timeout_count = 0; + + local_irq_restore(flags); +} + +static ssize_t pfm_fmt_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_smpl_fmt *fmt = to_smpl_fmt(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(fmt, buf) : -EIO; +} + +static ssize_t pfm_pmu_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_pmu_config *pmu= to_pmu(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? 
attribute->show(pmu, buf) : -EIO; +} + +static ssize_t pfm_stats_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_stats *st = to_stats(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(st, buf) : -EIO; +} + +static ssize_t pfm_regs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_regmap_desc *reg = to_reg(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(reg, buf) : -EIO; +} + +static ssize_t pfm_stats_attr_store(struct kobject *kobj, + struct attribute *attr, const char *buf, size_t count) +{ + struct pfm_stats *st = to_stats(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->store ? attribute->store(st, buf, count) : -EIO; +} + +static struct sysfs_ops pfm_fmt_sysfs_ops = { + .show = pfm_fmt_attr_show +}; + +static struct sysfs_ops pfm_pmu_sysfs_ops = { + .show = pfm_pmu_attr_show +}; + +static struct sysfs_ops pfm_stats_sysfs_ops = { + .show = pfm_stats_attr_show, + .store = pfm_stats_attr_store +}; + +static struct sysfs_ops pfm_regs_sysfs_ops = { + .show = pfm_regs_attr_show +}; + +static struct kobj_type pfm_fmt_ktype = { + .sysfs_ops = &pfm_fmt_sysfs_ops, +}; + +static struct kobj_type pfm_pmu_ktype = { + .sysfs_ops = &pfm_pmu_sysfs_ops, +}; + +static struct kobj_type pfm_stats_ktype = { + .sysfs_ops = &pfm_stats_sysfs_ops, +}; + +static struct kobj_type pfm_regs_ktype = { + .sysfs_ops = &pfm_regs_sysfs_ops, +}; + +decl_subsys_name(pfm_fmt, pfm_fmt, &pfm_fmt_ktype, NULL); +decl_subsys_name(pfm_pmu, pfm_pmu, &pfm_pmu_ktype, NULL); +decl_subsys_name(pfm_stats, pfm_stats, &pfm_stats_ktype, NULL); +decl_subsys_name(pfm_regs, pfm_regs, &pfm_regs_ktype, NULL); + +static ssize_t version_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); +} + +static ssize_t pmd_max_fast_arg_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", PFM_PMD_STK_ARG); +} + +static ssize_t pmc_max_fast_arg_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", PFM_PMC_STK_ARG); +} + +static ssize_t task_sessions_count_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 0); +} + +static ssize_t sys_sessions_count_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 1); +} + +static ssize_t smpl_buffer_mem_cur_show(void *info, char *buf) +{ + return pfm_sysfs_session_show(buf, PAGE_SIZE, 2); +} + +static ssize_t debug_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug); +} + +static ssize_t debug_store(void *info, const char *buf, size_t sz) +{ + int d, i; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.debug = d; + + if (d == 0) { + for_each_online_cpu(i) { + pfm_reset_stats(i); + } + } + return sz; +} + +static ssize_t debug_ovfl_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug_ovfl); +} + +static ssize_t debug_ovfl_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.debug_ovfl = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t reset_stats_show(void *info, char *buf) +{ + buf[0]='0'; + buf[1]='\0'; + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t reset_stats_store(void *info, const char *buf, size_t count) +{ + int i; + + for_each_online_cpu(i) { + pfm_reset_stats(i); + } + 
return count; +} + +static ssize_t sys_group_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.sys_group); +} + +static ssize_t sys_group_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.sys_group = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t task_group_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group); +} + +static ssize_t task_group_store(void *info, const char *buf, size_t sz) +{ + int d; + + if (sscanf(buf,"%d", &d) != 1) + return -EINVAL; + + pfm_controls.task_group = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t buf_size_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.smpl_buffer_mem_max); +} + +static ssize_t buf_size_store(void *info, const char *buf, size_t sz) +{ + size_t d; + + if (sscanf(buf,"%zu", &d) != 1) + return -EINVAL; + /* + * we impose a page as the minimum + */ + if (d < PAGE_SIZE) + return -EINVAL; + + pfm_controls.smpl_buffer_mem_max = d; + + return strnlen(buf, PAGE_SIZE); +} + +static ssize_t arg_size_show(void *info, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max); +} + +static ssize_t arg_size_store(void *info, const char *buf, size_t sz) +{ + size_t d; + + if (sscanf(buf,"%zu", &d) != 1) + return -EINVAL; + + /* + * we impose a page as the minimum. + * + * This limit may be smaller than the stack buffer + * available and that is fine. + */ + if (d < PAGE_SIZE) + return -EINVAL; + + pfm_controls.arg_mem_max = d; + + return strnlen(buf, PAGE_SIZE); +} + +/* + * /sys/kernel/perfmon attributes + */ +static PFM_RO_ATTR(version); +static PFM_RO_ATTR(task_sessions_count); +static PFM_RO_ATTR(sys_sessions_count); +static PFM_RO_ATTR(smpl_buffer_mem_cur); +static PFM_RO_ATTR(pmd_max_fast_arg); +static PFM_RO_ATTR(pmc_max_fast_arg); + +static PFM_RW_ATTR(debug, 0644, debug_show, debug_store); +static PFM_RW_ATTR(debug_ovfl, 0644, debug_ovfl_show, debug_ovfl_store); +static PFM_RW_ATTR(reset_stats, 0644, reset_stats_show, reset_stats_store); +static PFM_RW_ATTR(sys_group, 0644, sys_group_show, sys_group_store); +static PFM_RW_ATTR(task_group, 0644, task_group_show, task_group_store); +static PFM_RW_ATTR(smpl_buffer_mem_max, 0644, buf_size_show, buf_size_store); +static PFM_RW_ATTR(arg_mem_max, 0644, arg_size_show, arg_size_store); + +static struct attribute *pfm_kernel_attrs[] = { + &attr_version.attr, + &attr_task_sessions_count.attr, + &attr_sys_sessions_count.attr, + &attr_smpl_buffer_mem_cur.attr, + &attr_pmd_max_fast_arg.attr, + &attr_pmc_max_fast_arg.attr, + &attr_debug.attr, + &attr_debug_ovfl.attr, + &attr_reset_stats.attr, + &attr_sys_group.attr, + &attr_task_group.attr, + &attr_smpl_buffer_mem_max.attr, + &attr_arg_mem_max.attr, + NULL +}; + +static struct attribute_group pfm_kernel_attr_group = { + .attrs = pfm_kernel_attrs, +}; + +int __init pfm_init_sysfs(void) +{ + int ret; + int done_fmt = 0, done_pmu = 0, done_stats = 0, done_regs = 0; + int done_kobj_fmt = 0, done_kobj_kernel = 0; + int i, cpu = -1; + + ret = subsystem_register(&pfm_fmt_subsys); + if (ret) { + PFM_INFO("cannot register pfm_fmt_subsys: %d", ret); + goto error; + } + done_fmt = 1; + + ret = subsystem_register(&pfm_pmu_subsys); + if (ret) { + PFM_INFO("cannot register pfm_pmu_subsys: %d", ret); + goto error; + } + done_pmu = 1; + + ret = subsystem_register(&pfm_stats_subsys); + if (ret) { + PFM_INFO("cannot register 
pfm_stats_subsys: %d", ret); + goto error; + } + done_stats = 1; + + ret = subsystem_register(&pfm_regs_subsys); + if (ret) { + PFM_INFO("cannot register pfm_regs_subsys: %d", ret); + goto error; + } + done_regs = 1; + + kobject_init(&pfm_kernel_kobj); + kobject_init(&pfm_kernel_fmt_kobj); + + pfm_kernel_kobj.parent = &kernel_subsys.kobj; + kobject_set_name(&pfm_kernel_kobj, "perfmon"); + + pfm_kernel_fmt_kobj.parent = &pfm_kernel_kobj; + kobject_set_name(&pfm_kernel_fmt_kobj, "formats"); + + ret = kobject_add(&pfm_kernel_kobj); + if (ret) { + PFM_INFO("cannot add kernel object: %d", ret); + goto error; + } + done_kobj_kernel = 1; + + ret = kobject_add(&pfm_kernel_fmt_kobj); + if (ret) { + PFM_INFO("cannot add fmt object: %d", ret); + goto error; + } + done_kobj_fmt = 1; + + ret = sysfs_create_group(&pfm_kernel_kobj, &pfm_kernel_attr_group); + if (ret) { + PFM_INFO("cannot create kernel group"); + goto error; + } + + /* + * must be set before builtin_fmt and + * add_pmu() calls + */ + pfm_sysfs_init_done = 1; + + pfm_sysfs_builtin_fmt_add(); + + if (pfm_pmu_conf) + pfm_sysfs_add_pmu(pfm_pmu_conf); + + for_each_online_cpu(cpu) { + ret = pfm_sysfs_add_cpu(cpu); + if (ret) + goto error; + } + return 0; +error: + if (done_fmt) + subsystem_unregister(&pfm_fmt_subsys); + if (done_pmu) + subsystem_unregister(&pfm_pmu_subsys); + if (done_stats) + subsystem_unregister(&pfm_stats_subsys); + if (done_regs) + subsystem_unregister(&pfm_regs_subsys); + if (done_kobj_kernel) + kobject_del(&pfm_kernel_kobj); + if (done_kobj_fmt) + kobject_del(&pfm_kernel_fmt_kobj); + + for (i=0; i < cpu; i++) + pfm_sysfs_del_cpu(i); + + return ret; +} + +/* + * per-cpu perfmon stats attributes + */ +#define PFM_DECL_STATS_ATTR(name) \ +static ssize_t name##_show(void *info, char *buf) \ +{ \ + struct pfm_stats *st = info;\ + return snprintf(buf, PAGE_SIZE, "%llu\n", \ + (unsigned long long)st->name); \ +} \ +static PFM_RO_ATTR(name) + +PFM_DECL_STATS_ATTR(ovfl_intr_replay_count); +PFM_DECL_STATS_ATTR(ovfl_intr_all_count); +PFM_DECL_STATS_ATTR(ovfl_intr_ns); +PFM_DECL_STATS_ATTR(fmt_handler_calls); +PFM_DECL_STATS_ATTR(fmt_handler_ns); +PFM_DECL_STATS_ATTR(set_switch_count); +PFM_DECL_STATS_ATTR(set_switch_ns); +PFM_DECL_STATS_ATTR(ctxsw_count); +PFM_DECL_STATS_ATTR(ctxsw_ns); +PFM_DECL_STATS_ATTR(handle_timeout_count); +PFM_DECL_STATS_ATTR(ovfl_intr_nmi_count); +PFM_DECL_STATS_ATTR(handle_work_count); +PFM_DECL_STATS_ATTR(ovfl_notify_count); +PFM_DECL_STATS_ATTR(reset_pmds_count); +PFM_DECL_STATS_ATTR(pfm_restart_count); +PFM_DECL_STATS_ATTR(ccnt0); +PFM_DECL_STATS_ATTR(ccnt1); +PFM_DECL_STATS_ATTR(ccnt2); +PFM_DECL_STATS_ATTR(ccnt3); +PFM_DECL_STATS_ATTR(ccnt4); +PFM_DECL_STATS_ATTR(ccnt5); +PFM_DECL_STATS_ATTR(ccnt6); + +/* + * per-reg attributes + */ +static ssize_t name_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc); +} +static PFM_RO_ATTR(name); + +static ssize_t dfl_val_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->dfl_val); +} +static PFM_RO_ATTR(dfl_val); + +static ssize_t rsvd_msk_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->rsvd_msk); +} +static PFM_RO_ATTR(rsvd_msk); + +static ssize_t width_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + int w; + + w = (reg->type & PFM_REG_C64) ? 
pfm_pmu_conf->counter_width : 64; + + return snprintf(buf, PAGE_SIZE, "%d\n", w); +} +static PFM_RO_ATTR(width); + + +static ssize_t addr_show(void *info, char *buf) +{ + struct pfm_regmap_desc *reg = info; + return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr); +} +static PFM_RO_ATTR(addr); + +static ssize_t ovfl_intr_spurious_count_show(void *info, char *buf) +{ + struct pfm_stats *st = info; + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(st->ovfl_intr_all_count + - st->ovfl_intr_regular_count)); +} + +static ssize_t ovfl_intr_regular_count_show(void *info, char *buf) +{ + struct pfm_stats *st = info; + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(st->ovfl_intr_regular_count + - st->ovfl_intr_replay_count)); +} + +static PFM_RO_ATTR(ovfl_intr_spurious_count); +static PFM_RO_ATTR(ovfl_intr_regular_count); + +static struct attribute *pfm_cpu_attrs[] = { + &attr_ovfl_intr_spurious_count.attr, + &attr_ovfl_intr_replay_count.attr, + &attr_ovfl_intr_regular_count.attr, + &attr_ovfl_intr_all_count.attr, + &attr_ovfl_intr_ns.attr, + &attr_handle_work_count.attr, + &attr_ovfl_notify_count.attr, + &attr_reset_pmds_count.attr, + &attr_pfm_restart_count.attr, + &attr_ccnt0.attr, + &attr_ccnt1.attr, + &attr_ccnt2.attr, + &attr_ccnt3.attr, + &attr_ccnt4.attr, + &attr_ccnt5.attr, + &attr_ccnt6.attr, + &attr_fmt_handler_calls.attr, + &attr_fmt_handler_ns.attr, + &attr_set_switch_count.attr, + &attr_set_switch_ns.attr, + &attr_ctxsw_count.attr, + &attr_ctxsw_ns.attr, + &attr_handle_timeout_count.attr, + &attr_ovfl_intr_nmi_count.attr, + NULL +}; + +static struct attribute_group pfm_cpu_attr_group = { + .attrs = pfm_cpu_attrs, +}; + +int pfm_sysfs_add_cpu(int mycpu) +{ + struct sys_device *cpudev; + struct pfm_stats *st; + int ret; + + cpudev = get_cpu_sysdev(mycpu); + if (!cpudev) + return -EINVAL; + + st = &per_cpu(pfm_stats, mycpu); + + kobject_init(&st->kobj); + + st->kobj.parent = &cpudev->kobj; + kobject_set_name(&st->kobj, "perfmon"); + kobj_set_kset_s(st, pfm_stats_subsys); + + ret = kobject_add(&st->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&st->kobj, &pfm_cpu_attr_group); + if (ret) + kobject_del(&st->kobj); + + pfm_reset_stats(mycpu); + + return ret; +} + +void pfm_sysfs_del_cpu(int mycpu) +{ + struct sys_device *cpudev; + struct pfm_stats *st; + + cpudev = get_cpu_sysdev(mycpu); + if (!cpudev) + return; + + st = &per_cpu(pfm_stats, mycpu); + kobject_del(&st->kobj); + + sysfs_remove_group(&st->kobj, &pfm_cpu_attr_group); +} + +static ssize_t smpl_version_show(void *data, char *buf) +{ + struct pfm_smpl_fmt *fmt = data; + + return snprintf(buf, PAGE_SIZE, "%u.%u", + fmt->fmt_version >>16 & 0xffff, + fmt->fmt_version & 0xffff); +} +PFM_RO_ATTR(smpl_version); + +static ssize_t smpl_argsize_show(void *data, char *buf) +{ + struct pfm_smpl_fmt *fmt = data; + + return snprintf(buf, PAGE_SIZE, "%zu", fmt->fmt_arg_size); +} +PFM_RO_ATTR(smpl_argsize); + +static struct attribute *pfm_fmt_attrs[] = { + &attr_smpl_version.attr, + &attr_smpl_argsize.attr, + NULL +}; + +static struct attribute_group pfm_fmt_attr_group = { + .attrs = pfm_fmt_attrs, +}; + + +/* + * when a sampling format module is inserted, we populate + * sysfs with some information + */ +int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt) +{ + int ret; + + if (pfm_sysfs_init_done == 0) + return 0; + + kobject_init(&fmt->kobj); + kobject_set_name(&fmt->kobj, fmt->fmt_name); + kobj_set_kset_s(fmt, pfm_fmt_subsys); + fmt->kobj.parent = &pfm_kernel_fmt_kobj; + + ret = 
kobject_add(&fmt->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&fmt->kobj, &pfm_fmt_attr_group); + if (ret) + kobject_del(&fmt->kobj); + + return ret; +} + +/* + * when a sampling format module is removed, its information + * must also be removed from sysfs + */ +int pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt) +{ + if (pfm_sysfs_init_done == 0) + return 0; + + sysfs_remove_group(&fmt->kobj, &pfm_fmt_attr_group); + kobject_del(&fmt->kobj); + + return 0; +} + +static struct attribute *pfm_reg_attrs[] = { + &attr_name.attr, + &attr_dfl_val.attr, + &attr_rsvd_msk.attr, + &attr_width.attr, + &attr_addr.attr, + NULL +}; + +static struct attribute_group pfm_reg_attr_group = { + .attrs = pfm_reg_attrs, +}; + +static ssize_t model_show(void *info, char *buf) +{ + struct pfm_pmu_config *p = info; + return snprintf(buf, PAGE_SIZE, "%s\n", p->pmu_name); +} +static PFM_RO_ATTR(model); + +static struct attribute *pfm_pmu_desc_attrs[] = { + &attr_model.attr, + NULL +}; + +static struct attribute_group pfm_pmu_desc_attr_group = { + .attrs = pfm_pmu_desc_attrs, +}; + +static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i, k; + int ret; + char reg_name[8]; + + reg = pmu->pmc_desc; + for(i=0; i < pmu->num_pmc_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + kobject_init(®->kobj); + + reg->kobj.parent = &pmu->kobj; + snprintf(reg_name, sizeof(reg_name), "pmc%u", i); + kobject_set_name(®->kobj, reg_name); + kobj_set_kset_s(reg, pfm_regs_subsys); + + ret = kobject_add(®->kobj); + if (ret) + goto undo_pmcs; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmcs; + } + } + + reg = pmu->pmd_desc; + for(i=0; i < pmu->num_pmd_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + kobject_init(®->kobj); + + reg->kobj.parent = &pmu->kobj; + snprintf(reg_name, sizeof(reg_name), "pmd%u", i); + kobject_set_name(®->kobj, reg_name); + kobj_set_kset_s(reg, pfm_regs_subsys); + + ret = kobject_add(®->kobj); + if (ret) + goto undo_pmds; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmds; + } + } + return 0; +undo_pmds: + reg = pmu->pmd_desc; + for(k = 0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + i = pmu->num_pmc_entries; + /* fall through */ +undo_pmcs: + reg = pmu->pmc_desc; + for(k=0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return ret; +} + +static int pfm_sysfs_del_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i; + + reg = pmu->pmc_desc; + for(i=0; i < pmu->regs.max_pmc; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + + reg = pmu->pmd_desc; + for(i=0; i < pmu->regs.max_pmd; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return 0; +} + +/* + * when a PMU description module is inserted, we create + * a pmu_desc subdir in sysfs and we populate it with + * PMU specific information, such as register mappings + */ +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu) +{ + int ret; + + if (pfm_sysfs_init_done == 0) + return 0; + + kobject_init(&pmu->kobj); + 
kobject_set_name(&pmu->kobj, "pmu_desc"); + kobj_set_kset_s(pmu, pfm_pmu_subsys); + pmu->kobj.parent = &pfm_kernel_kobj; + + ret = kobject_add(&pmu->kobj); + if (ret) + return ret; + + ret = sysfs_create_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + if (ret) + kobject_del(&pmu->kobj); + + ret = pfm_sysfs_add_pmu_regs(pmu); + if (ret) { + sysfs_remove_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + kobject_del(&pmu->kobj); + } + return ret; +} + +/* + * when a PMU description module is removed, we also remove + * all its information from sysfs, i.e., the pmu_desc subdir + * disappears + */ +int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu) +{ + if (pfm_sysfs_init_done == 0) + return 0; + + pfm_sysfs_del_pmu_regs(pmu); + sysfs_remove_group(&pmu->kobj, &pfm_pmu_desc_attr_group); + kobject_del(&pmu->kobj); + + return 0; +}
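
For reference, below is a minimal user-level sketch of the call sequence that the handlers in perfmon_syscalls.c above implement: create a context (which returns a file descriptor), program PMC/PMD registers, attach the context, start and stop monitoring, then read the counters back. The pfm_*() wrappers, the <perfmon/perfmon.h> header, and the pfarg_* field names (reg_num, reg_value, load_pid) are assumptions made for illustration; real applications obtain them from libpfm and the architecture's syscall stubs, they are not part of this patch.

/*
 * Hypothetical self-monitoring example for the perfmon2 syscall flow.
 * Wrapper names, header path and pfarg_* field names are assumptions;
 * only the syscall ordering mirrors the kernel code in this patch.
 */
#include <stdio.h>
#include <unistd.h>
#include <perfmon/perfmon.h>	/* assumed header exporting pfarg_* and pfm_*() */

int main(void)
{
	struct pfarg_ctx  ctx  = { 0 };
	struct pfarg_pmc  pmc  = { .reg_num = 0, .reg_value = 0 };	/* event encoding */
	struct pfarg_pmd  pmd  = { .reg_num = 0, .reg_value = 0 };	/* counter reset value */
	struct pfarg_load load = { .load_pid = getpid() };		/* self-monitoring */
	int fd;

	/* like open()/socket(), returns a file descriptor or a negative error */
	fd = pfm_create_context(&ctx, NULL, NULL, 0);
	if (fd < 0)
		return 1;

	pfm_write_pmcs(fd, &pmc, 1);	/* program the config register while detached */
	pfm_write_pmds(fd, &pmd, 1);	/* initialize the matching counter */
	pfm_load_context(fd, &load);	/* attach the context to this thread */
	pfm_start(fd, NULL);		/* the pfarg_start argument is optional */

	/* ... workload to measure ... */

	pfm_stop(fd);
	pfm_read_pmds(fd, &pmd, 1);
	printf("count=%llu\n", (unsigned long long)pmd.reg_value);

	pfm_unload_context(fd);
	close(fd);
	return 0;
}

Because the context is self-monitored here, pfm_check_task_state() in the kernel accepts every command (task == current), so no ptrace attachment is needed; monitoring another thread would additionally require the caller to be its ptracing parent with the target stopped, as described in the locking-rules comment above.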