Generic CPU operations: Core piece

Currently the per cpu subsystem is not able to use the atomic capabilities
of the processors we have.

This patch adds new functionality that allows the optimizing of per cpu
variable handling. In particular it provides a simple way to exploit atomic
operations in order to avoid having to disable interrupts or to add a per
cpu offset. For example, current implementations may do:

	unsigned long flags;
	struct stat_struct *p;

	local_irq_save(flags);
	/* Calculate address of per processor area */
	p = CPU_PTR(stat, smp_processor_id());
	p->counter++;
	local_irq_restore(flags);

This whole segment can be replaced by a single CPU operation:

	CPU_INC(stat->counter);

On most processors it is possible to perform the increment with a single
processor instruction. Processors have segment registers, global registers
and per cpu mappings of per cpu areas for that purpose. The problem is that
the current schemes cannot utilize those features. local_t does not really
address the issue since the offset calculation is not solved, and local_t
is x86 processor specific. The solution here can utilize methods other than
the x86 instruction set.

On x86 the above CPU_INC translates into a single instruction:

	inc %%gs:(&stat->counter)

This instruction is interrupt safe since it either completes in full or not
at all. Determining the correct per cpu area for the current processor does
not require access to smp_processor_id() (expensive...). The gs register
provides a processor specific offset to the respective per cpu area where
the per cpu variable resides.

Note that the counter offset into the struct is added *before* the segment
selector is applied. This is necessary to avoid an extra address
calculation. In the past we first determined the address of the stat
structure on the respective processor and then added the field offset.
However, the offset may just as well be added earlier. If stat was declared
via DECLARE_PER_CPU then this patchset is capable of convincing the linker
to provide the proper base address. In that case no calculations are
necessary. Should the stat structure be reachable via a register then the
address calculation capabilities of the instruction set can be leveraged to
avoid calculations.

On IA64 the same results in another single instruction, using the fact that
we have a virtual address that always maps to the local per cpu area:

	fetchadd &stat->counter + (VCPU_BASE - __per_cpu_base)

The access is forced into the per cpu area reachable via the virtualized
address. Again the counter field offset is added to the offset beforehand.
The access is then, just as on x86, a single instruction.

In order to be able to exploit the atomicity of these instructions we
introduce a series of new functions that take a BASE pointer (a pointer
into the area of cpu 0, which is the canonical base):

	CPU_READ() CPU_WRITE()
	CPU_INC CPU_DEC CPU_ADD CPU_SUB
	CPU_XCHG CPU_CMPXCHG
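For illustration only (this sketch is not part of the patch; the asm bodies
are an assumption about how an x86 override might look, not the actual
implementation in this series), the interrupt safe variants could be
provided on a segment register based architecture roughly as follows:

	/*
	 * Sketch of a possible x86 override: assumes %gs points to the
	 * per cpu area and that var is an int sized per cpu variable
	 * whose address evaluates to the offset within that area.
	 */
	#define CPU_INC(var)	asm volatile("incl %%gs:%0" : "+m" (var))
	#define CPU_DEC(var)	asm volatile("decl %%gs:%0" : "+m" (var))

The single instruction both locates the local instance and updates it, so
no local_irq_save()/local_irq_restore() bracket and no smp_processor_id()
lookup is needed.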
Signed-off-by: Christoph Lameter

---
 include/linux/percpu.h |  187 +++++++++++++++++++++++++++++++++++++++++++++++++
 init/main.c            |    8 ++
 2 files changed, 195 insertions(+)

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2007-11-21 11:54:34.868922857 -0800
+++ linux-2.6/include/linux/percpu.h	2007-11-21 11:56:56.205732647 -0800
@@ -148,7 +148,15 @@ static inline void percpu_free(void *__p
 #define SHIFT_PTR(__p, __offset)	((__typeof__(__p))((void *)(__p) \
 						+ (__offset)))
 
+
+/*
+ * If an architecture wants a virtualized cpu area then it needs
+ * to provide a definition of the virtual address to use in the cpu_area
+ * variable. This is usually set up in percpu.h of the arch.
+ */
+#ifndef CONFIG_CPU_AREA_VIRTUAL
 extern char cpu_area[];
+#endif
 
 static inline unsigned long __cpu_offset(unsigned long cpu)
 {
@@ -175,6 +183,14 @@ static inline unsigned long cpu_offset(u
 						__alignof__(type))
 #define CPU_FREE(pointer)	cpu_free((pointer), sizeof(*(pointer)))
 
+/*
+ * If we have fast cpu operations then we can use those to have fast
+ * access to the address of the current cpu area
+ */
+#if defined(CONFIG_FAST_CPU_OPS) && defined(CONFIG_PERCPU_FOLDED_INTO_CPU_AREA)
+DECLARE_PER_CPU(unsigned long, this_cpu_offset);
+#define THIS_CPU_OFFSET CPU_READ(read_cpu_var(this_cpu_offset))
+#endif
 
 /*
  * An arch may define THIS_CPU_OFFSET if there is a special way to access
@@ -202,4 +218,175 @@ void cpu_free(void *cpu_pointer, unsigne
  */
 void *boot_cpu_alloc(unsigned long size);
 
+/*
+ * Bridge function to be able to use CPU_xxx operations on percpu variables.
+ *
+ * If the per cpu area is folded into the cpu_area then the per cpu variables
+ * start at zero and so the address of the per cpu area can be used directly
+ * as an offset. Otherwise we need to take the address of the per cpu variable
+ * and subtract what the CPU_xxx operation will add later.
+ */
+#ifdef CONFIG_PERCPU_FOLDED_INTO_CPU_AREA
+#define per_cpu_var(name) &per_cpu__##name
+#else
+#define per_cpu_var(name) (*SHIFT_PTR(&__get_cpu_var(name), -__THIS_CPU_OFFSET))
+#endif
+
+/*
+ * Fast Atomic per cpu operations.
+ *
+ * The following operations can be overridden by arches to implement fast
+ * and efficient operations. The operations are atomic meaning that the
+ * determination of the processor, the calculation of the address and the
+ * operation on the data is an atomic operation.
+ */
+
+#ifndef CONFIG_FAST_CPU_OPS
+
+/*
+ * Generic bridge function to calculate an address that is compatible with
+ * CPU_ops given an address that is provided by an arch specific percpu
+ * implementation.
+ *
+ * Note that this operation is not atomic as required by the CPU ops. We
+ * rely on the expression being evaluated when the variable is accessed.
+ * At that time the CPU operation should have done the appropriate thing
+ * to provide the needed atomicity.
+ *
+ * If the arch allocates the per cpu area via cpu_alloc then simply
+ * determining the address of the cpu variable is enough.
+ */
+#ifndef get_cpu_var
+#define get_cpu_var(var) ({ \
+	int cpu = raw_smp_processor_id(); \
+	(*SHIFT_PTR(&per_cpu(var, cpu), -cpu_offset(cpu))); \
+})
+#endif
+
+/*
+ * The fallbacks may be slow but they are safe.
+ *
+ * The first group of macros is used when it is safe to update the per
+ * cpu variable because preemption is off (per cpu variables that are not
+ * updated from interrupt context) or because interrupts are already off.
+ */
+
+#define __CPU_READ(var) \
+({ \
+	(*THIS_CPU(&(var))); \
+})
+
+#define __CPU_WRITE(var, value) \
+({ \
+	*THIS_CPU(&(var)) = (value); \
+})
+
+#define __CPU_ADD(var, value) \
+({ \
+	*THIS_CPU(&(var)) += (value); \
+})
+
+#define __CPU_INC(var) __CPU_ADD((var), 1)
+#define __CPU_DEC(var) __CPU_ADD((var), -1)
+#define __CPU_SUB(var, value) __CPU_ADD((var), -(value))
+
+#define __CPU_CMPXCHG(var, old, new) \
+({ \
+	typeof(var) x; \
+	typeof(var) *p = THIS_CPU(&(var)); \
+	x = *p; \
+	if (x == (old)) \
+		*p = (new); \
+	(x); \
+})
+
+#define __CPU_XCHG(obj, new) \
+({ \
+	typeof(obj) x; \
+	typeof(obj) *p = THIS_CPU(&(obj)); \
+	x = *p; \
+	*p = (new); \
+	(x); \
+})
+
+/*
+ * Second group used for per cpu variables that
+ * are not updated from an interrupt context.
+ * In that case we can simply disable preemption which
+ * may be free if the kernel is compiled without preemption.
+ */
+
+#define _CPU_READ __CPU_READ
+#define _CPU_WRITE __CPU_WRITE
+
+#define _CPU_ADD(var, value) \
+({ \
+	preempt_disable(); \
+	__CPU_ADD((var), (value)); \
+	preempt_enable(); \
+})
+
+#define _CPU_INC(var) _CPU_ADD((var), 1)
+#define _CPU_DEC(var) _CPU_ADD((var), -1)
+#define _CPU_SUB(var, value) _CPU_ADD((var), -(value))
+
+#define _CPU_CMPXCHG(var, old, new) \
+({ \
+	typeof(var) x; \
+	preempt_disable(); \
+	x = __CPU_CMPXCHG((var), (old), (new)); \
+	preempt_enable(); \
+	(x); \
+})
+
+#define _CPU_XCHG(var, new) \
+({ \
+	typeof(var) x; \
+	preempt_disable(); \
+	x = __CPU_XCHG((var), (new)); \
+	preempt_enable(); \
+	(x); \
+})
+
+/*
+ * Interrupt safe CPU functions
+ */
+
+#define CPU_READ __CPU_READ
+#define CPU_WRITE __CPU_WRITE
+
+#define CPU_ADD(var, value) \
+({ \
+	unsigned long flags; \
+	local_irq_save(flags); \
+	__CPU_ADD((var), (value)); \
+	local_irq_restore(flags); \
+})
+
+#define CPU_INC(var) CPU_ADD((var), 1)
+#define CPU_DEC(var) CPU_ADD((var), -1)
+#define CPU_SUB(var, value) CPU_ADD((var), -(value))
+
+#define CPU_CMPXCHG(var, old, new) \
+({ \
+	unsigned long flags; \
+	typeof(var) x; \
+	local_irq_save(flags); \
+	x = __CPU_CMPXCHG((var), (old), (new)); \
+	local_irq_restore(flags); \
+	(x); \
+})
+
+#define CPU_XCHG(var, new) \
+({ \
+	unsigned long flags; \
+	typeof(var) x; \
+	local_irq_save(flags); \
+	x = __CPU_XCHG((var), (new)); \
+	local_irq_restore(flags); \
+	(x); \
+})
+
+#endif /* CONFIG_FAST_CPU_OPS */
+
 #endif /* __LINUX_PERCPU_H */
Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c	2007-11-21 11:57:16.095232649 -0800
+++ linux-2.6/init/main.c	2007-11-21 12:00:02.206732716 -0800
@@ -368,6 +368,11 @@ unsigned long __per_cpu_offset[NR_CPUS]
 
 EXPORT_SYMBOL(__per_cpu_offset);
 
+#if defined(CONFIG_FAST_CPU_OPS) && defined(CONFIG_PERCPU_FOLDED_INTO_CPU_AREA)
+DEFINE_PER_CPU(unsigned long, this_cpu_offset);
+EXPORT_SYMBOL_PER_CPU(this_cpu_offset);
+#endif
+
 static void __init setup_per_cpu_areas(void)
 {
 	unsigned long size, i;
@@ -381,6 +386,9 @@ static void __init setup_per_cpu_areas(v
 	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+#if defined(CONFIG_FAST_CPU_OPS) && defined(CONFIG_PERCPU_FOLDED_INTO_CPU_AREA)
+		per_cpu(this_cpu_offset, i) = (unsigned long)ptr;
+#endif
 		ptr += size;
 	}
 }
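As a usage sketch (not part of the patch; the stats structure, count_event()
and the exact CPU_ALLOC() form are assumptions based on the example in the
description above and on the allocator introduced earlier in this series), a
counter that may be updated from interrupt context could then look like:

	struct stats {
		unsigned long counter;
	};

	/* BASE pointer into the canonical cpu 0 area */
	static struct stats *stats;

	void __init stats_init(void)
	{
		/* assumed CPU_ALLOC() form from the cpu alloc patches */
		stats = CPU_ALLOC(struct stats, GFP_KERNEL | __GFP_ZERO);
	}

	static void count_event(void)
	{
		/* interrupt safe increment of this processor's instance */
		CPU_INC(stats->counter);
	}

On architectures that provide the fast variants this increment needs neither
an smp_processor_id() lookup nor interrupt disabling; the generic fallbacks
above preserve the same semantics everywhere else.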