SLUB: Place kmem_cache_cpu structures in a NUMA aware way.

The kmem_cache_cpu structures introduced earlier are currently kept as an
array in the kmem_cache struct. This means that on a NUMA system most
kmem_cache_cpu structures end up on the wrong node.

In order to place the kmem_cache_cpu structures optimally we put an array
of pointers to kmem_cache_cpu structs in kmem_cache (similar to SLAB).

The kmem_cache_cpu structures then have to be allocated in an intelligent
way. kmalloc_node is easy to use for them once we have bootstrapped the
kmalloc array. However, during the buildup of the kmalloc cache array we
have no such capability. In order to bootstrap we define per cpu
kmem_cache_cpu structures for each kmalloc size that is supported. We
establish pointers to these kmem_cache_cpu structures (unlike SLAB we
never free the boot structures, since per cpu structures are already
optimally placed). That solves the bootstrap problem.

For the per cpu array we pack the kmem_cache_cpu structures as densely as
possible. There is no danger of cacheline bouncing since they all belong
to the same cpu, so we can fit lots of them into a single cacheline. On
32 bit a kmem_cache_cpu structure is 12 bytes, meaning that 5 fit into one
64 byte cacheline. On 64 bit it grows to 24 bytes, which still allows
almost 3 per cacheline. If we allocate a kmem_cache_cpu structure via
kmalloc, however, there is a danger of cacheline bouncing, so we pad it to
full cacheline size.

Pro:
- NUMA aware placement.
- All global structures become readonly.
- Potential avoidance of an exclusive cacheline fetch since multiple
  kmem_cache_cpu structures are in one cacheline.

Cons:
- Additional reference to one read only cacheline in both slab_alloc
  and slab_free.

Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    9 ++++++---
 mm/slub.c                |   44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 5 deletions(-)
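To illustrate the bootstrap scheme described above, here is a minimal
stand-alone user-space sketch (not part of the patch; NR_CPUS,
KMALLOC_CACHES and the mock_* names are simplified stand-ins for the real
kernel definitions). Each kmalloc cache points its per cpu slot at a
statically reserved boot structure, indexed by the cache's offset in the
kmalloc array, so no allocator is needed while the kmalloc caches are
being set up:

/*
 * User-space model of the bootstrap only; NOT kernel code.
 */
#include <stdio.h>

#define NR_CPUS		4
#define KMALLOC_CACHES	13		/* stand-in for KMALLOC_SHIFT_HIGH + 1 */

struct mock_cpu_slab {			/* stand-in for struct kmem_cache_cpu */
	void *page;
	int objects;
	int node;
};

struct mock_kmem_cache {		/* stand-in for struct kmem_cache (NUMA case) */
	struct mock_cpu_slab *cpu_slab[NR_CPUS];
};

/* Boot structures: one per cpu and kmalloc size, densely packed per cpu. */
static struct mock_cpu_slab boot_cpu[NR_CPUS][KMALLOC_CACHES];

static struct mock_kmem_cache kmalloc_caches[KMALLOC_CACHES];

int main(void)
{
	int i, cpu;

	for (i = 0; i < KMALLOC_CACHES; i++) {
		struct mock_kmem_cache *s = &kmalloc_caches[i];

		/* Index the reserved slot by the cache's offset in the array. */
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			s->cpu_slab[cpu] = &boot_cpu[cpu][s - kmalloc_caches];
	}

	printf("%zu byte structure, %zu per 64 byte cacheline\n",
	       sizeof(struct mock_cpu_slab),
	       (size_t)64 / sizeof(struct mock_cpu_slab));
	return 0;
}

Running it simply prints how many of the mock structures share one 64 byte
line, which is the density the boot-time per cpu array relies on.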
Index: linux-2.6.22-rc4-mm2/include/linux/slub_def.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/slub_def.h	2007-06-15 18:47:30.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/slub_def.h	2007-06-15 19:13:03.000000000 -0700
@@ -16,8 +16,7 @@ struct kmem_cache_cpu {
 	struct page *page;
 	int objects;		/* Saved page->inuse */
 	int node;
-	/* Lots of wasted space */
-} ____cacheline_aligned_in_smp;
+};
 
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
@@ -55,10 +54,14 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 
 #ifdef CONFIG_NUMA
+#define KMEM_CACHE_CPU_SIZE sizeof(struct kmem_cache_cpu *)
 	int defrag_ratio;
 	struct kmem_cache_node *node[MAX_NUMNODES];
-#endif
+	struct kmem_cache_cpu *cpu_slab[NR_CPUS];
+#else
+#define KMEM_CACHE_CPU_SIZE sizeof(struct kmem_cache_cpu)
 	struct kmem_cache_cpu cpu_slab[NR_CPUS];
+#endif
 };
 
 /*
Index: linux-2.6.22-rc4-mm2/mm/slub.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/slub.c	2007-06-15 18:47:30.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/slub.c	2007-06-15 19:11:37.000000000 -0700
@@ -284,8 +284,11 @@ static inline struct kmem_cache_node *ge
 
 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
 {
-	/* For SMP its simple */
+#ifdef CONFIG_NUMA
+	return s->cpu_slab[cpu];
+#else
 	return &s->cpu_slab[cpu];
+#endif
 }
 
 static inline int check_valid_pointer(struct kmem_cache *s,
@@ -1909,6 +1912,8 @@ static void init_kmem_cache_node(struct
 }
 
 #ifdef CONFIG_NUMA
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_cpu)[KMALLOC_SHIFT_HIGH + 1];
+
 /*
  * No kmalloc_node yet so do it by hand. We know that this is the first
  * slab on the node for this slabcache. There are no concurrent accesses
@@ -1945,9 +1950,19 @@ static struct kmem_cache_node * __init e
 static void free_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+		s->cpu_slab[cpu] = NULL;
+		if (c)
+			kfree(c);
+	}
 
 	for_each_online_node(node) {
 		struct kmem_cache_node *n = s->node[node];
+
 		if (n && n != &s->local_node)
 			kmem_cache_free(kmalloc_caches, n);
 		s->node[node] = NULL;
@@ -1957,6 +1972,7 @@ static void free_kmem_cache_nodes(struct
 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
 {
 	int node;
+	int cpu;
 	int local_node;
 
 	if (slab_state >= UP)
@@ -1987,6 +2003,30 @@ static int init_kmem_cache_nodes(struct
 		s->node[node] = n;
 		init_kmem_cache_node(n);
 	}
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c;
+
+		if (slab_state < UP)
+			/*
+			 * Special arrangements for the kmalloc caches so
+			 * that per cpu structures of one processor are
+			 * likely to fall into the same cacheline.
+			 */
+			c = &per_cpu(kmalloc_cpu, cpu)[s - kmalloc_caches];
+		else {
+			/* Each kmem_cache_cpu gets its own cacheline. */
+			c = kmalloc_node(ALIGN(sizeof(struct kmem_cache_cpu),
+				cache_line_size()), gfpflags, cpu_to_node(cpu));
+
+			if (!c) {
+				free_kmem_cache_nodes(s);
+				return 0;
+			}
+		}
+		memset(c, 0, sizeof(struct kmem_cache_cpu));
+		s->cpu_slab[cpu] = c;
+	}
 	return 1;
 }
 #else
@@ -2619,7 +2659,7 @@ void __init kmem_cache_init(void)
 #endif
 
 	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
-				nr_cpu_ids * sizeof(struct kmem_cache_cpu);
+				nr_cpu_ids * KMEM_CACHE_CPU_SIZE;
 
 	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d,"
 		" MinObjects=%d, CPUs=%d, Nodes=%d\n",
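For the padded allocation path in init_kmem_cache_nodes, here is a small
user-space sketch of the arithmetic (again not kernel code; ALIGN mirrors
the kernel macro, 64 is an assumed cache_line_size(), and aligned_alloc
merely stands in for the node-local kmalloc): once kmalloc works, each
kmem_cache_cpu is rounded up to a whole cacheline so that structures of
different cpus never share a line, whereas the static boot structures
deliberately do.

/*
 * User-space illustration of the padding only; NOT kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHELINE	64
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

struct mock_cpu_slab {		/* stand-in for struct kmem_cache_cpu */
	void *page;
	int objects;
	int node;
};

int main(void)
{
	/* Round the size up so two cpus never share a cacheline. */
	size_t padded = ALIGN(sizeof(struct mock_cpu_slab), CACHELINE);
	struct mock_cpu_slab *c = aligned_alloc(CACHELINE, padded);

	if (!c)
		return 1;
	printf("raw size %zu bytes, padded allocation %zu bytes\n",
	       sizeof(*c), padded);
	free(c);
	return 0;
}

The padding trades a little memory per cpu for the guarantee that a write
to one cpu's structure cannot invalidate the cacheline holding another's.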