Simple NUMA slab allocator

The NUMA slab allocator creates a slab cache per node on demand and uses
the slabifier to manage the per-node slab caches.

Signed-off-by: Christoph Lameter

Index: linux-2.6.18-rc4/mm/numa_slab.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.18-rc4/mm/numa_slab.c	2006-08-19 18:17:00.065868237 -0700
@@ -0,0 +1,263 @@
+/*
+ * NUMA slab implementation
+ *
+ * (C) 2006 Silicon Graphics Inc.
+ * Christoph Lameter
+ */
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/allocator.h>
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
+#include <linux/spinlock.h>
+
+#define NUMA_SLAB_DEBUG
+
+//#define TPRINTK printk
+#define TPRINTK(x, ...)
+
+/*
+ * Allocator on which we base this functionality.
+ */
+#define base slabifier_allocator
+
+struct numa_slab {
+	struct slab_cache sc;
+	spinlock_t lock;
+	atomic_t refcount;
+	struct slab_cache *node[MAX_NUMNODES];
+};
+
+static int __numa_slab_destroy(struct numa_slab *n)
+{
+	int node;
+
+	TPRINTK(KERN_CRIT "__numa_slab_destroy(%s)\n", n->sc.name);
+
+	for_each_node(node) {
+		base.free(NULL, n->node[node]);
+		n->node[node] = NULL;
+	}
+	return 0;
+}
+
+static struct slab_cache *bring_up_node(struct numa_slab *n, int node)
+{
+	struct slab_cache *s = n->node[node];
+	struct slab_cache *sc = &n->sc;
+
+	TPRINTK(KERN_CRIT "bring_up_node(%s, %d)\n", n->sc.name, node);
+	if (s)
+		return s;
+
+	spin_lock(&n->lock);
+	s = n->node[node];
+	if (s) {
+		spin_unlock(&n->lock);
+		return s;
+	}
+	s = n->node[node] = base.create(&base, sc->page_alloc, node,
+		sc->name, sc->size, sc->align, sc->order, sc->objsize, sc->inuse,
+		sc->offset);
+
+	spin_unlock(&n->lock);
+	return s;
+}
+
+static struct slab_cache *numa_slab_create
+	(struct slab_control *x,
+	const struct slab_allocator *slab_alloc,
+	const struct page_allocator *page_alloc,
+	const struct slab_cache *s)
+{
+	struct numa_slab *n;
+
+	TPRINTK(KERN_CRIT "numa_slab_create(%s, %s, %s, %d, %d, %d, %d, %d, %d)\n",
+		slab_alloc->name, page_alloc->name, s->name, s->size,
+		s->align, s->order, s->objsize, s->inuse, s->offset);
+
+	n = base.alloc(numa_cache, in_atomic() ? GFP_ATOMIC : GFP_KERNEL);
+	if (!n)
+		return NULL;
+
+	memset(n, 0, sizeof(struct numa_slab));
+	memcpy(&n->sc, s, sizeof(struct slab_cache));
+	spin_lock_init(&n->lock);
+	atomic_set(&n->refcount, 1);
+
+	/*
+	 * Do not bring up a node here. The slabulator may set a
+	 * constructor after the fact.
+	 */
+	return &n->sc;
+}
+
+static void *numa_slab_alloc(struct slab_cache *sc, gfp_t flags);
+
+static void *numa_slab_alloc_node(struct slab_cache *sc, gfp_t flags, int node)
+{
+	struct numa_slab *n = (void *)sc;
+	struct slab_cache *s;
+
+	TPRINTK(KERN_CRIT "numa_slab_alloc_node(%s, %x, %d)\n", sc->name, flags, node);
+
+	if (node < 0)
+		node = numa_node_id();
+
+	s = n->node[node];
+
+	if (unlikely(!s)) {
+		s = bring_up_node(n, node);
+		if (!s)
+			return NULL;
+	}
+	return base.alloc(s, flags);
+}
+
+static void *numa_slab_alloc(struct slab_cache *sc, gfp_t flags)
+{
+	int node = numa_node_id();
+
+	TPRINTK(KERN_CRIT "numa_slab_alloc(%s, %x)\n", sc->name, flags);
+
+	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))
+			&& !in_interrupt()) {
+		if (cpuset_do_slab_mem_spread())
+			node = cpuset_mem_spread_node();
+		else if (current->mempolicy)
+			node = slab_node(current->mempolicy);
+	}
+	return numa_slab_alloc_node(sc, flags, node);
+}
+
+static int numa_slab_destroy(struct slab_cache *sc)
+{
+	struct numa_slab *n = (void *)sc;
+
+	TPRINTK(KERN_CRIT "numa_slab_destroy(%s)\n", sc->name);
+
+	if (!atomic_dec_and_test(&n->refcount))
+		return 0;
+
+	__numa_slab_destroy(n);
+	return 0;
+}
+
+static int numa_slab_pointer_valid(struct slab_cache *sc, const void *object)
+{
+	struct numa_slab *n = (void *)sc;
+	int node;
+
+	TPRINTK(KERN_CRIT "numa_slab_pointer_valid(%s, %p)\n", sc->name, object);
+
+	/* We can deduce from the page allocator which node this is. */
+	node = ((struct numactl *)(sc->page_alloc))->node;
+	return base.valid_pointer(n->node[node], object);
+}
+
+static unsigned long numa_slab_object_size(struct slab_cache *sc,
+	const void *object)
+{
+	struct numa_slab *n = (void *)sc;
+	int node;
+
+	TPRINTK(KERN_CRIT "numa_slab_object_size(%s, %p)\n", sc->name, object);
+
+	/* We can deduce from the page allocator which node this is. */
+	node = ((struct numactl *)(sc->page_alloc))->node;
+	return base.object_size(n->node[node], object);
+}
+
+static void numa_slab_free(struct slab_cache *sc, const void *object)
+{
+	TPRINTK(KERN_CRIT "numa_slab_free(%s, %p)\n", sc ?
+		sc->name : "", object);
+	base.free(NULL, object);
+}
+
+static struct slab_cache *numa_slab_dup(struct slab_cache *sc)
+{
+	struct numa_slab *n = (void *)sc;
+
+	TPRINTK(KERN_CRIT "numa_slab_dup(%s)\n", sc->name);
+
+	atomic_inc(&n->refcount);
+	return sc;
+}
+
+static struct slab_cache *numa_slab_node(struct slab_cache *sc, int node)
+{
+	struct numa_slab *n = (void *)sc;
+	struct slab_cache *s = n->node[node];
+
+	return s;
+}
+
+static int numa_slab_shrink(struct slab_cache *sc,
+	int (*move_object)(struct slab_cache *, void *))
+{
+	struct numa_slab *n = (void *)sc;
+	int node;
+	int count = 0;
+
+	TPRINTK(KERN_CRIT "numa_slab_shrink(%s, %p)\n", sc->name, move_object);
+
+	/*
+	 * FIXME: What you really want to do here is to
+	 * run the shrinking on each node separately.
+	 */
+	spin_lock(&n->lock);
+	for_each_node(node) {
+		struct slab_cache *s = n->node[node];
+
+		if (s)
+			count += base.shrink(s, move_object);
+	}
+	spin_unlock(&n->lock);
+	return count;
+}
+
+static unsigned long numa_slab_objects(struct slab_cache *sc,
+	unsigned long *active, unsigned long *partial)
+{
+	struct numa_slab *n = (void *)sc;
+	int node;
+	unsigned long count = 0;
+	unsigned long count_active = 0;
+	unsigned long count_partial = 0;
+
+	printk(KERN_CRIT "numa_slab_objects(%s)\n", sc->name);
+
+	for_each_node(node) {
+		unsigned long nactive, npartial;
+		struct slab_cache *s = n->node[node];
+
+		if (!s)
+			continue;
+
+		count += base.objects(s, &nactive, &npartial);
+		count_active += nactive;
+		count_partial += npartial;
+	}
+	if (active)
+		*active = count_active;
+	if (partial)
+		*partial = count_partial;
+	return count;
+}
+
+const struct slab_allocator numa_slab_allocator = {
+	.name = "NumaSlab",
+	.create = numa_slab_create,
+	.alloc = numa_slab_alloc,
+	.alloc_node = numa_slab_alloc_node,
+	.free = numa_slab_free,
+	.valid_pointer = numa_slab_pointer_valid,
+	.object_size = numa_slab_object_size,
+	.objects = numa_slab_objects,
+	.shrink = numa_slab_shrink,
+	.dup = numa_slab_dup,
+	.node = numa_slab_node,
+	.destroy = numa_slab_destroy,
+	.destructor = null_slab_allocator_destructor
+};
+EXPORT_SYMBOL(numa_slab_allocator);
Index: linux-2.6.18-rc4/mm/Makefile
===================================================================
--- linux-2.6.18-rc4.orig/mm/Makefile	2006-08-19 16:46:06.586412205 -0700
+++ linux-2.6.18-rc4/mm/Makefile	2006-08-19 18:13:59.716643596 -0700
@@ -25,4 +25,5 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_h
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_MODULAR_SLAB) += allocator.o slabifier.o kmalloc.o slabulator.o
+obj-$(CONFIG_NUMA_SLAB) += numa_slab.o
Index: linux-2.6.18-rc4/init/Kconfig
===================================================================
--- linux-2.6.18-rc4.orig/init/Kconfig	2006-08-19 16:46:06.587388707 -0700
+++ linux-2.6.18-rc4/init/Kconfig	2006-08-19 18:13:31.412725870 -0700
@@ -411,6 +411,11 @@ config SHMEM
 	  option replaces shmem and tmpfs with the much simpler ramfs code,
 	  which may be appropriate on small systems without swap.
 
+config NUMA_SLAB
+	default y
+	bool "NUMA Slab allocator (for lots of memory)"
+	depends on MODULAR_SLAB && NUMA
+
 config VM_EVENT_COUNTERS
 	default y
 	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
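
For readers without the rest of the modular slab series at hand, here is a
brief caller-side sketch (not part of the patch) of how the per-node fan-out
is exercised through the generic slab_allocator operations table. It assumes
a slab_cache "sc" has already been set up on top of numa_slab_allocator by
the slabulator; the helper names grab_object_on_node() and drop_object() are
made up for illustration, and only the .alloc_node(), .alloc() and .free()
hooks added above are used.

static void *grab_object_on_node(struct slab_cache *sc, int nid)
{
	/*
	 * Pull an object from the slabifier cache of node nid; the
	 * per-node cache is created on first use by bring_up_node().
	 * Passing nid < 0 falls back to the local node, while .alloc()
	 * additionally honors cpuset memory spreading and mempolicies.
	 */
	return numa_slab_allocator.alloc_node(sc, GFP_KERNEL, nid);
}

static void drop_object(struct slab_cache *sc, void *object)
{
	/*
	 * numa_slab_free() uses sc only for its debug printout; the
	 * slabifier locates the slab from the object itself.
	 */
	numa_slab_allocator.free(sc, object);
}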