Index: linux-2.6.19-mm1/include/linux/slub_def.h
===================================================================
--- linux-2.6.19-mm1.orig/include/linux/slub_def.h	2006-12-12 15:34:24.000000000 -0800
+++ linux-2.6.19-mm1/include/linux/slub_def.h	2006-12-12 15:54:39.000000000 -0800
@@ -11,18 +11,29 @@
 #include
 
 /*
+ * Per cpu structure to manage active slabs.
+ */
+struct active_slab {
+	struct page *page;
+	struct kmem_cache *slab;
+	int referenced;
+#ifdef CONFIG_SMP
+	int flush_active;
+	struct delayed_work flush;
+#endif
+} ____cacheline_aligned_in_smp;
+/*
  * Slab cache management.
  */
 struct kmem_cache {
 	spinlock_t list_lock;	/* Protecty partial list and nr_partial */
 	struct list_head partial;
 	unsigned long nr_partial;
-	int offset;		/* Free pointer offset. */
-	struct page *active[NR_CPUS];
 	atomic_long_t nr_slabs;	/* Total slabs used */
+	int offset;		/* Free pointer offset. */
+	int size;		/* Total size of an object */
 	unsigned int order;	/* Size of the slab page */
 	unsigned long flags;
-	int size;		/* Total size of an object */
 	int objects;		/* Number of objects in slab */
 	atomic_t refcount;	/* Refcount for destroy */
 	int align;
@@ -33,10 +44,10 @@
 	int inuse;		/* Used portion of the chunk */
 	const char *name;	/* Name (only for display!) */
 	struct list_head list;	/* List of slabs */
-#ifdef CONFIG_SMP
-	struct mutex flushing;
-	atomic_t active_cpus;	/* if >0 then flusher is scheduled */
-	struct delayed_work flush;
+#ifdef CONFIG_NUMA
+	struct active_slab *active[NR_CPUS];
+#else
+	struct active_slab active[NR_CPUS] ____cacheline_aligned_in_smp;
 #endif
 };
 
Index: linux-2.6.19-mm1/mm/slub.c
===================================================================
--- linux-2.6.19-mm1.orig/mm/slub.c	2006-12-12 15:34:28.000000000 -0800
+++ linux-2.6.19-mm1/mm/slub.c	2006-12-12 15:50:37.000000000 -0800
@@ -60,6 +60,12 @@
 static void register_slab(struct kmem_cache *s);
 static void unregister_slab(struct kmem_cache *s);
 
+#ifdef CONFIG_NUMA
+#define ACTIVE_SLAB(__s,__cpu) ((__s)->active[__cpu])
+#else
+#define ACTIVE_SLAB(__s,__cpu) (&(__s)->active[__cpu])
+#endif
+
 /********************************************************************
  *			Core slab cache functions
  *******************************************************************/
@@ -450,14 +456,15 @@
 /*
  * Remove the currently active slab
  */
-static void __always_inline deactivate_slab(struct kmem_cache *s,
-					struct page *page, int cpu)
+static void __always_inline deactivate_slab(struct active_slab *a)
 {
-	s->active[cpu] = NULL;
+	struct page *page = a->page;
+
+	a->page = NULL;
+	a->referenced = 0;
 	__ClearPageActive(page);
-	__ClearPageReferenced(page);
-	putback_slab(s, page);
+	putback_slab(a->slab, page);
 }
 
 /*
@@ -467,13 +474,12 @@
 static void flush_active(void *d)
 {
 	struct kmem_cache *s = d;
-	int cpu = smp_processor_id();
-	struct page *page = s->active[cpu];
+	struct active_slab *a = ACTIVE_SLAB(s, smp_processor_id());
 
-	page = s->active[cpu];
-	if (likely(page)) {
-		slab_lock(page);
-		deactivate_slab(s, page, cpu);
+	if (likely(a->page)) {
+		slab_lock(a->page);
+		deactivate_slab(a);
+		a->flush_active = 0;
 	}
 }
 
@@ -481,50 +487,31 @@
 /*
  * Called from IPI during flushing to check and flush active slabs.
  */
-void check_flush_active(void *d)
+void check_flush_active(struct work_struct *w)
 {
-	struct kmem_cache *s = d;
-	int cpu = smp_processor_id();
-	struct page *page = s->active[cpu];
+	struct active_slab *a = container_of(w, struct active_slab, flush.work);
 
-	if (!page)
+	if (!a->page)
 		return;
 
-	if (PageReferenced(page)) {
-		ClearPageReferenced(page);
-		atomic_inc(&s->active_cpus);
+	if (a->referenced) {
+		a->referenced = 0;
+		a->flush_active = 1;
+		schedule_delayed_work(&a->flush, 2 * HZ);
 	} else {
-		slab_lock(page);
-		deactivate_slab(s, page, cpu);
+		slab_lock(a->page);
+		deactivate_slab(a);
+		a->flush_active = 0;
 	}
 }
 
 /*
  * Called from eventd
  */
-static void flusher(struct work_struct *w)
-{
-	struct kmem_cache *s = container_of(w, struct kmem_cache, flush.work);
-
-	if (!mutex_trylock(&s->flushing))
-		return;
-
-	atomic_set(&s->active_cpus, num_online_cpus());
-	on_each_cpu(check_flush_active, s, 1, 1);
-	if (atomic_read(&s->active_cpus))
-		schedule_delayed_work(&s->flush, 2 * HZ);
-	mutex_unlock(&s->flushing);
-}
-
 static void drain_all(struct kmem_cache *s)
 {
-	if (atomic_read(&s->active_cpus)) {
-		mutex_lock(&s->flushing);
-		cancel_delayed_work(&s->flush);
-		atomic_set(&s->active_cpus, 0);
-		on_each_cpu(flush_active, s, 1, 1);
-		mutex_unlock(&s->flushing);
-	}
+	on_each_cpu(flush_active, s, 1, 1);
 }
 #else
 static void drain_all(struct kmem_cache *s)
@@ -532,7 +519,7 @@
 	unsigned long flags;
 
 	local_irq_save(flags);
-	flush_active(s);
+	flush_active(&s->active[0]);
 	local_irq_restore(flags);
 }
 #endif
@@ -540,36 +527,35 @@
 static __always_inline void *__slab_alloc(struct kmem_cache *s,
 		gfp_t gfpflags, int node)
 {
+	struct active_slab *a;
 	struct page *page;
 	void **object;
 	void *next_object;
 	unsigned long flags;
-	int cpu;
 
 	local_irq_save(flags);
-	cpu = smp_processor_id();
-	page = s->active[cpu];
-	if (!page)
+	a = ACTIVE_SLAB(s, smp_processor_id());
+	if (!a->page)
 		goto new_slab;
 
-	slab_lock(page);
-	check_free_chain(s, page);
-	if (unlikely(!page->freelist))
+	slab_lock(a->page);
+	check_free_chain(s, a->page);
+	if (unlikely(!a->page->freelist))
 		goto another_slab;
 
-	if (unlikely(node != -1 && page_to_nid(page) != node))
+	if (unlikely(node != -1 && page_to_nid(a->page) != node))
 		goto another_slab;
 redo:
-	page->inuse++;
-	object = page->freelist;
-	page->freelist = next_object = object[page->offset];
-	__SetPageReferenced(page);
-	slab_unlock(page);
+	a->page->inuse++;
+	object = a->page->freelist;
+	a->page->freelist = next_object = object[a->page->offset];
+	a->referenced = 1;
+	slab_unlock(a->page);
 	local_irq_restore(flags);
 	return object;
 
 another_slab:
-	deactivate_slab(s, page, cpu);
 
+	deactivate_slab(a);
 new_slab:
 	page = get_partial(s, gfpflags, node);
@@ -588,27 +574,26 @@
 	 */
 	if (unlikely(s->objects == 1)) {
 		local_irq_restore(flags);
-		return page_address(page);
+		return page_address(a->page);
 	}
 
-	slab_lock(page);
+	slab_lock(a->page);
 
 gotpage:
-	if (s->active[cpu]) {
+	if (a->page) {
 		slab_unlock(page);
 		discard_slab(s, page);
-		page = s->active[cpu];
-		slab_lock(page);
+		slab_lock(a->page);
 	} else
-		s->active[cpu] = page;
+		a->page = page;
 
-	__SetPageActive(page);
-	check_free_chain(s, page);
+	__SetPageActive(a->page);
+	check_free_chain(s, a->page);
 
 #ifdef CONFIG_SMP
-	if (keventd_up() && !atomic_read(&s->active_cpus)) {
-		atomic_inc(&s->active_cpus);
-		schedule_delayed_work(&s->flush, 2 * HZ);
+	if (keventd_up() && !a->flush_active) {
+		a->flush_active = 1;
+		schedule_delayed_work(&a->flush, 2 * HZ);
 	}
 #endif
 	goto redo;
@@ -639,7 +624,6 @@
 		return;
 
 	page = virt_to_page(x);
-
 	if (unlikely(PageCompound(page)))
 		page = page->first_page;
 
@@ -822,14 +806,7 @@
 	atomic_long_set(&s->nr_slabs, 0);
 	atomic_set(&s->refcount, 1);
 	spin_lock_init(&s->list_lock);
-	for_each_possible_cpu(cpu)
-		s->active[cpu] = NULL;
 	INIT_LIST_HEAD(&s->partial);
-#ifdef CONFIG_SMP
-	mutex_init(&s->flushing);
-	atomic_set(&s->active_cpus, 0);
-	INIT_DELAYED_WORK(&s->flush, flusher);
-#endif
 	s->name = name;
 	s->ctor = ctor;
 	s->dtor = dtor;
@@ -876,6 +853,23 @@
 	if (!s->objects)
 		goto error;
 
+	for_each_online_cpu(cpu) {
+		struct active_slab *a;
+
+#ifdef CONFIG_NUMA
+		s->active[cpu] = a = kmalloc(sizeof(struct active_slab), GFP_KERNEL);
+#else
+		a = ACTIVE_SLAB(s, cpu);
+#endif
+
+		a->page = NULL;
+		a->slab = s;
+#ifdef CONFIG_SMP
+		a->flush_active = 0;
+		INIT_DELAYED_WORK(&a->flush, check_flush_active);
+#endif
+	}
+
 	register_slab(s);
 
 	return 1;
@@ -1068,6 +1062,11 @@
 		return 1;
 
 	unregister_slab(s);
+
+#ifdef CONFIG_NUMA
+	for_each_cpu(cpu)
+		kfree(ACTIVE_SLAB(s, cpu));
+#endif
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_close);
@@ -1083,7 +1082,6 @@
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 
-
 static unsigned long count_objects(struct kmem_cache *s, struct list_head *list)
 {
 	int count = 0;
@@ -1108,11 +1106,11 @@
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		struct page *page = s->active[cpu];
+		struct active_slab *a = ACTIVE_SLAB(s, cpu);
 
-		if (page) {
+		if (a->page) {
 			nr_active++;
-			active += page->inuse;
+			active += a->page->inuse;
 		}
 	}
 
@@ -1129,6 +1127,10 @@
 		(nr_slabs - s->nr_partial - nr_active) * s->objects;
 }
 
+#ifdef CONFIG_NUMA
+/* logic to bring up per cpu portions is missing here */
+#endif
+
 /********************************************************************
  *			Kmalloc subsystem
  *******************************************************************/
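
For illustration only, not part of the patch: below is a minimal stand-alone
user-space model of the two-phase aging that the new check_flush_active()
implements. A recently used active slab only has its referenced flag cleared
and the delayed work re-armed; a slab that stayed idle for a full interval is
deactivated. The struct and function names here are invented for the example;
only the referenced/flush_active handling mirrors the patch.

/* toy_flush.c - model of the active slab aging, build with: cc toy_flush.c */
#include <stdio.h>
#include <stdbool.h>

struct toy_active_slab {
	bool has_page;		/* stands in for a->page != NULL */
	bool referenced;	/* set on allocation, cleared by the flusher */
	bool flush_active;	/* a flush pass is currently scheduled */
};

/* One pass of the flusher; returns true if another pass must be scheduled. */
static bool toy_check_flush(struct toy_active_slab *a)
{
	if (!a->has_page)
		return false;
	if (a->referenced) {
		/* Used since the last pass: age it and keep the work armed. */
		a->referenced = false;
		a->flush_active = true;
		return true;
	}
	/* Idle for a whole interval: drop the active slab. */
	a->has_page = false;
	a->flush_active = false;
	return false;
}

int main(void)
{
	struct toy_active_slab a = { .has_page = true, .referenced = true };

	/* First pass only ages the slab, the second pass deactivates it. */
	printf("rearm=%d\n", toy_check_flush(&a));	/* rearm=1 */
	printf("rearm=%d\n", toy_check_flush(&a));	/* rearm=0 */
	printf("still active=%d\n", a.has_page);	/* still active=0 */
	return 0;
}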