Index: linux-2.6.19-rc1-mm1/mm/slabifier.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/slabifier.c	2006-10-16 18:38:12.387873148 -0700
+++ linux-2.6.19-rc1-mm1/mm/slabifier.c	2006-10-16 20:05:26.549541001 -0700
@@ -27,14 +27,14 @@ struct slab {
 	struct work_struct flush;
 #endif
 	atomic_t refcount;	/* Refcount for destroy */
-	atomic_long_t nr_slabs;	/* Total slabs used */

 	/* Performance critical items follow */
 	int size;		/* Total size of an object */
 	int offset;		/* Free pointer offset. */
 	int objects;		/* Number of objects in slab */
-	spinlock_t list_lock;
-	struct list_head partial;
-	unsigned long nr_partial;
+	int fallback;		/* Last fallback node */
+	atomic_long_t nr_slabs;	/* Total slabs used */
+	atomic_long_t nr_partial;
+	struct page *partial[MAX_NUMNODES];
 	struct page *active[NR_CPUS];
 };
@@ -42,10 +42,6 @@ struct slab {
  * The page struct is used to keep necessary information about a slab.
  * For a compound page the first page keeps the slab state.
  *
- * Lock order:
- *  1. slab_lock(page)
- *  2. slab->list_lock
- *
  * The slabifier assigns one slab for allocation to each processor.
  * Allocations only occur from these active slabs.
  *
@@ -81,99 +77,80 @@ static __always_inline void slab_unlock(
  */
 static void __always_inline add_partial(struct slab *s, struct page *page)
 {
-	spin_lock(&s->list_lock);
-	s->nr_partial++;
-	list_add_tail(&page->lru, &s->partial);
-	spin_unlock(&s->list_lock);
-}
+	int node = page_to_nid(page);
+	struct page *oldpage;

-static void __always_inline remove_partial(struct slab *s,
-						struct page *page)
-{
-	spin_lock(&s->list_lock);
-	list_del(&page->lru);
-	s->nr_partial--;
-	spin_unlock(&s->list_lock);
+	do {
+		oldpage = s->partial[node];
+		page->next = oldpage;
+	} while (cmpxchg(&s->partial[node], oldpage, page) != oldpage);
+	atomic_long_inc(&s->nr_partial);
 }

-/*
- * Lock page and remove it from the partial list
- *
- * Must hold list_lock
- */
-static __always_inline int lock_and_del_slab(struct slab *s,
-						struct page *page)
+static void discard_slab(struct slab *s, struct page *page)
 {
-	if (bit_spin_trylock(PG_locked, &page->flags)) {
-		list_del(&page->lru);
-		s->nr_partial--;
-		return 1;
-	}
-	return 0;
+	atomic_long_dec(&s->nr_slabs);
+
+	page->mapping = NULL;
+	reset_page_mapcount(page);
+	__ClearPageSlab(page);
+	__ClearPageSlabsingle(page);
+
+	s->sc.page_alloc->free(s->sc.page_alloc, page, s->sc.order);
 }

 /*
- * Get a partial page, lock it and return it.
+ * Get a partially allocated or fully free page.
  */
-#ifdef CONFIG_NUMA
 static struct page *get_partial(struct slab *s, int node)
 {
 	struct page *page;
 	int searchnode = (node == -1) ? numa_node_id() : node;
+	int fallback = -1;

-	if (!s->nr_partial)
+redo:
+	if (!atomic_long_read(&s->nr_partial))
 		return NULL;

-	spin_lock(&s->list_lock);
-	/*
-	 * Search for slab on the right node
-	 */
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(page_to_nid(page) == searchnode) &&
-			lock_and_del_slab(s, page))
-				goto out;
+	page = s->partial[searchnode];

-	if (likely(node == -1)) {
+	if (page) {
+		if (cmpxchg(&s->partial[searchnode], page, page->next) != page)
+			goto redo;
+		atomic_long_dec(&s->nr_partial);
 		/*
-		 * We can fall back to any other node in order to
-		 * reduce the size of the partial list.
+		 * If we have gotten a completely empty slab and still have
+		 * pages left on this node then free the slab and take the next.
+		 * This helps to defragment slab pages and avoids the otherwise
+		 * expensive removal of fully freed slabs.
 		 */
-		list_for_each_entry(page, &s->partial, lru)
-			if (likely(lock_and_del_slab(s, page)))
-				goto out;
+		if (!page->inuse && s->partial[searchnode]) {
+			discard_slab(s, page);
+			goto redo;
+		}
+		return page;
 	}

-	/* Nothing found */
-	page = NULL;
-out:
-	spin_unlock(&s->list_lock);
-	return page;
-}
-#else
-static struct page *get_partial(struct slab *s, int node)
-{
-	struct page *page;
-
-	/* Racy check. If we mistakenly see no partial slabs then we
-	 * just allocate an empty slab. If we mistakenly try to get a
-	 * partial slab then get_partials() will return NULL.
-	 */
-	if (!s->nr_partial)
+#ifdef CONFIG_NUMA
+	/* Request for a specific node that we were unable to fulfill */
+	if (node != -1)
 		return NULL;

-	spin_lock(&s->list_lock);
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(lock_and_del_slab(s, page)))
-			goto out;
+	/* OK, we may fall back to other nodes. */
+	fallback = s->fallback;
+	do {
+		if (s->partial[fallback]) {
+			searchnode = fallback;
+			goto redo;
+		}
+		fallback++;
+		if (fallback == MAX_NUMNODES)
+			fallback = 0;

-	/* No slab or all slabs busy */
-	page = NULL;
-out:
-	spin_unlock(&s->list_lock);
-	return page;
-}
+	} while (fallback != s->fallback);
 #endif
-
+	return NULL;
+}

 /*
  * Debugging checks
@@ -224,7 +201,7 @@ static int on_freelist(struct slab *s, s
 {
 	int nr = 0;
 	void **object = page->freelist;
-	void *origin = &page->lru;
+	void *origin = &page->next;

 	if (PageSlabsingle(page))
 		return 0;
@@ -265,18 +242,6 @@ void check_free_chain(struct slab *s, st
 /*
  * Operations on slabs
  */
-static void discard_slab(struct slab *s, struct page *page)
-{
-	atomic_long_dec(&s->nr_slabs);
-
-	page->mapping = NULL;
-	reset_page_mapcount(page);
-	__ClearPageSlab(page);
-	__ClearPageSlabsingle(page);
-
-	s->sc.page_alloc->free(s->sc.page_alloc, page, s->sc.order);
-}
-
 /*
  * Allocate a new slab and prepare an empty freelist and the basic struct
  * page settings.
@@ -488,17 +453,14 @@ new_slab:
 		return page_address(page);
 	}

-	slab_lock(page);
-
 gotpage:
 	if (s->active[cpu]) {
-		slab_unlock(page);
-		discard_slab(s, page);
+		putback_slab(s, page);
 		page = s->active[cpu];
-		slab_lock(page);
 	} else
 		s->active[cpu] = page;

+	slab_lock(page);
 	__SetPageActive(page);
 	check_free_chain(s, page);

@@ -566,27 +528,17 @@ static void slab_free(struct slab_cache
 	page->freelist = object;
 	page->inuse--;

-	if (likely(PageActive(page) || (page->inuse && prior))) {
-out_unlock:
-		slab_unlock(page);
-		local_irq_restore(flags);
-		return;
-	}
-
-	if (!prior) {
+	if (!PageActive(page) && !prior)
 		/*
 		 * Page was fully used before. It will have one free
 		 * object now. So move to the partial list.
 		 */
 		add_partial(s, page);
-		goto out_unlock;
-	}

-	/*
-	 * All object have been freed.
-	 */
-	remove_partial(s, page);
 	slab_unlock(page);
+	local_irq_restore(flags);
+	return;
+
 single_object_slab:
 	discard_slab(s, page);
 	local_irq_restore(flags);
@@ -684,7 +636,7 @@ static struct slab_cache *slab_create(st
 	s->objects = (PAGE_SIZE << sc->order) / s->size;
 	BUG_ON(s->objects > 65535);
 	atomic_long_set(&s->nr_slabs, 0);
-	s->nr_partial = 0;
+	atomic_long_set(&s->nr_partial, 0);
 #ifdef CONFIG_SMP
 	atomic_set(&s->active_cpus, 0);
 	INIT_WORK(&s->flush, &flusher, s);
@@ -692,10 +644,8 @@ static struct slab_cache *slab_create(st
 	if (!s->objects)
 		return NULL;

-	INIT_LIST_HEAD(&s->partial);
-
+	memset(s->partial, 0, sizeof(s->partial));
 	atomic_set(&s->refcount, 1);
-	spin_lock_init(&s->list_lock);
 	mutex_init(&s->flushing);
 	for_each_possible_cpu(cpu)
 		s->active[cpu] = NULL;
@@ -784,6 +734,64 @@ static int move_slab_objects(struct slab
 	return unfreeable;
 }

+struct shrink_info {
+	struct slab *s;
+	int (*move_object)(struct slab_cache *, void *);
+};
+
+/*
+ * Shrink a slab on a node.
+ *
+ * We get all pages off the per node lists and free all that are
+ * empty. The remaining pages are compacted if move_object
+ * is set, otherwise they are simply put back onto the per node
+ * list.
+ */
+static void slab_shrink_node(void *d)
+{
+	struct page *list = NULL;
+	struct page *page;
+	struct shrink_info *si = d;
+
+	while ((page = get_partial(si->s, numa_node_id()))) {
+
+		if (!page->inuse)
+			discard_slab(si->s, page);
+		else
+			if (page->inuse < si->s->objects) {
+				page->next = list;
+				list = page;
+			}
+	}
+
+	if (!list)
+		return;
+
+	if (!si->s->partial[numa_node_id()]) {
+		page = list;
+		list = page->next;
+		add_partial(si->s, page);
+	}
+
+	/*
+	 * Cycle through the remainder of the list and compact or put back slabs
+	 */
+	while (list) {
+		page = list;
+		list = page->next;
+
+		if (!page->inuse)
+			discard_slab(si->s, page);
+		else {
+			if (si->move_object)
+				move_slab_objects(si->s, page, si->move_object);
+
+			if (page->inuse)
+				putback_slab(si->s, page);
+		}
+	}
+}
+
 /*
  * Shrinking drops the active per cpu slabs and also reaps all empty
 * slabs off the partial list. Returns the number of slabs freed.
@@ -798,47 +806,17 @@ static int move_slab_objects(struct slab
  *
  * Returns the number of slabs freed.
  */
-static int slab_shrink(struct slab_cache *sc,
+static void slab_shrink(struct slab_cache *sc,
 	int (*move_object)(struct slab_cache *, void *))
 {
 	struct slab *s = (void *)sc;
-	unsigned long flags;
-	int slabs_freed = 0;
-	int i;
+	struct shrink_info si = { s, move_object };

 	drain_all(s);
-
-	local_irq_save(flags);
-	for(i = 0; s->nr_partial > 1 && i < s->nr_partial - 1; i++ ) {
-		struct page * page;
-
-		page = get_partial(s, -1);
-		if (!page)
-			break;
-
-		/*
-		 * Pin page so that slab_free will not free even if we
-		 * drop the slab lock.
-		 */
-		__SetPageActive(page);
-
-		if (page->inuse < s->objects && move_object)
-			if (move_slab_objects(s,
-				page, move_object) == 0)
-					slabs_freed++;
-
-		/*
-		 * This will put the slab on the front of the partial
-		 * list, the used list or free it.
-		 */
-		__ClearPageActive(page);
-		putback_slab(s, page);
-	}
-	local_irq_restore(flags);
-	return slabs_freed;
-
+	schedule_on_each_node(slab_shrink_node, &si);
 }
+
 static struct slab_cache *slab_dup(struct slab_cache *sc)
 {
 	struct slab *s = (void *)sc;
@@ -847,23 +825,6 @@ static struct slab_cache *slab_dup(struc
 	return &s->sc;
 }

-static int free_list(struct slab *s, struct list_head *list)
-{
-	int slabs_inuse = 0;
-	unsigned long flags;
-	struct page *page, *h;
-
-	spin_lock_irqsave(&s->list_lock, flags);
-	list_for_each_entry_safe(page, h, list, lru)
-		if (!page->inuse) {
-			list_del(&s->partial);
-			discard_slab(s, page);
-		} else
-			slabs_inuse++;
-	spin_unlock_irqrestore(&s->list_lock, flags);
-	return slabs_inuse;
-}
-
 static int slab_destroy(struct slab_cache *sc)
 {
 	struct slab *s = (void *)sc;
@@ -871,8 +832,7 @@ static int slab_destroy(struct slab_cach
 	if (!atomic_dec_and_test(&s->refcount))
 		return 0;

-	drain_all(s);
-	free_list(s, &s->partial);
+	slab_shrink(sc, NULL);

 	if (atomic_long_read(&s->nr_slabs))
 		return 1;
@@ -882,16 +842,22 @@ static int slab_destroy(struct slab_cach
 	return 0;
 }

-static unsigned long count_objects(struct slab *s, struct list_head *list)
+/*
+ * This is racy and may produce weird results. We check the page pointers
+ * carefully to see if they are still valid.
+ */
+static unsigned long count_objects(struct slab *s)
 {
 	int count = 0;
 	struct page *page;
-	unsigned long flags;
+	int node;

-	spin_lock_irqsave(&s->list_lock, flags);
-	list_for_each_entry(page, list, lru)
-		count += page->inuse;
-	spin_unlock_irqrestore(&s->list_lock, flags);
+	for_each_node(node)
+		for (page = s->partial[node];
+				page && pfn_valid(page_to_pfn(page)) &&
+					page->inuse < s->objects;
+				page = page->next)
+			count += page->inuse;
 	return count;
 }

@@ -900,7 +866,7 @@ static unsigned long slab_objects(struct
 		unsigned long *p_partial)
 {
 	struct slab *s = (void *)sc;
-	int partial = count_objects(s, &s->partial);
+	int partial = count_objects(s);
 	int nr_slabs = atomic_read(&s->nr_slabs);
 	int active = 0;			/* Active slabs */
 	int nr_active = 0;		/* Objects in active slabs */
@@ -916,7 +882,7 @@ static unsigned long slab_objects(struct
 	}

 	if (p_partial)
-		*p_partial = s->nr_partial;
+		*p_partial = atomic_long_read(&s->nr_partial);

 	if (p_active)
 		*p_active = nr_active;
@@ -925,7 +891,7 @@ static unsigned long slab_objects(struct
 		*p_total = nr_slabs;

 	return partial + active +
-		(nr_slabs - s->nr_partial - nr_active) * s->objects;
+		(nr_slabs - atomic_long_read(&s->nr_partial) - nr_active) * s->objects;
 }

 const struct slab_allocator slabifier_allocator = {
Index: linux-2.6.19-rc1-mm1/include/linux/allocator.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/allocator.h	2006-10-16 18:38:10.341124236 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/allocator.h	2006-10-16 20:03:36.215526138 -0700
@@ -133,6 +133,7 @@ struct slab_control {
 	struct slab_cache sc;		/* Common information */
 	void *data[50];			/* Some data */
 	void *percpu[NR_CPUS];		/* Some per cpu information. */
+	void *pernode[MAX_NUMNODES];	/* Some per node data */
 };

 struct slab_allocator {
@@ -177,7 +178,7 @@ struct slab_allocator {
	 * return 1 for success. If it return 0 then the object is pinned.
	 * the slab that the object resides on will not be freed.
	 */
-	int (*shrink)(struct slab_cache *,
+	void (*shrink)(struct slab_cache *,
 		int (*move_object)(struct slab_cache *, void *));

 	/*
Index: linux-2.6.19-rc1-mm1/include/linux/mm_types.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/mm_types.h	2006-10-16 18:38:12.389826153 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/mm_types.h	2006-10-16 19:10:47.044777115 -0700
@@ -58,9 +58,12 @@ struct page {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* Slabifier: free object */
 		};
-	struct list_head lru;		/* Pageout list, eg. active_list
+	union {
+		struct list_head lru;	/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
 					 */
+		struct page *next;	/* slabifier: Next free page */
+	};
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
Index: linux-2.6.19-rc1-mm1/include/linux/workqueue.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/workqueue.h	2006-10-04 19:57:05.000000000 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/workqueue.h	2006-10-16 20:04:03.215813870 -0700
@@ -72,6 +72,7 @@ extern int FASTCALL(schedule_delayed_wor
 extern int schedule_delayed_work_on(int cpu, struct work_struct *work,
 					unsigned long delay);
 extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
+extern int schedule_on_each_node(void (*func)(void *info), void *info);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
Index: linux-2.6.19-rc1-mm1/kernel/workqueue.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/kernel/workqueue.c	2006-10-16 18:38:04.540700317 -0700
+++ linux-2.6.19-rc1-mm1/kernel/workqueue.c	2006-10-16 20:06:40.397525117 -0700
@@ -531,6 +531,29 @@ int schedule_on_each_cpu(void (*func)(vo
 	return 0;
 }

+int schedule_on_each_node(void (*func)(void *info), void *info)
+{
+	int node;
+	struct work_struct *works;
+
+	works = alloc_percpu(struct work_struct);
+	if (!works)
+		return -ENOMEM;
+
+	mutex_lock(&workqueue_mutex);
+	for_each_online_node(node) {
+		int cpu = node_to_first_cpu(node);
+
+		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				per_cpu_ptr(works, cpu));
+	}
+	mutex_unlock(&workqueue_mutex);
+	flush_workqueue(keventd_wq);
+	free_percpu(works);
+	return 0;
+}
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);
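
Usage note (not part of the patch): schedule_on_each_node() queues the callback on one CPU of every online node and flushes keventd before returning, which is what lets slab_shrink_node() use numa_node_id() to pick the per node partial list to work on. Below is a minimal sketch of a caller written against the interface added above; the cache type and helper names (my_cache, my_shrink_node, my_shrink_all_nodes) are invented for illustration only.

	#include <linux/workqueue.h>
	#include <linux/topology.h>
	#include <linux/numa.h>
	#include <linux/mm.h>
	#include <linux/kernel.h>

	/* Hypothetical cache with one partial list per node, like struct slab above. */
	struct my_cache {
		struct page *partial[MAX_NUMNODES];
	};

	/* Per-invocation context, mirroring struct shrink_info in the patch. */
	struct my_shrink_info {
		struct my_cache *cache;
	};

	/*
	 * Runs via keventd on one CPU of each online node, so numa_node_id()
	 * identifies the node whose partial list should be processed.
	 */
	static void my_shrink_node(void *info)
	{
		struct my_shrink_info *si = info;

		printk(KERN_DEBUG "shrinking cache %p on node %d\n",
				si->cache, numa_node_id());
		/* ... walk si->cache->partial[numa_node_id()] here ... */
	}

	static void my_shrink_all_nodes(struct my_cache *cache)
	{
		struct my_shrink_info si = { cache };

		/* Returns only after the work item has run on every online node. */
		schedule_on_each_node(my_shrink_node, &si);
	}

Because the work is queued on node_to_first_cpu(node) for each node and keventd_wq is flushed before schedule_on_each_node() returns, the on-stack shrink_info passed by slab_shrink() stays valid for the whole operation.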