Index: linux-2.6.18-rc4/mm/slabifier.c
===================================================================
--- linux-2.6.18-rc4.orig/mm/slabifier.c	2006-08-22 21:57:46.982809406 -0700
+++ linux-2.6.18-rc4/mm/slabifier.c	2006-08-23 11:56:44.603652940 -0700
@@ -23,18 +23,17 @@
 struct slab {
 	struct slab_cache sc;
 
-	spinlock_t list_lock;
-	struct list_head partial;	/* List of partially allocated slabs */
-	struct list_head full;		/* Fully allocated slabs */
-	unsigned long nr_partial;	/* Partial slabs */
-	unsigned long nr_slabs;		/* Total slabs used */
+	struct work_struct flush;
+	ZONE_PADDING(slab_pad);		/* Align to cacheline boundary */
 	int size;			/* Slab size */
 	int offset;			/* Free pointer offset. */
 	int objects;			/* Number of objects in slab */
 	atomic_t refcount;		/* Refcount for destroy */
-	/* Flusher related data */
+	atomic_long_t nr_slabs;		/* Total slabs used */
+	spinlock_t list_lock;
+	struct list_head partial;	/* List of partially allocated slabs */
+	unsigned long nr_partial;	/* Partial slabs */
 	int flusher_active;
-	struct work_struct flush;
 	struct page *active[NR_CPUS];	/* Per CPU slabs list protected by
 					 * page lock
 					 */
@@ -120,28 +119,95 @@ static int get_object_counter(struct pag
 }
 
 /*
- * For a given active page get the corresponding cpu */
-static int get_active_cpu(struct page *page)
+ * Locking for each individual slab using the pagelock
+ */
+static __always_inline void slab_lock(struct page *page)
 {
-	return (unsigned long)(page->lru.prev);
+	bit_spin_lock(PG_locked, &page->flags);
 }
 
-static void set_active_cpu(struct page *page, unsigned long cpu)
+static __always_inline void slab_unlock(struct page *page)
 {
-	page->lru.prev = (void *)cpu;
+	bit_spin_unlock(PG_locked, &page->flags);
+}
+
+static void add_partial(struct slab *s, struct page *page)
+{
+	spin_lock(&s->list_lock);
+	s->nr_partial++;
+	list_add_tail(&page->lru, &s->partial);
+	spin_unlock(&s->list_lock);
+}
+
+static void remove_partial(struct slab *s, struct page *page)
+{
+	spin_lock(&s->list_lock);
+	list_del(&page->lru);
+	s->nr_partial--;
+	spin_unlock(&s->list_lock);
 }
 
 /*
- * Locking for each individual slab using the pagelock
+ * Get a page and remove it from the partial list
+ * Must hold list_lock
  */
-static void slab_lock(struct page *page)
+static int lock_and_del_slab(struct slab *s, struct page *page)
 {
-	bit_spin_lock(PG_locked, &page->flags);
+	if (bit_spin_trylock(PG_locked, &page->flags)) {
+		list_del(&page->lru);
+		s->nr_partial--;
+		return 1;
+	}
+	return 0;
 }
 
-static void slab_unlock(struct page *page)
+/*
+ * Get a partial page, lock it and return it.
+ */
+static struct page *get_partial(struct slab *s, int node)
 {
-	bit_spin_unlock(PG_locked, &page->flags);
+	struct page *page;
+	struct list_head *h;
+	int wanted_node;
+
+	spin_lock(&s->list_lock);
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Search for slab on the right node
+	 *
+	 * This search is a scalability concern. Searching big
+	 * lists under lock can cause latencies.
+	 *
+	 * On the other hand picking the right slab that
+	 * is from the node where we are and maybe even
+	 * from the same cpu as before is very good
+	 * for latency.
+	 */
+	wanted_node = node < 0 ? numa_node_id() : node;
+	list_for_each(h, &s->partial) {
+		page = container_of(h, struct page, lru);
+
+		if (likely(page_to_nid(page) == wanted_node) &&
+				lock_and_del_slab(s, page))
+			goto out;
+	}
+
+	if (node >= 0)
+		goto fail;
+
+#endif
+	list_for_each(h, &s->partial) {
+		page = container_of(h, struct page, lru);
+
+		if (lock_and_del_slab(s, page))
+			goto out;
+	}
+fail:
+	page = NULL;
+out:
+	spin_unlock(&s->list_lock);
+	return page;
 }
 
 static void check_slab(struct page *page)
@@ -170,18 +236,12 @@ static void check_active_slab(struct pag
 
 /*
  * Discard an unused slab page
- * Must hold list_lock.
- * Cannot hold the slab lock since the page is going away.
  */
 static void discard_slab(struct slab *s, struct page *page)
 {
-	TPRINTK(KERN_CRIT "slab %s free %p page_alloc=%p free=%p\n", s->sc.name, page,
-		s->sc.page_alloc, s->sc.page_alloc->free);
-
 	DBUG_ON(PageActive(page));
 	DBUG_ON(PageLocked(page));
-	list_del(&page->lru);
-	s->nr_slabs--;
+	atomic_long_dec(&s->nr_slabs);
 
 	/* Restore page state */
 	page->mapping = NULL;	/* was used for slab pointer */
@@ -194,50 +254,39 @@ static void discard_slab(struct slab *s,
 }
 
 /*
- * Move a page back to the lists. This can be an active page or a page
- * that was taken off the list for another purpose.
+ * Move a page back to the lists.
  *
  * Must be called with the slab lock held.
  * On exit the slab lock will have been dropped.
  */
-static void deactivate_slab(struct slab *s, struct page *page)
+static void putback_slab(struct slab *s, struct page *page)
 {
 	int inuse;
-#ifdef SLABIFIER_DEBUG
-	void *objp;
-	int cpu = get_active_cpu(page);
-#endif
-	spin_lock(&s->list_lock);
-	s->active[get_active_cpu(page)] = NULL;
-	ClearPageActive(page);
-	ClearPageReferenced(page);
 	inuse = get_object_counter(page);
-#ifdef SLABIFIER_DEBUG
-	/*
-	 * Must get this before dropping slab lock otherwise others
-	 * may already be freeing objects in the page again.
-	 */
-	objp = get_object_pointer(page);
-#endif
-	slab_unlock(page);
+
+	TPRINTK(KERN_CRIT "putback_slab %s: %p %d/%d\n",s->sc.name, page, inuse, s->objects);
 	if (inuse) {
-		if (inuse < s->objects) {
-			DBUG_ON(!objp);
-			TPRINTK(KERN_CRIT "slab %s: %p partial %d/%d %d cpu=%d\n",s->sc.name, page, inuse, s->objects, contended, cpu);
-			s->nr_partial++;
-			list_add(&page->lru, &s->partial);
-		} else {
-			DBUG_ON(objp);
-			TPRINTK(KERN_CRIT "slab %s: %p full %d cpu=%d\n",s->sc.name, page, contended, cpu);
-			list_add_tail(&page->lru, &s->full);
-		}
+		if (inuse < s->objects)
+			add_partial(s, page);
+		slab_unlock(page);
 	} else {
-		/* For discard_slab we must have the slab on some list */
-		list_add_tail(&page->lru, &s->full);
+		slab_unlock(page);
 		discard_slab(s, page);
 	}
-	spin_unlock(&s->list_lock);
+}
+
+/*
+ * Make the current active page inactive
+ */
+static void deactivate_slab(struct slab *s, struct page *page, int cpu)
+{
+	s->active[cpu] = NULL;
+	smp_wmb();
+	ClearPageActive(page);
+	ClearPageReferenced(page);
+
+	putback_slab(s, page);
 }
 
 static int check_valid_pointer(struct slab *s, struct page *page, void *object, void *origin)
 {
@@ -245,10 +294,8 @@ static int check_valid_pointer(struct sl
 #ifdef SLABIFIER_DEBUG
 	void *base = page_address(page);
 
-	check_slab(page);
-
 	if (object < base || object >= base + s->objects * s->size) {
-		printk(KERN_CRIT "slab %s size %d: pointer %p->%p\nnot in "
+		printk(KERN_CRIT "slab %s size %d: pointer %p->%p\nnot in"
 			" range (%p-%p) in page %p\n", s->sc.name, s->size,
 			origin, object, base, base + s->objects * s->size,
 			page);
@@ -279,7 +326,7 @@ static int on_freelist(struct slab *s, s
 
 	check_slab(page);
 
-	while (object) {
+	while (object && nr <= s->objects) {
 		if (object == search)
 			return 1;
 		if (!check_valid_pointer(s, page, object, origin))
@@ -292,7 +339,7 @@ static int on_freelist(struct slab *s, s
 	if (get_object_counter(page) != s->objects - nr) {
 		printk(KERN_CRIT "slab %s: page %p wrong object count."
 			" counter is %d but counted were %d\n",
-			s->sc.name, page, get_object_counter(page), nr);
+			s->sc.name, page, get_object_counter(page), s->objects - nr);
 try_recover:
 		printk(KERN_CRIT "****** Trying to continue by marking "
 			"all objects used (memory leak!)\n");
@@ -312,17 +359,16 @@ void check_free_chain(struct slab *s, st
 /*
  * Allocate a new slab and prepare an empty freelist
  * and the basic struct page settings.
+ * Return with the slab locked.
  */
-static struct page *new_slab(struct slab *s, gfp_t flags)
+static struct page *new_slab(struct slab *s, gfp_t flags, int node)
 {
 	void *p, *start, *end;
 	void **last;
 	struct page *page;
 
-	TPRINTK(KERN_CRIT "add slab %s flags=%x\n", s->sc.name, flags);
-
 	page = s->sc.page_alloc->allocate(s->sc.page_alloc, s->sc.order,
-		flags, s->sc.node);
+		flags, node < 0 ? s->sc.node : node);
 	if (!page)
 		return NULL;
 
@@ -341,6 +387,8 @@ static struct page *new_slab(struct slab
 	__SetPageSlab(page);
 	check_free_chain(s, page);
 	add_zone_page_state(page_zone(page), NR_SLAB, 1 << s->sc.order);
+	atomic_long_inc(&s->nr_slabs);
+	slab_lock(page);
 	return page;
 }
 
@@ -348,8 +396,7 @@ static struct page *new_slab(struct slab
  * Acquire the slab lock from the active array. If there is no active
  * slab for this processor then return NULL;
  */
-static struct page *get_and_lock_active(struct slab *s, int cpu)
-{
+static __always_inline struct page *get_and_lock_active(struct slab *s, int cpu) {
 	struct page *page;
 
 redo:
@@ -362,6 +409,7 @@ redo:
 		goto redo;
 	}
 	check_active_slab(page);
+	check_free_chain(s, page);
 	return page;
 }
 
@@ -377,7 +425,7 @@ static void flush_active(struct slab *s,
 	unsigned long flags;
 	local_irq_save(flags);
 	page = get_and_lock_active(s, cpu);
 	if (likely(page))
-		deactivate_slab(s, page);
+		deactivate_slab(s, page, cpu);
 	local_irq_restore(flags);
 }
@@ -450,6 +498,8 @@ static struct slab_cache *slab_create(st
 	struct slab *s = (void *)x;
 	int cpu;
 
+	BUG_ON(sizeof(struct slab_control) < sizeof(struct slab));
+
 	memcpy(&x->sc, sc, sizeof(struct slab_cache));
 
 	s->size = ALIGN(sc->size, sizeof(void *));
@@ -459,7 +509,7 @@ static struct slab_cache *slab_create(st
 
 	s->offset = sc->offset / sizeof(void *);
 	s->objects = (PAGE_SIZE << sc->order) / s->size;
-	s->nr_slabs = 0;
+	atomic_long_set(&s->nr_slabs, 0);
 	s->nr_partial = 0;
 	s->flusher_active = 0;
 
@@ -467,7 +517,6 @@ static struct slab_cache *slab_create(st
 		return NULL;
 
 	INIT_LIST_HEAD(&s->partial);
-	INIT_LIST_HEAD(&s->full);
 	atomic_set(&s->refcount, 1);
 	spin_lock_init(&s->list_lock);
 
@@ -486,117 +535,117 @@ static struct slab_cache *slab_create(st
 *
 * Return NULL if we cannot reload.
 */
-static struct page *reload(struct slab *s, unsigned long cpu, gfp_t flags)
+static struct page *reload(struct slab *s, unsigned long cpu, gfp_t flags,
+	int node)
 {
	struct page *page;
 
 redo:
-	if (unlikely(list_empty(&s->partial))) {
-		/* Add more slabs to the partial list */
-		if ((flags & __GFP_WAIT)) {
-			local_irq_enable();
-			page = new_slab(s, flags);
-			local_irq_disable();
-		} else
-			page = new_slab(s, flags);
-
-		if (!page)
-			return NULL;
+	if (s->nr_partial) {	/* Racy check. If we do a useless allocation then
+				   we just build up the partial list */
+		page = get_partial(s, node);
+		if (page)
+			goto gotpage;
+	}
 
-		spin_lock(&s->list_lock);
-		s->nr_slabs++;
+	if ((flags & __GFP_WAIT)) {
+		local_irq_enable();
+		page = new_slab(s, flags, node);
+		local_irq_disable();
 	} else
-		spin_lock(&s->list_lock);
-	page = NULL;	/* Help compiler to not get confused */
-	/* Recheck */
-	if (unlikely(list_empty(&s->partial))) {
-		/* Another processor drained the list */
-		spin_unlock(&s->list_lock);
-		goto redo;
-	page = lru_to_first_page(&s->partial);
-	list_del(&page->lru);
-	/* Search list for page from the correct node */
-	s->nr_partial--;
-	}
+		page = new_slab(s, flags, node);
+
+	if (!page)
+		return NULL;
+gotpage:
 	/*
-	 * Now we have a page that is isolated from the lists
-	 * and we hold the list lock. So no one can modify
-	 * active slab pointers.
+	 * Now we have a page that is isolated from the lists and
+	 * locked.
 	 */
+	SetPageActive(page);
+	ClearPageReferenced(page);
+
+	/*
+	 * Barrier is needed so that a racing process never
+	 * sees a page that has active not set.
+	 */
+	smp_wmb();
+
+	if (cmpxchg(&s->active[cpu], NULL, page) != NULL) {
+
+		TPRINTK(KERN_CRIT "active already provided %s\n", s->sc.name);
+
+		ClearPageActive(page);
+		add_partial(s, page);
+		slab_unlock(page);
 
-	if (unlikely(s->active[cpu])) {
-		/* Someone else created a new slab here */
-		list_add(&page->lru,&s->partial);
-		s->nr_partial++;
-		spin_unlock(&s->list_lock);
 		page = get_and_lock_active(s, cpu);
 		if (page)
			return page;
 		goto redo;
 	}
-	/*
-	 * Lock inversion. This works because s->active[cpu] is null. No one else
-	 * can acquire the lock. However, we must insure that the lock bit becomes
-	 * visible before the update to s->active[cpu]. Thus the write barrier here.
-	 * get_and_lock active uses a lock there which gives us the implicit
-	 * corresponding smb_rmb() barrier.
-	 */
-	slab_lock(page);
-	SetPageActive(page);
-	ClearPageReferenced(page);
-	set_active_cpu(page, cpu);
-	smp_wmb();
-	s->active[cpu] = page;
 	check_free_chain(s, page);
-	spin_unlock(&s->list_lock);
 
 	if (keventd_up() && !s->flusher_active &&
 			s->size != (PAGE_SIZE << s->sc.order))
 		schedule_delayed_work(&s->flush, 10 * HZ);

 	return page;
 }
+
 /*
 * If the gfp mask has __GFP_WAIT set then slab_alloc() may enable interrupts
 * if it needs to acquire more pages for new slabs.
 */
-static void *slab_alloc(struct slab_cache *sc, gfp_t gfpflags)
+static __always_inline void *__slab_alloc(struct slab_cache *sc, gfp_t gfpflags,
+	int node)
 {
	struct slab *s = (void *)sc;
-	int cpu = smp_processor_id();
	struct page *page;
-	void **object = NULL;
+	void **object;
	void *next_object;
	unsigned long flags;
+	int cpu = smp_processor_id();
 
	local_irq_save(flags);
+	page = get_and_lock_active(s, cpu);
+	if (unlikely(!page))
+		goto load;
 
-	do {
-		page = get_and_lock_active(s, cpu);
-
-		if (unlikely(!page)) {
-			page = reload(s, cpu, gfpflags);
+	while (unlikely(!get_object_pointer(page) ||
+		(node > 0 && page_to_nid(page) != node))) {
 
-			if (!page)
-				goto out;
+		/* Current slab is unfit for allocation */
+		deactivate_slab(s, page, cpu);
+load:
+		/* Get a new slab */
+		page = reload(s, cpu, gfpflags, node);
+		if (!page) {
+			local_irq_restore(flags);
+			return NULL;
		}
-
-		object = get_object_pointer(page);
-	} while (!object);
+	}
 
	inc_object_counter(page);
+	object = get_object_pointer(page);
	next_object = object[s->offset];
	set_object_pointer(page, next_object);
-	if (likely(!next_object))
-		/* Sorry, fully allocated slab! */
-		deactivate_slab(s, page);
-	else
-		SetPageReferenced(page);
	check_free_chain(s, page);
+	SetPageReferenced(page);
	slab_unlock(page);
-out:
	local_irq_restore(flags);
	return object;
+
+}
+
+static void *slab_alloc(struct slab_cache *sc, gfp_t gfpflags)
+{
+	return __slab_alloc(sc, gfpflags, -1);
+}
+
+static void *slab_alloc_node(struct slab_cache *sc, gfp_t gfpflags, int node)
+{
+	return __slab_alloc(sc, gfpflags, node);
 }
 
 /* Figure out on which slab object the object resides */
@@ -708,11 +757,11 @@ dumpret:
 		 * We deallocated all objects in a slab and the slab
 		 * is not under allocation. So we can free it.
 		 */
-		spin_lock(&s->list_lock);
+		if (s->objects > 1)
+			remove_partial(s, page);
 		check_free_chain(s, page);
 		slab_unlock(page);
 		discard_slab(s, page);
-		spin_unlock(&s->list_lock);
 		goto out;
 	}
 	if (unlikely(!prior)) {
@@ -722,10 +771,7 @@ dumpret:
 		 * This will increase the chances of the first object
 		 * to be reused soon. Its likely cache hot.
 		 */
-		spin_lock(&s->list_lock);
-		list_move(&page->lru, &s->partial);
-		s->nr_partial++;
-		spin_unlock(&s->list_lock);
+		add_partial(s, page);
 	}
 out_unlock:
 	slab_unlock(page);
@@ -803,13 +849,16 @@ static int move_slab_objects(struct slab
 				 * Drop the lock here to allow the
 				 * move_object function to do things
 				 * with the slab_cache and maybe this
-				 * page
-				 */
+				 * page.
+				 *
+				 */
 				slab_unlock(page);
+				local_irq_enable();
 				if (move_objects((struct slab_cache *)s, p))
 					slab_free(&s->sc, p);
 				else
 					unfreeable++;
+				local_irq_disable();
 				slab_lock(page);
 			}
 		}
@@ -846,22 +895,12 @@ static int slab_shrink(struct slab_cache
 
 	drain_all(s);
 
+	local_irq_save(flags);
 	for(i = 0; s->nr_partial > 1 && i < s->nr_partial - 1; i++ ) {
 		struct page * page;
 
-		/* Take one page off the list */
-		spin_lock_irqsave(&s->list_lock, flags);
-
-		if (s->nr_partial == 0) {
-			spin_unlock_irqrestore(&s->list_lock, flags);
-			break;
-		}
-
-		page = lru_to_last_page(&s->partial);
-		s->nr_partial--;
-		list_del(&page->lru);
-		SetPageActive(page);	/* Pin page so that slab_free will not free */
-		spin_unlock_irqrestore(&s->list_lock, flags);
+		page = get_partial(s, -1);
+		SetPageActive(page);	/* Pin page so that slab_free will not free */
 
 		/*
 		 * Ok. The page cannot become active anymore.
@@ -878,8 +917,9 @@ static int slab_shrink(struct slab_cache
 		 * This will put the slab on the front of the partial
 		 * list, the used list or free it.
 		 */
-		deactivate_slab(s, page);
+		putback_slab(s, page);
 	}
+	local_irq_restore(flags);
 
 	return slabs_freed;
 
@@ -917,12 +957,13 @@ static int slab_destroy(struct slab_cach
 		return 0;
 
 	TPRINTK("Slab destroy %s\n",sc->name);
+
 	drain_all(s);
 
-	free_list(s, &s->full);
+	/* There may be empty slabs on the partial list */
 	free_list(s, &s->partial);
 
-	if (s->nr_slabs)
+	if (atomic_long_read(&s->nr_slabs))
 		return 1;
 
 	/* Just to make sure that no one uses this again */
@@ -967,51 +1008,8 @@ static unsigned long slab_objects(struct
 	if (p_active)
 		*p_active = active;
 
-	return active + partial + count_objects(s, &s->full);
-}
-
-static void *slab_alloc_node(struct slab_cache *sc, gfp_t flags, int node)
-{
-	struct slab *s = (void *)sc;
-	int cpu = node_to_first_cpu(node);
-	struct page *page;
-	void **object = NULL;
-	void *next_object;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	do {
-		page = get_and_lock_active(s, cpu);
-
-		if (unlikely(!page)) {
-			page = reload(s, cpu, gfpflags);
-
-			if (!page)
-				goto out;
-		}
-		if (page_to_nid(page) != node) {
-			deactivate_slab(page);
-			continue;
-		}
-
-		object = get_object_pointer(page);
-	} while (!object);
-
-	inc_object_counter(page);
-	next_object = object[s->offset];
-	set_object_pointer(page, next_object);
-	if (likely(!next_object))
-		/* Sorry, fully allocated slab! */
-		deactivate_slab(s, page);
-	else
-		SetPageReferenced(page);
-	check_free_chain(s, page);
-	slab_unlock(page);
-out:
-	local_irq_restore(flags);
-	return object;
-	return slab_alloc(sc, flags);
+	return active + partial +
+		(atomic_long_read(&s->nr_slabs) - s->nr_partial) * s->objects;
 }
 
 const struct slab_allocator slabifier_allocator = {
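
Illustrative note (not part of the patch): the subtle step in reload() above is how a freshly allocated, locked slab becomes the per-cpu active slab. The page is initialized and marked active first, smp_wmb() orders those stores, and cmpxchg() installs the page in s->active[cpu] only if no other context installed one in the meantime. The stand-alone, user-space sketch below mimics that hand-off using GCC __atomic builtins; struct fake_page, publish_active() and the fixed object count are made-up stand-ins for illustration, not kernel API.

#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

struct fake_page {
	int active;		/* stands in for PageActive()        */
	int objects_free;	/* stands in for the slab's freelist */
};

static struct fake_page *active[NR_CPUS];

/* Try to make "page" the active slab of "cpu"; returns 1 on success. */
static int publish_active(struct fake_page *page, int cpu)
{
	struct fake_page *expected = NULL;

	/* Initialize the page before it becomes visible to other CPUs. */
	page->active = 1;
	page->objects_free = 8;

	/*
	 * Release ordering on the successful exchange plays the role of
	 * smp_wmb(): a CPU that sees the published pointer also sees the
	 * stores above. The CAS mirrors cmpxchg(&s->active[cpu], NULL, page).
	 */
	return __atomic_compare_exchange_n(&active[cpu], &expected, page,
					   0, __ATOMIC_RELEASE,
					   __ATOMIC_RELAXED);
}

int main(void)
{
	struct fake_page page = { 0, 0 };

	if (publish_active(&page, 0))
		printf("cpu 0 active slab has %d free objects\n",
		       active[0]->objects_free);
	else
		printf("lost the race, another active slab is installed\n");
	return 0;
}

The losing path in the sketch corresponds to the cmpxchg() failure branch in reload(), where the patch clears PageActive again, puts the page back on the partial list and retries with whatever slab the other context installed.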