SLUB: Avoid interrupt disable / enable in hot paths through cmpxchg

A cmpxchg allows us to avoid disabling and enabling interrupts in the
hot paths. The cmpxchg keeps operations on the per cpu freelist correct
even if we are moved to another processor on the way to the cmpxchg, so
we do not need to be pinned to a cpu. This may be particularly useful
for the RT kernel, where we currently seem to have major SLAB issues
with the per cpu structures. But getting rid of the constant interrupt
disable / enable in slab operations also improves performance in
general.

The hard binding to per cpu structures only comes into play when we
enter the slow path (__slab_alloc and __slab_free).

Pro:
- Allocation in slab_alloc dirties a single cacheline with a single
  instruction.
- The critical section in slab_free is also a single instruction (but
  we still need to write to the cacheline of the object itself).

Con:
- Complex, racy freelist management.
- The per cpu structure address must be recalculated in __slab_alloc
  since the process may have been rescheduled while executing
  slab_alloc.

Signed-off-by: Christoph Lameter

---

For illustration, a standalone user-space sketch of the lockless fast
paths is appended below the diff.

 mm/slub.c | 125 +++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 91 insertions(+), 34 deletions(-)

Index: linux-2.6.22-rc6-mm1/mm/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slub.c	2007-07-06 16:33:32.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slub.c	2007-07-06 20:00:07.000000000 -0700
@@ -143,6 +143,25 @@ static inline void ClearSlabDebug(struct
  */
 #define CPU_FREELIST_OFF (void **)(16)
 
+static inline int freelist_off(void **x)
+{
+	return x == CPU_FREELIST_OFF;
+}
+
+static inline int freelist_off_or_empty(void *x)
+{
+	return (unsigned long)x <= (unsigned long)CPU_FREELIST_OFF;
+}
+
+/*
+ * Retrieve the status of a freelist and clear it. There are no locks
+ * protecting the freelist. So this is the only way to get the freelist
+ * into a stable condition.
+ */
+static inline void **freelist_get_and_clear(struct kmem_cache_cpu *c)
+{
+	return xchg(&c->freelist, CPU_FREELIST_OFF);
+}
 /*
  * Issues still to be resolved:
  *
@@ -1372,34 +1391,38 @@ static void unfreeze_slab(struct kmem_ca
 /*
  * Remove the cpu slab
  */
-static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c,
+						void **freelist)
 {
 	struct page *page = c->page;
+
+	c->page = NULL;
 	/*
 	 * Merge cpu freelist into freelist. Typically we get here
 	 * because both freelists are empty. So this is unlikely
 	 * to occur.
 	 */
-	while (unlikely(c->freelist)) {
+	while (unlikely(!freelist_off_or_empty(freelist))) {
 		void **object;
 
 		/* Retrieve object from cpu_freelist */
-		object = c->freelist;
-		c->freelist = c->freelist[c->offset];
+		object = freelist;
+		freelist = freelist[c->offset];
 
 		/* And put onto the regular freelist */
 		object[c->offset] = page->freelist;
 		page->freelist = object;
 		page->inuse--;
 	}
-	c->page = NULL;
 	unfreeze_slab(s, page);
 }
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
+	void **freelist = freelist_get_and_clear(c);
+
 	slab_lock(c->page);
-	deactivate_slab(s, c);
+	deactivate_slab(s, c, freelist);
 }
 
 /*
@@ -1465,17 +1488,28 @@ static inline int node_match(struct kmem
  * we need to allocate a new slab. This is slowest path since we may sleep.
  */
 static void *__slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+		gfp_t gfpflags, int node, void *addr)
 {
-	void **object;
+	void **object = NULL;
 	struct page *new;
+	struct kmem_cache_cpu *c;
+	unsigned long flags;
 
+	local_irq_save(flags);
+	c = get_cpu_slab(s, smp_processor_id());
 	if (!c->page)
+		/* Slab was flushed */
 		goto new_slab;
 
+	object = freelist_get_and_clear(c);
+
 	slab_lock(c->page);
 	if (unlikely(!node_match(c, node)))
 		goto another_slab;
+
+	if (unlikely(!freelist_off_or_empty(object)))
+		goto out_object;
+
 load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
@@ -1484,15 +1518,20 @@ load_freelist:
 		goto debug;
 
 	object = c->page->freelist;
-	c->freelist = object[c->offset];
 	c->page->inuse = s->objects;
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
+out_object:
+	c->freelist = object[c->offset];
 	slab_unlock(c->page);
+out:
+	local_irq_restore(flags);
+	if (unlikely((gfpflags & __GFP_ZERO)))
+		memset(object, 0, s->objsize);
 	return object;
 
 another_slab:
-	deactivate_slab(s, c);
+	deactivate_slab(s, c, object);
 
 new_slab:
 	new = get_partial(s, gfpflags, node);
@@ -1529,17 +1568,18 @@ new_slab:
 		c->page = new;
 		goto load_freelist;
 	}
+	local_irq_restore(flags);
 	return NULL;
 debug:
-	c->freelist = CPU_FREELIST_OFF;
 	object = c->page->freelist;
 	if (!alloc_debug_processing(s, c->page, object, addr))
 		goto another_slab;
 
 	c->page->inuse++;
 	c->page->freelist = object[c->offset];
+	c->node = page_to_nid(c->page);
 	slab_unlock(c->page);
-	return object;
+	goto out;
 }
 
 /*
@@ -1556,26 +1596,29 @@ static void __always_inline *slab_alloc(
 		gfp_t gfpflags, int node, void *addr)
 {
 	void **object;
-	unsigned long flags;
 	struct kmem_cache_cpu *c;
 
-	local_irq_save(flags);
-	c = get_cpu_slab(s, smp_processor_id());
-	if (unlikely(!c->page || !c->freelist ||
-					!node_match(c, node)))
+redo:
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	object = c->freelist;
 
-		object = __slab_alloc(s, gfpflags, node, addr, c);
+	if (unlikely(freelist_off_or_empty(object)))
+		goto slow;
 
-	else {
-		object = c->freelist;
-		c->freelist = object[c->offset];
-	}
-	local_irq_restore(flags);
+	if (unlikely(!node_match(c, node)))
+		goto slow;
 
-	if (unlikely((gfpflags & __GFP_ZERO) && object))
+	if (unlikely(cmpxchg(&c->freelist, object,
+			object[c->offset]) != object))
+		goto redo;
+
+	if (unlikely((gfpflags & __GFP_ZERO)))
 		memset(object, 0, s->objsize);
 
 	return object;
+
+slow:
+	return __slab_alloc(s, gfpflags, node, addr);
 }
 
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
@@ -1601,11 +1644,13 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
  * handling required then we can return immediately.
  */
 static void __slab_free(struct kmem_cache *s, struct page *page,
-				void *x, void *addr, unsigned int offset)
+				void *x, void *addr, int offset)
 {
 	void *prior;
 	void **object = (void *)x;
+	unsigned long flags;
 
+	local_irq_save(flags);
 	slab_lock(page);
 
 	if (unlikely(SlabDebug(page)))
@@ -1631,6 +1676,7 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+	local_irq_restore(flags);
 	return;
 
 slab_empty:
@@ -1642,6 +1688,7 @@ slab_empty:
 
 	slab_unlock(page);
 	discard_slab(s, page);
+	local_irq_restore(flags);
 	return;
 
 debug:
@@ -1665,18 +1712,28 @@ static void __always_inline slab_free(st
 				struct page *page, void *x, void *addr)
 {
 	void **object = (void *)x;
-	unsigned long flags;
+	void **freelist;
 	struct kmem_cache_cpu *c;
 
-	local_irq_save(flags);
-	c = get_cpu_slab(s, smp_processor_id());
-	if (likely(page == c->page && c->freelist != CPU_FREELIST_OFF)) {
-		object[c->offset] = c->freelist;
-		c->freelist = object;
-	} else
-		__slab_free(s, page, x, addr, c->offset);
+redo:
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	freelist = c->freelist;
 
-	local_irq_restore(flags);
+	if (unlikely(page != c->page))
+		goto slow;
+
+	if (unlikely(freelist_off(freelist)))
+		goto slow;
+
+	object[c->offset] = freelist;
+
+	if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
+		goto redo;
+
+	return;
+
+slow:
+	__slab_free(s, page, x, addr, c->offset);
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
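
Not part of the patch: the following is a minimal user-space sketch of
the lockless freelist handling the new fast paths rely on, for readers
who want to see the technique in isolation. fake_cpu, pop_fast,
push_fast and take_freelist are hypothetical stand-ins for
kmem_cache_cpu, the slab_alloc/slab_free fast paths and
freelist_get_and_clear(); GCC __atomic builtins stand in for the
kernel's cmpxchg()/xchg(). It should build with any gcc or clang that
provides the __atomic builtins.

/* freelist_sketch.c - illustration only, NOT part of the patch. */
#include <stdio.h>

/* Sentinel meaning "cpu freelist taken over by the slow path". */
#define FREELIST_OFF	((void **)16)

struct fake_cpu {
	void **freelist;	/* objects linked through their first word */
};

/* Fast-path alloc: pop the first object with a single cmpxchg. */
static void *pop_fast(struct fake_cpu *c)
{
	void **object, **next;

	do {
		object = __atomic_load_n(&c->freelist, __ATOMIC_ACQUIRE);
		if (object == NULL || object == FREELIST_OFF)
			return NULL;		/* kernel: take the slow path */
		next = object[0];
	} while (!__atomic_compare_exchange_n(&c->freelist, &object, next,
				0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
	return object;
}

/* Fast-path free: push the object back with a single cmpxchg. */
static int push_fast(struct fake_cpu *c, void *x)
{
	void **object = x;
	void **freelist;

	do {
		freelist = __atomic_load_n(&c->freelist, __ATOMIC_ACQUIRE);
		if (freelist == FREELIST_OFF)
			return 0;		/* kernel: take the slow path */
		object[0] = freelist;		/* link in front of the list */
	} while (!__atomic_compare_exchange_n(&c->freelist, &freelist, object,
				0, __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE));
	return 1;
}

/*
 * Slow path: take the whole freelist with one atomic xchg, leaving the
 * sentinel behind so concurrent fast paths fall through to the slow
 * path. This mirrors freelist_get_and_clear().
 */
static void **take_freelist(struct fake_cpu *c)
{
	return __atomic_exchange_n(&c->freelist, FREELIST_OFF, __ATOMIC_ACQ_REL);
}

int main(void)
{
	void *slots[2] = { NULL, NULL };	/* two fake objects */
	struct fake_cpu c = { .freelist = NULL };
	void **taken;

	push_fast(&c, &slots[0]);
	push_fast(&c, &slots[1]);
	printf("pop  -> %p (last push was %p)\n",
			pop_fast(&c), (void *)&slots[1]);
	taken = take_freelist(&c);
	printf("take -> %p, fast pop now sees %p (NULL => slow path)\n",
			(void *)taken, pop_fast(&c));
	return 0;
}

The property mirrored here is that a fast path never disables
interrupts and never needs to be pinned to a cpu: if the cmpxchg loses
a race it simply retries against whatever per cpu freelist it now sees,
and the FREELIST_OFF-style sentinel left behind by the xchg diverts
concurrent fast paths into the slow path, which is where the interrupt
disabling and slab locking remain.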