SLUB: Optimize cacheline use for zeroing

We touch a cacheline in the kmem_cache structure for zeroing to get the
size. However, the hot paths in slab_alloc and slab_free do not reference
any other fields in kmem_cache.

Add a new field to kmem_cache_cpu that contains the object size. That
cacheline is already touched in the hot paths, so we save one cacheline
access on every slab_alloc.

We need to update the kmem_cache_cpu object size if an aliasing operation
changes the objsize of a non-debug slab.

Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    1
 mm/slub.c                |   14 ++-
 slub.c                   |  192 +++++++++++++++++++++++++++--------------------
 3 files changed, 124 insertions(+), 83 deletions(-)

Index: linux-2.6.22-rc6-mm1/include/linux/slub_def.h
===================================================================
--- linux-2.6.22-rc6-mm1.orig/include/linux/slub_def.h	2007-07-11 22:04:59.000000000 -0700
+++ linux-2.6.22-rc6-mm1/include/linux/slub_def.h	2007-07-11 22:05:03.000000000 -0700
@@ -16,6 +16,7 @@ struct kmem_cache_cpu {
 	struct page *page;
 	int node;
 	unsigned int offset;
+	unsigned int objsize;
 };
 
 struct kmem_cache_node {
Index: linux-2.6.22-rc6-mm1/mm/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slub.c	2007-07-11 22:04:59.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slub.c	2007-07-11 22:05:03.000000000 -0700
@@ -1571,7 +1571,7 @@ static void __always_inline *slab_alloc(
 	local_irq_restore(flags);
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 }
@@ -1864,8 +1864,9 @@ static void init_kmem_cache_cpu(struct k
 {
 	c->page = NULL;
 	c->freelist = NULL;
-	c->offset = s->offset / sizeof(void *);
 	c->node = 0;
+	c->offset = s->offset / sizeof(void *);
+	c->objsize = s->objsize;
 }
 
 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -3173,12 +3174,21 @@ struct kmem_cache *kmem_cache_create(con
 	down_write(&slub_lock);
 	s = find_mergeable(size, align, flags, ctor, ops);
 	if (s) {
+		int cpu;
+
 		s->refcount++;
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
 		 */
 		s->objsize = max(s->objsize, (int)size);
+
+		/*
+		 * And then we need to update the object size in the
+		 * per cpu structures
+		 */
+		for_each_online_cpu(cpu)
+			get_cpu_slab(s, cpu)->objsize = s->objsize;
 		s->inuse = max_t(int, s->inuse,
 				ALIGN(size, sizeof(void *)));
 		up_write(&slub_lock);
Index: linux-2.6.22-rc6-mm1/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/slub.c	2007-07-11 21:55:22.000000000 -0700
+++ linux-2.6.22-rc6-mm1/slub.c	2007-07-11 22:05:03.000000000 -0700
@@ -138,26 +138,6 @@ static inline void ClearSlabDebug(struct
 }
 
 /*
- * Special value to put onto the freelist to signal
- * that it is not to be used.
- */
-#define CPU_FREELIST_OFF (void **)(16)
-
-static inline int freelist_off(void **x)
-{
-	return x == CPU_FREELIST_OFF;
-}
-
-static inline int freelist_off_or_empty(void *x)
-{
-	return (unsigned long)x <= (unsigned long)CPU_FREELIST_OFF;
-}
-
-static inline void **freelist_get_and_clear(void ***x)
-{
-	return xchg(x, CPU_FREELIST_OFF);
-}
-/*
  * Issues still to be resolved:
  *
  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
@@ -252,6 +232,9 @@ static enum {
 static DECLARE_RWSEM(slub_lock);
 LIST_HEAD(slab_caches);
 
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects = 0;
+
 /*
  * Tracking user of a slab.
  */
@@ -1398,7 +1381,7 @@ static void deactivate_slab(struct kmem_
 	 * because both freelists are empty. So this is unlikely
 	 * to occur.
 	 */
-	while (unlikely(!freelist_off_or_empty(freelist))) {
+	while (unlikely(freelist)) {
 		void **object;
 
 		/* Retrieve object from cpu_freelist */
@@ -1415,7 +1398,7 @@ static void deactivate_slab(struct kmem_
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
-	void **freelist = freelist_get_and_clear(&c->freelist);
+	void **freelist = xchg(&c->freelist, NULL);
 
 	slab_lock(c->page);
 	deactivate_slab(s, c, freelist);
@@ -1489,24 +1472,26 @@ static void *__slab_alloc(struct kmem_ca
 	void **object;
 	struct page *new;
 	struct kmem_cache_cpu *c;
-	void **cpu_freelist = NULL;
+	void **freelist = NULL;
 	unsigned long flags;
 
 	local_irq_save(flags);
-
 	c = get_cpu_slab(s, smp_processor_id());
-
 	if (!c->page)
 		/* Slab was flushed */
 		goto new_slab;
 
-	cpu_freelist = freelist_get_and_clear(&c->freelist);
+	freelist = xchg(&c->freelist, NULL);
 	slab_lock(c->page);
-	if (unlikely(!freelist_off_or_empty(cpu_freelist)))
-		goto mess;
 
 	if (unlikely(!node_match(c, node)))
 		goto another_slab;
+
+	if (unlikely(freelist)) {
+		object = freelist;
+		goto out_object;
+	}
+
 load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
@@ -1515,19 +1500,20 @@ load_freelist:
 	object = c->page->freelist;
-	c->freelist = object[c->offset];
 	c->page->inuse = s->objects;
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
+out_object:
+	c->freelist = object[c->offset];
 	slab_unlock(c->page);
 out:
 	local_irq_restore(flags);
 
 	if (unlikely((gfpflags & __GFP_ZERO)))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 
 another_slab:
-	deactivate_slab(s, c, cpu_freelist);
+	deactivate_slab(s, c, freelist);
 
 new_slab:
 	new = get_partial(s, gfpflags, node);
@@ -1576,21 +1562,6 @@ debug:
 	c->node = page_to_nid(c->page);
 	slab_unlock(c->page);
 	goto out;
-
-mess:
-	/*
-	 * We have switched off the cpu freelist but found that
-	 * a racing process has freed additional objects in the
-	 * meantime.
-	 *
-	 * So leave the cpu freelist off. Return the newly freed
-	 * object and deactivate the slab. This may have just been
-	 * a single object. In that case we return the freed object
-	 * and can release a full slab.
-	 */
-	object = cpu_freelist;
-	deactivate_slab(s, c, object[c->offset]);
-	goto out;
 }
 
 /*
@@ -1613,15 +1584,18 @@ redo:
 
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	object = c->freelist;
-	if (unlikely(freelist_off_or_empty(object) || !node_match(c, node)))
+	if (unlikely(!object))
 		goto slow;
 
-	if (cmpxchg(&c->freelist, object,
-			object[c->offset]) != object)
+	if (unlikely(!node_match(c, node)))
+		goto slow;
+
+	if (unlikely(cmpxchg(&c->freelist, object,
+			object[c->offset]) != object))
 		goto redo;
 
 	if (unlikely((gfpflags & __GFP_ZERO)))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 
@@ -1652,7 +1626,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
  * handling required then we can return immediately.
  */
 static void __slab_free(struct kmem_cache *s, struct page *page,
-				void *x, void *addr, int offset)
+				void *x, void *addr, unsigned int offset)
 {
 	void *prior;
 	void **object = (void *)x;
@@ -1725,15 +1699,24 @@ static void __always_inline slab_free(st
 
 redo:
 	c = get_cpu_slab(s, raw_smp_processor_id());
-	freelist = c->freelist;
-	if (unlikely(page != c->page))
-		goto slow;
-	if (freelist_off(freelist))
+	freelist = c->freelist;
+	/*
+	 * Must read c->freelist before c->page. If the page is
+	 * later changed then the freelist also changes which
+	 * will make the cmpxchg() fail.
+	 *
+	 * deactivate_slab() sets c->page to NULL while taking
+	 * the slab lock which provides the corresponding
+	 * smp_wmb() barriers.
+	 */
+	smp_rmb();
+	if (unlikely(c->page != page))
 		goto slow;
-
 	object[c->offset] = freelist;
+	if (unlikely(!freelist))
+		goto slow;
 
 	if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
 		goto redo;
@@ -1930,9 +1913,10 @@ static void init_kmem_cache_cpu(struct k
 			struct kmem_cache_cpu *c)
 {
 	c->page = NULL;
-	c->freelist = CPU_FREELIST_OFF;
-	c->offset = s->offset / sizeof(void *);
+	c->freelist = NULL;
 	c->node = 0;
+	c->offset = s->offset / sizeof(void *);
+	c->objsize = s->objsize;
 }
 
 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -1941,7 +1925,9 @@ static void init_kmem_cache_node(struct
 	atomic_long_set(&n->nr_slabs, 0);
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
+#ifdef CONFIG_SLUB_DEBUG
 	INIT_LIST_HEAD(&n->full);
+#endif
 }
 
 #ifdef CONFIG_SMP
@@ -1996,7 +1982,7 @@ static void free_kmem_cache_cpu(struct k
 		return;
 	}
 	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
-	per_cpu(kmem_cache_cpu_free, cpu) = c;
+	per_cpu(kmem_cache_cpu_free, cpu) = c;
 }
 
 static void free_kmem_cache_cpus(struct kmem_cache *s)
@@ -2050,7 +2036,7 @@ static void __init init_alloc_cpu(void)
 
 	for_each_online_cpu(cpu)
 		init_alloc_cpu_cpu(cpu);
-}
+ }
 
 #else
 static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
@@ -2058,13 +2044,12 @@ static inline void init_alloc_cpu(void)
 
 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
 {
-	init_kmem_cache_cpu(s, &s->cpu_slab);
+	init_kmem_cache_cpu(s, &s->cpu_slab);
 	return 1;
 }
 #endif
 
 #ifdef CONFIG_NUMA
-
 /*
  * No kmalloc_node yet so do it by hand. We know that this is the first
  * slab on the node for this slabcache. There are no concurrent accesses
@@ -2092,8 +2077,10 @@ static struct kmem_cache_node * __init e
 	page->freelist = get_freepointer(kmalloc_caches, n);
 	page->inuse++;
 	kmalloc_caches->node[node] = n;
+#ifdef CONFIG_SLUB_DEBUG
 	init_object(kmalloc_caches, n, 1);
 	init_tracking(kmalloc_caches, n);
+#endif
 	init_kmem_cache_node(n);
 	atomic_long_inc(&n->nr_slabs);
 	add_partial(n, page);
@@ -2756,7 +2743,9 @@ static unsigned long sort_partial_list(s
 	 * list_lock. page->inuse here is the upper limit.
 	 */
 	list_for_each_entry_safe(page, t, &n->partial, lru) {
-		if (!page->inuse && slab_trylock(page)) {
+		int inuse = page->inuse;
+
+		if (!inuse && slab_trylock(page)) {
 			/*
 			 * Must hold slab lock here because slab_free
 			 * may have freed the last object and be
@@ -2769,7 +2758,7 @@ static unsigned long sort_partial_list(s
 			freed++;
 		} else {
 			list_move(&page->lru,
-				slabs_by_inuse + page->inuse);
+				slabs_by_inuse + inuse);
 		}
 	}
 
@@ -2783,6 +2772,8 @@ static unsigned long sort_partial_list(s
 	return freed;
 }
 
+#define NR_INUSE 40
+
 /*
  * Shrink the slab cache on a particular node of the cache
  */
@@ -2793,6 +2784,9 @@ static unsigned long __kmem_cache_shrink
 	struct page *page, *page2;
 	LIST_HEAD(zaplist);
 	int freed;
+	int inuse;
+	int nr[NR_INUSE] = { 0, };
+	int i;
 
 	spin_lock_irqsave(&n->list_lock, flags);
 	freed = sort_partial_list(s, n, scratch);
@@ -2813,12 +2807,13 @@ static unsigned long __kmem_cache_shrink
 	 */
 	while (n->nr_partial > MAX_PARTIAL) {
 		page = container_of(n->partial.prev, struct page, lru);
+		inuse = page->inuse;
 
 		/*
 		 * We are holding the list_lock so we can only
 		 * trylock the slab
 		 */
-		if (page->inuse > s->objects / 4)
+		if (inuse > s->objects / 4)
 			break;
 
 		if (!slab_trylock(page))
@@ -2828,6 +2823,8 @@ static unsigned long __kmem_cache_shrink
 		n->nr_partial--;
 		SetSlabFrozen(page);
 		slab_unlock(page);
+		if (inuse < NR_INUSE)
+			nr[inuse]++;
 	}
 
 	spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2841,6 +2838,13 @@ static unsigned long __kmem_cache_shrink
 		if (__kmem_cache_vacate(s, page, flags, scratch) == 0)
 			freed++;
 	}
 
+	printk(KERN_INFO "Slab %s: Defrag freed %d pages. PartSlab config=",
+		s->name, freed << s->order);
+
+	for (i = 0; i < NR_INUSE; i++)
+		if (nr[i])
+			printk(" %d=%d", i, nr[i]);
+	printk("\n");
 	return freed;
 }
@@ -2960,7 +2964,7 @@ static unsigned long __kmem_cache_defrag
 						void *scratch)
 {
 	unsigned long capacity;
-	unsigned long objects;
+	unsigned long objects_in_full_slabs;
 	unsigned long ratio;
 	struct kmem_cache_node *n = get_node(s, node);
 
@@ -2971,17 +2975,26 @@ static unsigned long __kmem_cache_defrag
 	if (n->nr_partial <= MAX_PARTIAL)
 		return 0;
 
-	/*
-	 * Calculate usage ratio
-	 */
 	capacity = atomic_long_read(&n->nr_slabs) * s->objects;
-	objects = capacity - n->nr_partial * s->objects + count_partial(n);
-	ratio = objects * 100 / capacity;
+	objects_in_full_slabs =
+		(atomic_long_read(&n->nr_slabs) - n->nr_partial)
+							* s->objects;
+	/*
+	 * Worst case calculation: If we would be over the ratio
+	 * even if all partial slabs would only have one object
+	 * then we can skip the further test that would require a scan
+	 * through all the partial page structs to sum up the actual
+	 * number of objects in the partial slabs.
+	 */
+	ratio = (objects_in_full_slabs + 1 * n->nr_partial) * 100 / capacity;
+	if (ratio > s->defrag_ratio)
+		return 0;
 
 	/*
-	 * If usage ratio is more than required then no
-	 * defragmentation
+	 * Now for the real calculation. If usage ratio is more than required
+	 * then no defragmentation
 	 */
+	ratio = (objects_in_full_slabs + count_partial(n)) * 100 / capacity;
 	if (ratio > s->defrag_ratio)
 		return 0;
 
@@ -2998,6 +3011,11 @@ int kmem_cache_defrag(int node)
 	unsigned long pages = 0;
 	void *scratch;
 
+	scratch = kmalloc(sizeof(struct list_head) * max_defrag_slab_objects,
+								GFP_KERNEL);
+	if (!scratch)
+		return 0;
+
 	/*
 	 * kmem_cache_defrag may be called from the reclaim path which may be
 	 * called for any page allocator alloc. So there is the danger that we
@@ -3016,16 +3034,17 @@ int kmem_cache_defrag(int node)
 
 		if (!s->ops->kick)
 			break;
-		scratch = kmalloc(sizeof(struct list_head) * s->objects,
-								GFP_KERNEL);
+
 		if (node == -1) {
-			for_each_online_node(node)
-				pages += __kmem_cache_defrag(s, node, scratch);
+			int nid;
+
+			for_each_online_node(nid)
+				pages += __kmem_cache_defrag(s, nid, scratch);
 		} else
 			pages += __kmem_cache_defrag(s, node, scratch);
-		kfree(scratch);
 	}
 	up_read(&slub_lock);
+	kfree(scratch);
 	return pages;
 }
 EXPORT_SYMBOL(kmem_cache_defrag);
@@ -3205,12 +3224,21 @@ struct kmem_cache *kmem_cache_create(con
 	down_write(&slub_lock);
 	s = find_mergeable(size, align, flags, ctor, ops);
 	if (s) {
+		int cpu;
+
 		s->refcount++;
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
 		 */
 		s->objsize = max(s->objsize, (int)size);
+
+		/*
+		 * And then we need to update the object size in the
+		 * per cpu structures
+		 */
+		for_each_online_cpu(cpu)
+			get_cpu_slab(s, cpu)->objsize = s->objsize;
 		s->inuse = max_t(int, s->inuse,
 				ALIGN(size, sizeof(void *)));
 		up_write(&slub_lock);
@@ -3229,9 +3257,11 @@ struct kmem_cache *kmem_cache_create(con
 		 * Reclaimable slabs first because we may have
 		 * to scan them repeatedly.
 		 */
-		if (ops->kick)
+		if (ops->kick) {
 			list_add(&s->list, &slab_caches);
-		else
+			if (s->objects > max_defrag_slab_objects)
+				max_defrag_slab_objects = s->objects;
+		} else
 			list_add_tail(&s->list, &slab_caches);
 
 		up_write(&slub_lock);
@@ -3743,8 +3773,8 @@ static unsigned long slab_objects(struct
 	per_cpu = nodes + nr_node_ids;
 
 	for_each_possible_cpu(cpu) {
-		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 		struct page *page;
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 
 		if (!c)
 			continue;
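For readers who want the point of the changelog without the rest of SLUB around it, here is a minimal, self-contained userspace sketch of the idea. It is not kernel code, and none of the names below (toy_cache, toy_cache_cpu, toy_cache_adjust_size, toy_alloc_zeroed) are SLUB APIs; they only mirror the scheme under the assumption of a single "CPU": the zeroing hot path reads the object size from the per-cpu structure it already touches, and an aliasing size adjustment has to propagate the new size into that per-cpu copy, as the for_each_online_cpu() loop in kmem_cache_create() does above.

/*
 * Illustrative userspace sketch only -- not SLUB code. Build with
 * e.g. "gcc -O2 demo.c". All names here are invented for the demo.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_cache_cpu {
	void **freelist;	/* hot: touched on every alloc/free */
	unsigned int offset;
	unsigned int objsize;	/* per-cpu copy of the object size */
};

struct toy_cache {
	unsigned int objsize;		/* authoritative size, cold for the hot path */
	struct toy_cache_cpu cpu_slab;	/* one instance per cpu in the real thing */
};

/* Mirrors the aliasing case: grow objsize and resync the per-cpu copy. */
static void toy_cache_adjust_size(struct toy_cache *s, unsigned int size)
{
	if (size > s->objsize)
		s->objsize = size;
	s->cpu_slab.objsize = s->objsize;
}

/* Zeroing uses c->objsize only, so the toy_cache cacheline stays untouched. */
static void *toy_alloc_zeroed(struct toy_cache_cpu *c)
{
	void *object = malloc(c->objsize);	/* stand-in for the freelist pop */

	if (object)
		memset(object, 0, c->objsize);
	return object;
}

int main(void)
{
	struct toy_cache s = { .objsize = 32 };
	void *p;

	s.cpu_slab.objsize = s.objsize;
	toy_cache_adjust_size(&s, 48);	/* an alias asked for 48 bytes */

	p = toy_alloc_zeroed(&s.cpu_slab);
	printf("zeroed %u bytes via the per-cpu copy\n", s.cpu_slab.objsize);
	free(p);
	return 0;
}

The trade-off is the one the changelog describes: the per-cpu copy can go stale, so any place that changes kmem_cache.objsize for a mergeable, non-debug slab must also update every kmem_cache_cpu, which is why the patch touches kmem_cache_create().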