SLUB: Support per cpu allocation list of objects independent of slabs.

This patch changes the way we manage per cpu objects: we now manage linked
lists of objects, and these objects are no longer restricted to coming from
a single slab.

This has the following benefits:

- The fastpath for slab_free() can be taken regularly (vs. about 20%-50%
  of the time with the old approach).

- It is possible to put more objects than fit into a single slab onto the
  per cpu allocation list. This allows more frequent use of the slab_alloc
  fast path without the need for larger pages. Fetches of objects via the
  partial lists can be batched, which reduces locking overhead.

- We can manage list sizes per slab cache on SLUB like SLAB does.

Issues that arise because of this:

- The alloc/free fastpaths get slower since we need to update a counter.
  The impact is likely minimal since we increment a counter in an already
  cache hot cacheline. (A simplified sketch of the resulting fast paths
  follows after this list.)

- Further optimizations of the alloc/free paths are not possible since we
  now have to update two variables in the per cpu structure
  (pointer + counter). Mathieu's cmpxchg_local optimization, which brings
  us around 30-50% speed gains in the fast paths for slab_alloc and
  slab_free, will no longer be possible.

- The per slab control of slowpath/fastpath to support container/cgroup
  accounting and access control is also no longer possible. We will likely
  have to add code to the fastpath to check for a container/cgroup
  restriction. With the current approach we can switch individual slabs to
  slow path mode and then put the logic only into the slow path. I favor
  that approach since it is likely that future enhancements may require
  more specialized object handling, and the existing approach allows us to
  keep additional logic out of the fast path for good.

- NUMA localization control is getting problematic. If slab objects only
  come from a specific page then we can check locality against the node
  that we calculated when activating a cpu slab. This no longer works if
  random objects can show up on the per cpu freelists. For NUMA we have to
  add a number of calls to virt_to_page() and page_to_nid() to determine
  NUMA locality. (See the second sketch after this list.)
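To make the pointer + counter fast path concrete, here is a small,
self-contained userspace C model. It is only a sketch of the idea, not the
patch code: struct kmem_cache_cpu_model, alloc_fast() and free_fast() are
made-up names, and locking, interrupt handling and the real slow paths are
omitted.

#include <stdio.h>

/* Illustrative stand-in for the per cpu structure; not the kernel type. */
struct kmem_cache_cpu_model {
	void **freelist;	/* linked list of free objects */
	int objects;		/* number of objects on the list */
	int refill;		/* slow path refills up to this many objects */
	int max;		/* free path drains once this many are queued */
	unsigned int offset;	/* word offset of the link pointer in an object */
};

/* Alloc fast path: pop the first object and decrement the counter. */
static void *alloc_fast(struct kmem_cache_cpu_model *c)
{
	void **object = c->freelist;

	if (!object)
		return NULL;	/* real code would refill via the slow path */

	c->freelist = object[c->offset];
	c->objects--;
	return object;
}

/* Free fast path: push the object, bump the counter, report if a drain is due. */
static int free_fast(struct kmem_cache_cpu_model *c, void *x)
{
	void **object = x;

	object[c->offset] = c->freelist;
	c->freelist = object;
	c->objects++;
	return c->objects >= c->max;	/* real code would drain down to c->refill */
}

int main(void)
{
	static void *space[3][4];	/* three fake objects, each large enough for a pointer */
	struct kmem_cache_cpu_model c = {
		.freelist = NULL, .objects = 0, .refill = 1, .max = 8, .offset = 0,
	};
	int i;

	for (i = 0; i < 3; i++)
		free_fast(&c, space[i]);

	while (c.freelist) {
		void *p = alloc_fast(&c);

		printf("allocated %p, %d objects left\n", p, c.objects);
	}
	return 0;
}

The point of the model is that both fast paths touch only the freelist
pointer and the adjacent counter, which sit in the same cacheline of the
per cpu structure.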
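For the NUMA point, the locality check changes shape: instead of comparing
the requested node against a cached c->node, the node has to be derived
from the object itself (the patch uses page_to_nid(virt_to_page(object))).
A minimal model of that shape, with a made-up my_object_to_nid() standing
in for the kernel helpers:

#include <stdbool.h>
#include <stdio.h>

/* Made-up stand-in for page_to_nid(virt_to_page(object)). */
static int my_object_to_nid(const void *object)
{
	(void)object;
	return 0;	/* pretend every object lives on node 0 */
}

/*
 * Per-object locality test: node == -1 means "any node is acceptable",
 * otherwise the node is derived from the object rather than from a per
 * cpu field, since the list may now mix objects from many slabs.
 */
static bool node_match_model(const void *object, int node)
{
	return node == -1 || my_object_to_nid(object) == node;
}

int main(void)
{
	int x;

	printf("match any node: %d\n", node_match_model(&x, -1));
	printf("match node 1:   %d\n", node_match_model(&x, 1));
	return 0;
}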
Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    5
 mm/slub.c                |  539 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 352 insertions(+), 192 deletions(-)

Index: linux-2.6.23-rc8-mm2/include/linux/slub_def.h
===================================================================
--- linux-2.6.23-rc8-mm2.orig/include/linux/slub_def.h	2007-10-09 13:44:48.000000000 -0700
+++ linux-2.6.23-rc8-mm2/include/linux/slub_def.h	2007-10-09 13:44:56.000000000 -0700
@@ -13,8 +13,9 @@
 struct kmem_cache_cpu {
         void **freelist;
-        struct page *page;
-        int node;
+        int objects;
+        int refill;
+        int max;
         unsigned int offset;
         unsigned int objsize;
 };

Index: linux-2.6.23-rc8-mm2/mm/slub.c
===================================================================
--- linux-2.6.23-rc8-mm2.orig/mm/slub.c	2007-10-09 13:44:48.000000000 -0700
+++ linux-2.6.23-rc8-mm2/mm/slub.c	2007-10-10 10:47:02.000000000 -0700
@@ -877,9 +877,11 @@ bad:
         return 0;
 }

-static int free_debug_processing(struct kmem_cache *s, struct page *page,
+static int free_debug_processing(struct kmem_cache *s,
                                         void *object, void *addr)
 {
+        struct page *page = virt_to_head_page(object);
+
         if (!check_slab(s, page))
                 goto fail;

@@ -1022,10 +1024,10 @@ static inline void setup_object_debug(st
                         struct page *page, void *object) {}

 static inline int alloc_debug_processing(struct kmem_cache *s,
-        struct page *page, void *object, void *addr) { return 0; }
+        void *object, void *addr) { return 0; }

 static inline int free_debug_processing(struct kmem_cache *s,
-        struct page *page, void *object, void *addr) { return 0; }
+        void *object, void *addr) { return 0; }

 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
                         { return 1; }
@@ -1173,6 +1175,8 @@ static void discard_slab(struct kmem_cac
 {
         struct kmem_cache_node *n = get_node(s, page_to_nid(page));

+        WARN_ON(!atomic_long_read(&n->nr_slabs));
+
         atomic_long_dec(&n->nr_slabs);
         reset_page_mapcount(page);
         __ClearPageSlab(page);
@@ -1376,36 +1380,106 @@ static void unfreeze_slab(struct kmem_ca
 }

 /*
- * Remove the cpu slab
+ * Object count was reduced. Unlock page and make sure that the page is on the
+ * right list
  */
-static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static void putback_slab(struct kmem_cache *s, struct page *page, void *prior)
 {
-        struct page *page = c->page;
+        if (unlikely(SlabFrozen(page)))
+                goto out_unlock;
+
+        if (unlikely(!page->inuse)) {
+                /* Page is now empty */
+                if (prior)
+                        /*
+                         * Slab still on the partial list.
+                         */
+                        remove_partial(s, page);
+
+                slab_unlock(page);
+                discard_slab(s, page);
+                return;
+        }
+
         /*
-         * Merge cpu freelist into freelist. Typically we get here
-         * because both freelists are empty. So this is unlikely
-         * to occur.
+         * Objects left in the slab. If it
+         * was not on the partial list before
+         * then add it.
          */
-        while (unlikely(c->freelist)) {
-                void **object;
+        if (unlikely(!prior))
+                add_partial(get_node(s, page_to_nid(page)), page);

-                /* Retrieve object from cpu_freelist */
-                object = c->freelist;
-                c->freelist = c->freelist[c->offset];
+out_unlock:
+        slab_unlock(page);
+}
+
+/*
+ * Free a list of objects and attempt to minimize the lock overhead
+ * while doing so.
+ *
+ * Interrupts are disabled
+ */
+static void free_objects(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                                                        void **list)
+{
+        struct page *page = NULL;
+        struct page *npage;
+        void **object;
+        void **prior = NULL;
+
+        while (list) {
+                object = list;
+                list = object[c->offset];
+
+                npage = virt_to_head_page(object);
+                if (npage != page) {
+                        if (page)
+                                putback_slab(s, page, prior);
+                        /* finish off the old one !*/
+                        page = npage;
+                        slab_lock(page);
+                        prior = page->freelist;
+                }

-                /* And put onto the regular freelist */
                 object[c->offset] = page->freelist;
                 page->freelist = object;
                 page->inuse--;
         }
-        c->page = NULL;
-        unfreeze_slab(s, page);
+
+        if (page)
+                putback_slab(s, page, prior);
+}
+
+/*
+ * Reduce number of objects in a cpu structure
+ */
+static void drain(struct kmem_cache *s, struct kmem_cache_cpu *c, int objects)
+{
+        void **free;
+
+        if (c->objects <= objects)
+                return;
+
+        c->objects = objects;
+        free = c->freelist;
+        if (!objects)
+                c->freelist = NULL;
+        else {
+                /* Keep the first hot objects */
+                void **x = free;
+
+                while (--objects > 0)
+                        x = x[c->offset];
+
+                free = x[c->offset];
+                x[c->offset] = NULL;
+        }
+        free_objects(s, c, free);
 }

 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
-        slab_lock(c->page);
-        deactivate_slab(s, c);
+        drain(s, c, 0);
 }

 /*
@@ -1416,7 +1490,7 @@ static inline void __flush_cpu_slab(stru
 {
         struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

-        if (likely(c && c->page))
+        if (likely(c && c->freelist))
                 flush_slab(s, c);
 }

@@ -1444,16 +1518,131 @@ static void flush_all(struct kmem_cache
  * Check if the objects in a per cpu structure fit numa
  * locality expectations.
  */
-static inline int node_match(struct kmem_cache_cpu *c, int node)
+static inline int node_match(void *object, int node)
 {
 #ifdef CONFIG_NUMA
-        if (node != -1 && c->node != node)
+        if (node != -1 && page_to_nid(virt_to_page(object)) != node)
                 return 0;
 #endif
         return 1;
 }

 /*
+ * Find a new slab.
+ *
+ * Find new slab may reenable interrupts. Thus there is no guarantee that
+ * we are on the same cpu as before when this function returns.
+ */
+static struct page *find_new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+        struct page *page;
+
+        page = get_partial(s, flags, node);
+        if (page)
+                return page;
+
+        if (flags & __GFP_WAIT)
+                local_irq_enable();
+
+        page = new_slab(s, flags, node);
+
+        if (flags & __GFP_WAIT)
+                local_irq_disable();
+
+        if (page)
+                slab_lock(page);
+        return page;
+}
+
+/*
+ * Drain objects that are not on the specified node until we find a fitting
+ * one.
+ */
+void numa_drain(struct kmem_cache *s, struct kmem_cache_cpu *c, int node)
+{
+#ifdef CONFIG_NUMA
+        if (node >= 0) {
+                /* NUMA case. Throw the wrong node objects away */
+                void **free = NULL;
+
+                while (c->freelist) {
+                        void **object = c->freelist;
+
+                        if (page_to_nid(virt_to_page(object)) == node)
+                                break;
+
+                        c->freelist = object[c->offset];
+                        object[c->offset] = free;
+                        free = object;
+                        c->objects--;
+                }
+                if (free)
+                        free_objects(s, c, free);
+        }
+#endif
+}
+
+static struct kmem_cache_cpu * fillup(struct kmem_cache *s,
+        struct kmem_cache_cpu *c, gfp_t gfpflags, int node, int refill)
+{
+        struct page *page;
+
+        while (c->objects < refill) {
+                int new_objects;
+                void **new_freelist;
+
+                page = find_new_slab(s, gfpflags, node);
+
+                /*
+                 * We may have switched cpus in find_new_slab. We merge the
+                 * new objects into whatever processor per cpu list we are on
+                 * right now. The bad side effects are limited to getting too
+                 * many objects onto a per cpu list.
+                 */
+                c = get_cpu_slab(s, smp_processor_id());
+
+                if (!page)
+                        return c;
+
+                /* Extract objects and dispose of the slab */
+                new_objects = s->objects - page->inuse;
+                new_freelist = page->freelist;
+                page->freelist = NULL;
+                page->inuse = s->objects;
+                unfreeze_slab(s, page);
+
+                /*
+                 * We may have to merge two linked lists.
+                 * Find the smaller one.
+                 */
+                if (unlikely(c->objects > new_objects)) {
+                        void **temp = c->freelist;
+
+                        c->freelist = new_freelist;
+                        new_freelist = temp;
+                }
+
+                /*
+                 * If there are objects already in the per cpu list
+                 * then we need to find the end of that list and
+                 * attach the new objects to the end of that list.
+                 */
+                if (unlikely(c->objects)) {
+                        void **p = c->freelist;
+
+                        while (p[c->offset])
+                                p = p[c->offset];
+
+                        p[c->offset] = new_freelist;
+                } else
+                        c->freelist = new_freelist;
+
+                c->objects += new_objects;
+        }
+        return c;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
@@ -1470,88 +1659,59 @@ static inline int node_match(struct kmem
  * And if we were unable to get a new slab from the partial slab lists then
  * we need to allocate a new slab. This is slowest path since we may sleep.
  */
-static void *__slab_alloc(struct kmem_cache *s,
-                gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+static void *__slab_alloc(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                gfp_t gfpflags, int node, void *addr)
 {
+        int refill = c->refill;
         void **object;
-        struct page *new;
-
-        if (!c->page)
-                goto new_slab;
-
-        slab_lock(c->page);
-        if (unlikely(!node_match(c, node)))
-                goto another_slab;
-load_freelist:
-        object = c->page->freelist;
-        if (unlikely(!object))
-                goto another_slab;
-        if (unlikely(SlabDebug(c->page)))
-                goto debug;
-
-        object = c->page->freelist;
-        c->freelist = object[c->offset];
-        c->page->inuse = s->objects;
-        c->page->freelist = NULL;
-        c->node = page_to_nid(c->page);
-        slab_unlock(c->page);
-        return object;
-
-another_slab:
-        deactivate_slab(s, c);
-
-new_slab:
-        new = get_partial(s, gfpflags, node);
-        if (new) {
-                c->page = new;
-                goto load_freelist;
+        /*
+         * If we got here in a NUMA system then the first object on the
+         * freelist does not have the right node.
+         *
+         * Simple approach: Dump all objects from the freelist that
+         * do not fit the requirements. If any are left then use that.
+         */
+        if (unlikely(node >= 0 && c->objects)) {
+                numa_drain(s, c, node);
+                if (c->objects)
+                        goto out;
+                refill = 1;
         }

-        if (gfpflags & __GFP_WAIT)
-                local_irq_enable();
+        /*
+         * Check for unusual situations. These are not bugs but may not
+         * have been tested well yet.
+         */
+        if (c->objects || c->freelist)
+                printk("refill(%s, %p, %x, %d, %p) have=%d freelist=%p\n",
+                        s->name, c, gfpflags, node, addr, c->objects, c->freelist);

-        new = new_slab(s, gfpflags, node);
+        /*
+         * Check for debug mode. If we are in a debug mode then the slab
+         * must be processed one object at a time.
+         */
+        if (unlikely(c->max < 0)) {
+                struct page * page;

-        if (gfpflags & __GFP_WAIT)
-                local_irq_disable();
+                do {
+                        page = find_new_slab(s, gfpflags, node);
+                        object = page->freelist;
+                } while (!alloc_debug_processing(s, page, object, addr));

-        if (new) {
-                c = get_cpu_slab(s, smp_processor_id());
-                if (c->page) {
-                        /*
-                         * Someone else populated the cpu_slab while we
-                         * enabled interrupts, or we have gotten scheduled
-                         * on another cpu. The page may not be on the
-                         * requested node even if __GFP_THISNODE was
-                         * specified. So we need to recheck.
-                         */
-                        if (node_match(c, node)) {
-                                /*
-                                 * Current cpuslab is acceptable and we
-                                 * want the current one since its cache hot
-                                 */
-                                discard_slab(s, new);
-                                slab_lock(c->page);
-                                goto load_freelist;
-                        }
-                        /* New slab does not fit our expectations */
-                        flush_slab(s, c);
-                }
-                slab_lock(new);
-                SetSlabFrozen(new);
-                c->page = new;
-                goto load_freelist;
+                page->inuse++;
+                page->freelist = object[c->offset];
+                unfreeze_slab(s, page);
+                return object;
+        }
+
+        c = fillup(s, c, gfpflags, node, refill);
+out:
+        object = c->freelist;
+        if (object) {
+                c->freelist = c->freelist[c->offset];
+                c->objects--;
         }
-        return NULL;
-debug:
-        object = c->page->freelist;
-        if (!alloc_debug_processing(s, c->page, object, addr))
-                goto another_slab;
-
-        c->page->inuse++;
-        c->page->freelist = object[c->offset];
-        c->node = -1;
-        slab_unlock(c->page);
         return object;
 }

@@ -1574,17 +1734,17 @@ static void __always_inline *slab_alloc(

         local_irq_save(flags);
         c = get_cpu_slab(s, smp_processor_id());
-        if (unlikely(!c->freelist || !node_match(c, node)))
-
-                object = __slab_alloc(s, gfpflags, node, addr, c);
-
+        if (unlikely(!c->freelist || !node_match(c->freelist, node)))
+                object = __slab_alloc(s, c, gfpflags, node, addr);
         else {
+                object = c->freelist;
                 c->freelist = object[c->offset];
+                c->objects--;
         }
         local_irq_restore(flags);

-        if (unlikely((gfpflags & __GFP_ZERO) && object))
+        if (unlikely(gfpflags & __GFP_ZERO) && object)
                 memset(object, 0, c->objsize);

         return object;

@@ -1605,61 +1765,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif

 /*
- * Slow patch handling. This may still be called frequently since objects
- * have a longer lifetime than the cpu slabs in most processing loads.
- *
- * So we still attempt to reduce cache line usage. Just take the slab
- * lock and free the item. If there is no additional partial page
- * handling required then we can return immediately.
+ * Slow path of __slab_free. Implements debugging and draining.
  */
-static void __slab_free(struct kmem_cache *s, struct page *page,
-                                void *x, void *addr, unsigned int offset)
+static void __slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                                void *addr)
 {
-        void *prior;
-        void **object = (void *)x;
-
-        slab_lock(page);
-
-        if (unlikely(SlabDebug(page)))
-                goto debug;
-checks_ok:
-        prior = object[offset] = page->freelist;
-        page->freelist = object;
-        page->inuse--;
-
-        if (unlikely(SlabFrozen(page)))
-                goto out_unlock;
-
-        if (unlikely(!page->inuse))
-                goto slab_empty;
-
-        /*
-         * Objects left in the slab. If it
-         * was not on the partial list before
-         * then add it.
-         */
-        if (unlikely(!prior))
-                add_partial(get_node(s, page_to_nid(page)), page);
-
-out_unlock:
-        slab_unlock(page);
-        return;
-
-slab_empty:
-        if (prior)
-                /*
-                 * Slab still on the partial list.
-                 */
-                remove_partial(s, page);
-
-        slab_unlock(page);
-        discard_slab(s, page);
-        return;
-
-debug:
-        if (!free_debug_processing(s, page, x, addr))
-                goto out_unlock;
-        goto checks_ok;
+        if (unlikely(c->max < 0)) {
+                if (!free_debug_processing(s, c->freelist, addr))
+                        return;
+                drain(s, c, 0);
+        } else
+                drain(s, c, c->refill);
 }

 /*
@@ -1673,8 +1789,8 @@ debug:
  * If fastpath is not possible then fall back to __slab_free where we deal
  * with all sorts of special processing.
  */
-static void __always_inline slab_free(struct kmem_cache *s,
-                        struct page *page, void *x, void *addr)
+static void __always_inline slab_free(struct kmem_cache *s, void *x,
+                        void *addr)
 {
         void **object = (void *)x;
         unsigned long flags;

@@ -1683,22 +1799,18 @@ static void __always_inline slab_free(st
         local_irq_save(flags);
         debug_check_no_locks_freed(object, s->objsize);
         c = get_cpu_slab(s, smp_processor_id());
-        if (likely(page == c->page && c->node >= 0)) {
-                object[c->offset] = c->freelist;
-                c->freelist = object;
-        } else
-                __slab_free(s, page, x, addr, c->offset);
+        object[c->offset] = c->freelist;
+        c->freelist = object;
+        c->objects++;
+        if (c->objects >= c->max)
+                __slab_free(s, c, addr);

         local_irq_restore(flags);
 }

 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
-        struct page *page;
-
-        page = virt_to_head_page(x);
-
-        slab_free(s, page, x, __builtin_return_address(0));
+        slab_free(s, x, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kmem_cache_free);

@@ -1870,11 +1982,18 @@ static unsigned long calculate_alignment
 static void init_kmem_cache_cpu(struct kmem_cache *s,
                         struct kmem_cache_cpu *c)
 {
-        c->page = NULL;
         c->freelist = NULL;
-        c->node = 0;
+        c->objects = 0;
         c->offset = s->offset / sizeof(void *);
         c->objsize = s->objsize;
+        if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE |
+                        SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) {
+                c->refill = 1;
+                c->max = -1;
+        } else {
+                c->refill = max(s->objects / 2, 10);
+                c->max = max(4 * s->objects, 20);
+        }
 }

 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -2622,7 +2741,7 @@ void kfree(const void *x)
                 put_page(page);
                 return;
         }
-        slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
+        slab_free(page->slab, (void *)x, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kfree);

@@ -3410,29 +3529,23 @@ static unsigned long slab_objects(struct
         nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
         per_cpu = nodes + nr_node_ids;

-        for_each_possible_cpu(cpu) {
-                struct page *page;
+        for_each_online_cpu(cpu) {
                 int node;
                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

                 if (!c)
                         continue;

-                page = c->page;
-                node = c->node;
-                if (node < 0)
-                        continue;
-                if (page) {
-                        if (flags & SO_CPU) {
-                                int x = 0;
-
-                                if (flags & SO_OBJECTS)
-                                        x = page->inuse;
-                                else
-                                        x = 1;
-                                total += x;
-                                nodes[node] += x;
-                        }
+                node = cpu_to_node(cpu);
+                if (flags & SO_CPU) {
+                        int x = 0;
+
+                        if (flags & SO_OBJECTS)
+                                x = c->objects;
+                        else
+                                x = 0;
+                        total += x;
+                        nodes[node] += x;
                         per_cpu[node]++;
                 }
         }
@@ -3482,7 +3595,7 @@ static int any_slab_objects(struct kmem_
         for_each_possible_cpu(cpu) {
                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

-                if (c && c->page)
+                if (c && c->freelist)
                         return 1;
         }

@@ -3538,6 +3651,50 @@ static ssize_t objs_per_slab_show(struct
 }
 SLAB_ATTR_RO(objs_per_slab);

+static ssize_t refill_show(struct kmem_cache *s, char *buf)
+{
+        int refill = get_cpu_slab(s, 0)->refill;
+
+        return sprintf(buf, "%d\n", refill);
+}
+static ssize_t refill_store(struct kmem_cache *s,
+                        const char *buf, size_t length)
+{
+        int n = simple_strtoul(buf, NULL, 10);
+        int cpu;
+
+        if (n < 1 || n > 200)
+                return -EINVAL;
+
+        for_each_online_cpu(cpu)
+                get_cpu_slab(s, cpu)->refill = n;
+
+        return length;
+}
+SLAB_ATTR(refill);
+
+static ssize_t max_show(struct kmem_cache *s, char *buf)
+{
+        int max = get_cpu_slab(s, 0)->max;
+
+        return sprintf(buf, "%d\n", max);
+}
+static ssize_t max_store(struct kmem_cache *s,
+                        const char *buf, size_t length)
+{
+        int n = simple_strtoul(buf, NULL, 10);
+        int cpu;
+
+        if (n < 1 || n > 10000)
+                return -EINVAL;
+
+        for_each_online_cpu(cpu)
+                get_cpu_slab(s, cpu)->max = n;
+
+        return length;
+}
+SLAB_ATTR(max);
+
 static ssize_t order_show(struct kmem_cache *s, char *buf)
 {
         return sprintf(buf, "%d\n", s->order);
@@ -3573,11 +3730,11 @@ static ssize_t partial_show(struct kmem_
 }
 SLAB_ATTR_RO(partial);

-static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
+static ssize_t cpu_objects_show(struct kmem_cache *s, char *buf)
 {
-        return slab_objects(s, buf, SO_CPU);
+        return slab_objects(s, buf, SO_CPU|SO_OBJECTS);
 }
-SLAB_ATTR_RO(cpu_slabs);
+SLAB_ATTR_RO(cpu_objects);

 static ssize_t objects_show(struct kmem_cache *s, char *buf)
 {
@@ -3785,9 +3942,11 @@ static struct attribute * slab_attrs[] =
         &objs_per_slab_attr.attr,
         &order_attr.attr,
         &objects_attr.attr,
+        &refill_attr.attr,
+        &max_attr.attr,
         &slabs_attr.attr,
         &partial_attr.attr,
-        &cpu_slabs_attr.attr,
+        &cpu_objects_attr.attr,
         &ctor_attr.attr,
         &aliases_attr.attr,
         &align_attr.attr,