From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:26 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 00/10] SLUB updates against 2.6.21-rc7-mm2

--

From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042155.150222454@sgi.com>
References: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:27 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 01/10] SLUB: Remove duplicate VM_BUG_ON
Content-Disposition: inline; filename=slub_duplicate

Somehow this duplicated check got in during the merge with -mm. Remove it.

Signed-off-by: Christoph Lameter

Index: linux-2.6.21-rc7-mm1/mm/slub.c
===================================================================
--- linux-2.6.21-rc7-mm1.orig/mm/slub.c	2007-04-25 09:48:40.000000000 -0700
+++ linux-2.6.21-rc7-mm1/mm/slub.c	2007-04-25 09:48:47.000000000 -0700
@@ -633,8 +633,6 @@ static void add_full(struct kmem_cache *
 
 	VM_BUG_ON(!irqs_disabled());
 
-	VM_BUG_ON(!irqs_disabled());
-
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
--

From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042155.479289095@sgi.com>
References: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:28 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 02/10] SLUB: Fix sysfs directory handling
Content-Disposition: inline; filename=slub_sysfs_dir_fix

This fixes SLUB's failure to track the names of aliased slabs by changing
the way SLUB manages the files in /sys/slab.

If the slab being operated on is not mergeable (usually the case if we are
debugging) then do not create any aliases. If a conflicting alias exists
then remove it before creating the directory for the unmergeable slab. If
a real slab cache (and not an alias) is already there then we fail, since
that is a genuine duplication of slab cache names. Debugging therefore
still detects duplicate slab names as usual.

If the slab is mergeable then we create a directory with a unique name
derived from the slab size, the slab options and the address of the
kmem_cache structure (for disambiguation). All names referring to the slab
are then created as symlinks to that unique name. The symlinks are not
removed on kmem_cache_destroy() since we only keep a counter of the number
of aliases. A newly created symlink may simply replace an existing one.

This means that any number of slabs can be created with the same name, as
long as they all refer to mergeable caches; doing so only increases the
alias count. So we may fail to detect duplicate slab names, which causes
no actual harm. The duplications are detected as soon as debugging is
enabled, because then we no longer generate symlinks and special unique
names.
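Purely for illustration, the naming scheme can be sketched in a few lines
of user-space C. The flag letters and the ":[flags-]size:0x<address>"
format follow the create_unique_id() added below; the standalone program
and its stand-in F_* flag constants are made up for the example and are
not part of the kernel code.

#include <stdio.h>

/* Illustrative stand-ins for the SLAB_* flags inspected by the real code */
#define F_CACHE_DMA		0x01	/* 'd' */
#define F_RECLAIM_ACCOUNT	0x02	/* 'a' */
#define F_DESTROY_BY_RCU	0x04	/* 'r' */
#define F_RED_ZONE		0x08	/* 'Z' */
#define F_POISON		0x10	/* 'P' */
#define F_STORE_USER		0x20	/* 'U' */

/* Build ":[flags-]size:0x<address>", the unique target name in /sys/slab */
static void unique_id(char *buf, unsigned long flags, int size, void *cache)
{
	char *p = buf;

	*p++ = ':';
	if (flags & F_CACHE_DMA)	*p++ = 'd';
	if (flags & F_RECLAIM_ACCOUNT)	*p++ = 'a';
	if (flags & F_DESTROY_BY_RCU)	*p++ = 'r';
	if (flags & F_RED_ZONE)		*p++ = 'Z';
	if (flags & F_POISON)		*p++ = 'P';
	if (flags & F_STORE_USER)	*p++ = 'U';
	if (p != buf + 1)
		*p++ = '-';
	sprintf(p, "%07d:0x%p", size, cache);
}

int main(void)
{
	char name[64];
	int dummy;

	unique_id(name, F_RED_ZONE | F_POISON, 192, &dummy);
	printf("%s\n", name);	/* e.g. ":ZP-0000192:0x7ffc..." */
	return 0;
}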
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:40:52.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:40:59.000000000 -0700 @@ -3298,16 +3298,68 @@ static struct kset_uevent_ops slab_ueven decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * format + * :[flags-]size:[memory address of kmemcache] + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + BUG_ON(!name); + + *p++ = ':'; + /* + * First flags affecting slabcache operations */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_DESTROY_BY_RCU) + *p++ = 'r';\ + /* Debug flags */ + if (s->flags & SLAB_RED_ZONE) + *p++ = 'Z'; + if (s->flags & SLAB_POISON) + *p++ = 'P'; + if (s->flags & SLAB_STORE_USER) + *p++ = 'U'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p,"%07d:0x%p" ,s->size, s); + BUG_ON(p > name + ID_STR_LENGTH - 1); + return name; +} + static int sysfs_slab_add(struct kmem_cache *s) { int err; + const char *name; if (slab_state < SYSFS) /* Defer until later */ return 0; + if (s->flags & SLUB_NEVER_MERGE) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_subsys.kset.kobj, s->name); + name = s->name; + } else + /* + * Create a unique name for the slab as a target + * for the symlinks. + */ + name = create_unique_id(s); + kobj_set_kset_s(s, slab_subsys); - kobject_set_name(&s->kobj, s->name); + kobject_set_name(&s->kobj, name); kobject_init(&s->kobj); err = kobject_add(&s->kobj); if (err) @@ -3317,6 +3369,10 @@ static int sysfs_slab_add(struct kmem_ca if (err) return err; kobject_uevent(&s->kobj, KOBJ_ADD); + if (!(s->flags & SLUB_NEVER_MERGE)) { + sysfs_slab_alias(s, s->name); + kfree(name); + } return 0; } @@ -3342,9 +3398,14 @@ static int sysfs_slab_alias(struct kmem_ { struct saved_alias *al; - if (slab_state == SYSFS) + if (slab_state == SYSFS) { + /* + * If we have a leftover link then remove it. + */ + sysfs_remove_link(&slab_subsys.kset.kobj, name); return sysfs_create_link(&slab_subsys.kset.kobj, &s->kobj, name); + } al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); if (!al) -- From clameter@sgi.com Thu Apr 26 21:21:55 2007 Message-Id: <20070427042155.693131327@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:29 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 03/10] SLUB: debug printk cleanup Content-Disposition: inline; filename=slub_at_cleanup Set up a new function slab_err in order to report errors consistently. Consistently report corrective actions taken by SLUB by a printk starting with @@@. Fix locations where there is no 0x in front of %p. 
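As a side note, slab_err() is just the usual vsnprintf() pattern for a
printf-style wrapper. A minimal user-space sketch of that pattern follows;
report() and the use of stderr are illustrative only, the real function is
in the diff below.

#include <stdarg.h>
#include <stdio.h>

/* printf-style wrapper: format into a fixed buffer, then emit one line */
static void report(const char *cache, const char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	fprintf(stderr, "*** SLUB %s: %s\n", cache, buf);
}

int main(void)
{
	void *obj = (void *)0x1234;

	report("kmalloc-64", "Object 0x%p already free", obj);
	return 0;
}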
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 20:58:08.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 20:58:23.000000000 -0700 @@ -324,8 +324,8 @@ static void object_err(struct kmem_cache { u8 *addr = page_address(page); - printk(KERN_ERR "*** SLUB: %s in %s@0x%p slab 0x%p\n", - reason, s->name, object, page); + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", object - addr, page->flags, page->inuse, page->freelist); if (object > addr + 16) @@ -335,6 +335,19 @@ static void object_err(struct kmem_cache dump_stack(); } +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +{ + va_list args; + char buf[100]; + + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} + static void init_object(struct kmem_cache *s, void *object, int active) { u8 *p = object; @@ -412,7 +425,7 @@ static int check_valid_pointer(struct km static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { - printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n", + printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", s->name, message, data, from, to - 1); memset(from, data, to - from); } @@ -459,9 +472,7 @@ static int slab_pad_check(struct kmem_ca return 1; if (!check_bytes(p + length, POISON_INUSE, remainder)) { - printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n", - s->name, p); - dump_stack(); + slab_err(s, page, "Padding check failed"); restore_bytes(s, "slab padding", POISON_INUSE, p + length, p + length + remainder); return 0; @@ -547,30 +558,25 @@ static int check_slab(struct kmem_cache VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { - printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p " - "flags=%lx mapping=0x%p count=%d \n", - s->name, page, page->flags, page->mapping, + slab_err(s, page, "Not a valid slab page flags=%lx " + "mapping=0x%p count=%d", page->flags, page->mapping, page_count(page)); return 0; } if (page->offset * sizeof(void *) != s->offset) { - printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p" - " flags=0x%lx mapping=0x%p count=%d\n", - s->name, + slab_err(s, page, "Corrupted offset %lu flags=0x%lx " + "mapping=0x%p count=%d", (unsigned long)(page->offset * sizeof(void *)), - page, page->flags, page->mapping, page_count(page)); - dump_stack(); return 0; } if (page->inuse > s->objects) { - printk(KERN_ERR "SLUB: %s inuse %u > max %u in slab " - "page @0x%p flags=%lx mapping=0x%p count=%d\n", - s->name, page->inuse, s->objects, page, page->flags, + slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " + "mapping=0x%p count=%d", + s->name, page->inuse, s->objects, page->flags, page->mapping, page_count(page)); - dump_stack(); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -599,12 +605,13 @@ static int on_freelist(struct kmem_cache set_freepointer(s, object, NULL); break; } else { - printk(KERN_ERR "SLUB: %s slab 0x%p " - "freepointer 0x%p corrupted.\n", - s->name, page, fp); - dump_stack(); + slab_err(s, page, "Freepointer 0x%p corrupt", + fp); page->freelist = NULL; page->inuse = s->objects; + printk(KERN_ERR "@@@ SLUB %s: Freelist " + "cleared. 
Slab 0x%p\n", + s->name, page); return 0; } break; @@ -615,11 +622,12 @@ static int on_freelist(struct kmem_cache } if (page->inuse != s->objects - nr) { - printk(KERN_ERR "slab %s: page 0x%p wrong object count." - " counter is %d but counted were %d\n", - s->name, page, page->inuse, - s->objects - nr); + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", s, page, page->inuse, + s->objects - nr); page->inuse = s->objects - nr; + printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " + "Slab @0x%p\n", s->name, page); } return search == NULL; } @@ -663,10 +671,7 @@ static int alloc_object_checks(struct km goto bad; if (object && !on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p " - "already allocated.\n", - s->name, object, page); - dump_stack(); + slab_err(s, page, "Object 0x%p already allocated", object); goto bad; } @@ -706,15 +711,12 @@ static int free_object_checks(struct kme goto fail; if (!check_valid_pointer(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p invalid " - "object pointer 0x%p\n", - s->name, page, object); + slab_err(s, page, "Invalid object pointer 0x%p", object); goto fail; } if (on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p object " - "0x%p already free.\n", s->name, page, object); + slab_err(s, page, "Object 0x%p already free", object); goto fail; } @@ -723,24 +725,22 @@ static int free_object_checks(struct kme if (unlikely(s != page->slab)) { if (!PageSlab(page)) - printk(KERN_ERR "slab_free %s size %d: attempt to" - "free object(0x%p) outside of slab.\n", - s->name, s->size, object); + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); else - if (!page->slab) + if (!page->slab) { printk(KERN_ERR - "slab_free : no slab(NULL) for object 0x%p.\n", + "SLUB : no slab for object 0x%p.\n", object); + dump_stack(); + } else - printk(KERN_ERR "slab_free %s(%d): object at 0x%p" - " belongs to slab %s(%d)\n", - s->name, s->size, object, - page->slab->name, page->slab->size); + slab_err(s, page, "object at 0x%p belongs " + "to slab %s", object, page->slab->name); goto fail; } return 1; fail: - dump_stack(); printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", s->name, page, object); return 0; @@ -2479,6 +2479,8 @@ __initcall(cpucache_init); #endif #ifdef SLUB_RESILIENCY_TEST +static unsigned long validate_slab_cache(struct kmem_cache *s); + static void resiliency_test(void) { u8 *p; @@ -2490,7 +2492,7 @@ static void resiliency_test(void) p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->%p\n\n", p + 16); + " 0x12->0x%p\n\n", p + 16); validate_slab_cache(kmalloc_caches + 4); @@ -2498,14 +2500,14 @@ static void resiliency_test(void) p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> %p\n", p); + " 0x34 -> -0x%p\n", p); printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 5); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->%p\n", + printk(KERN_ERR "\n3. 
kmalloc-64: corrupting random byte 0x56->0x%p\n", p); printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 6); @@ -2514,19 +2516,19 @@ static void resiliency_test(void) p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->%p\n\n", p); + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 7); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->%p\n\n", p); + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 8); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->%p\n\n", p); + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 9); } #else @@ -2593,17 +2595,17 @@ static void validate_slab_slab(struct km validate_slab(s, page); slab_unlock(page); } else - printk(KERN_INFO "SLUB: %s Skipped busy slab %p\n", + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { if (!PageError(page)) - printk(KERN_ERR "SLUB: %s PageError not set " - "on slab %p\n", s->name, page); + printk(KERN_ERR "SLUB %s: PageError not set " + "on slab 0x%p\n", s->name, page); } else { if (PageError(page)) - printk(KERN_ERR "SLUB: %s PageError set on " - "slab %p\n", s->name, page); + printk(KERN_ERR "SLUB %s: PageError set on " + "slab 0x%p\n", s->name, page); } } @@ -2620,8 +2622,8 @@ static int validate_slab_node(struct kme count++; } if (count != n->nr_partial) - printk("SLUB: %s %ld partial slabs counted but counter=%ld\n", - s->name, count, n->nr_partial); + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -2631,8 +2633,9 @@ static int validate_slab_node(struct kme count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk("SLUB: %s %ld slabs counted but counter=%ld\n", - s->name, count, atomic_long_read(&n->nr_slabs)); + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042155.948218296@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:30 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 04/10] SLUB: Conform more to SLABs SLAB_HWCACHE_ALIGN behavior Content-Disposition: inline; filename=slub_hwalign Currently SLUB is using a strict L1_CACHE_BYTES alignment if SLAB_HWCACHE_ALIGN is specified. SLAB does not align to a cacheline if the object is smaller than half of a cacheline. Small objects are then aligned by SLAB to a fraction of a cacheline. Make SLUB just forget about the alignment requirement if the object size is less than L1_CACHE_BYTES. It seems that fractional alignments are no good because they grow the object and reduce the object density in a cache line needlessly causing additional cache line fetches. If we are already throwing the user suggestion of a cache line alignment away then lets do the best we can. 
Maybe SLAB_HWCACHE_ALIGN also needs to be tossed given its wishy-washy handling but doing so would require an audit of all kmem_cache_allocs throughout the kernel source. In any case one needs to explictly specify an alignment during kmem_cache_create to either slab allocator in order to ensure that the objects are cacheline aligned. [Patch has a nice memory compaction effect on 32 bit platforms] Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:41:15.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:41:43.000000000 -0700 @@ -1483,9 +1483,19 @@ static int calculate_order(int size) * various ways of specifying it. */ static unsigned long calculate_alignment(unsigned long flags, - unsigned long align) + unsigned long align, unsigned long size) { - if (flags & SLAB_HWCACHE_ALIGN) + /* + * If the user wants hardware cache aligned objects then + * follow that suggestion if the object is sufficiently + * large. + * + * The hardware cache alignment cannot override the + * specified alignment though. If that is greater + * then use it. + */ + if ((flags & SLAB_HWCACHE_ALIGN) && + size > L1_CACHE_BYTES / 2) return max_t(unsigned long, align, L1_CACHE_BYTES); if (align < ARCH_SLAB_MINALIGN) @@ -1674,7 +1684,7 @@ static int calculate_sizes(struct kmem_c * user specified (this is unecessarily complex due to the attempt * to be compatible with SLAB. Should be cleaned up some day). */ - align = calculate_alignment(flags, align); + align = calculate_alignment(flags, align, s->objsize); /* * SLUB stores one object immediately after another beginning from @@ -2251,7 +2261,7 @@ static struct kmem_cache *find_mergeable return NULL; size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align); + align = calculate_alignment(flags, align, size); size = ALIGN(size, align); list_for_each(h, &slab_caches) { -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.196011848@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:31 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 05/10] SLUB: Add MIN_PARTIAL Content-Disposition: inline; filename=slab_partial We leave a mininum of partial slabs on nodes when we search for partial slabs on other node. Define a constant for that value. Then modify slub to keep MIN_PARTIAL slabs around. This avoids bad situations where a function frees the last object in a slab (which results in the page being returned to the page allocator) only to then allocate one again (which requires getting a page back from the page allocator if the partial list was empty). Keeping a couple of slabs on the partial list reduces overhead. Empty slabs are added to the end of the partial list to insure that partially allocated slabs are consumed first (defragmentation). 
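Roughly, the put-back decision then looks like this. This is a simplified
user-space model with made-up toy types, not the kernel code; the real
logic is in putback_slab() in the diff below.

#include <stdbool.h>
#include <stdio.h>

#define MIN_PARTIAL 2	/* minimum number of partial slabs kept per node */

struct toy_slab { int inuse; bool has_free; };
struct toy_node { int nr_partial; };

/* Where does a slab go when the allocator is done with it? */
static const char *putback(struct toy_node *n, struct toy_slab *slab)
{
	if (slab->inuse) {
		if (slab->has_free)
			return "head of partial list";	/* partially full */
		return "full list (when debugging)";
	}
	if (n->nr_partial < MIN_PARTIAL) {
		n->nr_partial++;
		return "tail of partial list";	/* keep empty slab in reserve */
	}
	return "back to the page allocator";
}

int main(void)
{
	struct toy_node node = { .nr_partial = 1 };
	struct toy_slab empty = { .inuse = 0, .has_free = true };

	printf("%s\n", putback(&node, &empty));	/* tail of partial list */
	return 0;
}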
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:41:43.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:41:54.000000000 -0700 @@ -109,6 +109,9 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Mininum number of partial slabs */ +#define MIN_PARTIAL 2 + #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -635,16 +638,8 @@ static int on_freelist(struct kmem_cache /* * Tracking of fully allocated slabs for debugging */ -static void add_full(struct kmem_cache *s, struct page *page) +static void add_full(struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n; - - VM_BUG_ON(!irqs_disabled()); - - if (!(s->flags & SLAB_STORE_USER)) - return; - - n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); list_add(&page->lru, &n->full); spin_unlock(&n->list_lock); @@ -923,10 +918,16 @@ static __always_inline int slab_trylock( /* * Management of partially allocated slabs */ -static void add_partial(struct kmem_cache *s, struct page *page) +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ spin_lock(&n->list_lock); n->nr_partial++; list_add(&page->lru, &n->partial); @@ -1026,7 +1027,7 @@ static struct page *get_any_partial(stru n = get_node(s, zone_to_nid(*z)); if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > 2) { + n->nr_partial > MIN_PARTIAL) { page = get_partial_node(n); if (page) return page; @@ -1060,15 +1061,31 @@ static struct page *get_partial(struct k */ static void putback_slab(struct kmem_cache *s, struct page *page) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + if (page->inuse) { + if (page->freelist) - add_partial(s, page); - else if (PageError(page)) - add_full(s, page); + add_partial(n, page); + else if (PageError(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); slab_unlock(page); + } else { - slab_unlock(page); - discard_slab(s, page); + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty page to the partial slabs in order + * to avoid page allocator overhead. This page needs to + * come after all the others that are not fully empty + * in order to make sure that we do maximum + * defragmentation. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } } } @@ -1326,7 +1343,7 @@ checks_ok: * then add it. 
*/ if (unlikely(!prior)) - add_partial(s, page); + add_partial(get_node(s, page_to_nid(page)), page); out_unlock: slab_unlock(page); @@ -1542,7 +1559,7 @@ static struct kmem_cache_node * __init e kmalloc_caches->node[node] = n; init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); - add_partial(kmalloc_caches, page); + add_partial(n, page); return n; } -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.441242745@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:32 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 06/10] SLUB: Free slabs and sort partial slab lists in kmem_cache_shrink Content-Disposition: inline; filename=slab_shrink_cache At kmem_cache_shrink check if we have any empty slabs on the partial if so then remove them. Also--as an anti-fragmentation measure--sort the partial slabs so that the most fully allocated ones come first and the least allocated last. The next allocations may fill up the nearly full slabs. Having the least allocated slabs last gives them the maximum chance that their remaining objects may be freed. Thus we can hopefully minimize the partial slabs. I think this is the best one can do in terms antifragmentation measures. Real defragmentation (meaning moving objects out of slabs with the least free objects to those that are almost full) can be implemted by reverse scanning through the list produced here but that would mean that we need to provide a callback at slab cache creation that allows the deletion or moving of an object. This will involve slab API changes, so defer for now. Cc: Mel Gorman Signed-off-by: Christoph Lameter --- mm/slub.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 14 deletions(-) Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 20:59:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 20:59:21.000000000 -0700 @@ -109,9 +109,19 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -/* Mininum number of partial slabs */ +/* + * Mininum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ #define MIN_PARTIAL 2 +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in the. + */ +#define MAX_PARTIAL 10 + #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -1915,7 +1925,7 @@ static int kmem_cache_close(struct kmem_ for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); - free_list(s, n, &n->partial); + n->nr_partial -= free_list(s, n, &n->partial); if (atomic_long_read(&n->nr_slabs)) return 1; } @@ -2164,6 +2174,79 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +/* + * kmem_cache_shrink removes empty slabs from the partial lists + * and then sorts the partially allocated slabs by the number + * of items in use. The slabs with the most items in use + * come first. New allocations will remove these from the + * partial list because they are full. The slabs with the + * least items are placed last. If it happens that the objects + * are freed then the page can be returned to the page allocator. 
+ */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_online_node(node) { + n = get_node(s, node); + + if (n->nr_partial <= MIN_PARTIAL) + continue; + + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in + * each slab or free slabs if empty. + * + * Note that concurrent frees may occur while + * we hold the list_lock. page->inuse here is + * the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse) { + list_del(&page->lru); + n->nr_partial--; + discard_slab(s, page); + } else + if (n->nr_partial > MAX_PARTIAL) + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + + if (n->nr_partial <= MAX_PARTIAL) + goto out; + + /* + * Rebuild the partial list with the slabs filled up + * most first and the least used slabs at the end. + */ + for (i = s->objects - 1; i > 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); + + out: + spin_unlock_irqrestore(&n->list_lock, flags); + } + + kfree(slabs_by_inuse); + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + /** * krealloc - reallocate memory. The contents will remain unchanged. * @@ -2409,17 +2492,6 @@ static struct notifier_block __cpuinitda #endif -/*************************************************************** - * Compatiblility definitions - **************************************************************/ - -int kmem_cache_shrink(struct kmem_cache *s) -{ - flush_all(s); - return 0; -} -EXPORT_SYMBOL(kmem_cache_shrink); - #ifdef CONFIG_NUMA /***************************************************************** @@ -3195,6 +3267,25 @@ static ssize_t validate_store(struct kme } SLAB_ATTR(validate); +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) { if (!(s->flags & SLAB_STORE_USER)) @@ -3251,6 +3342,7 @@ static struct attribute * slab_attrs[] = &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, + &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, #ifdef CONFIG_ZONE_DMA -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.668869489@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:33 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 07/10] SLUB: Major slabinfo update Content-Disposition: inline; filename=slub_slabinfo_update Enhancement to slabinfo - Support for slab shrinking (-r option) - Slab summary showing system totals (-T option) - Sync with new form of alias handling - Sort by size, reverse sorting etc (-S -i option) - Alias lookups (-a) - NUMA allocation tables table output (-n option) Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/Documentation/vm/slabinfo.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/Documentation/vm/slabinfo.c 2007-04-26 
20:58:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/Documentation/vm/slabinfo.c 2007-04-26 21:00:24.000000000 -0700 @@ -3,7 +3,7 @@ * * (C) 2007 sgi, Christoph Lameter * - * Compile by doing: + * Compile by: * * gcc -o slabinfo slabinfo.c */ @@ -17,15 +17,47 @@ #include #include +#define MAX_SLABS 500 +#define MAX_ALIASES 500 +#define MAX_NODES 1024 + +struct slabinfo { + char *name; + int alias; + int refs; + int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + int hwcache_align, object_size, objs_per_slab; + int sanity_checks, slab_size, store_user, trace; + int order, poison, reclaim_account, red_zone; + unsigned long partial, objects, slabs; + int numa[MAX_NODES]; + int numa_partial[MAX_NODES]; +} slabinfo[MAX_SLABS]; + +struct aliasinfo { + char *name; + char *ref; + struct slabinfo *slab; +} aliasinfo[MAX_ALIASES]; + +int slabs = 0; +int aliases = 0; +int highest_node = 0; + char buffer[4096]; int show_alias = 0; int show_slab = 0; -int show_parameters = 0; int skip_zero = 1; int show_numa = 0; int show_track = 0; +int show_first_alias = 0; int validate = 0; +int shrink = 0; +int show_inverted = 0; +int show_single_ref = 0; +int show_totals = 0; +int sort_size = 0; int page_size; @@ -47,11 +79,16 @@ void usage(void) "-a|--aliases Show aliases\n" "-h|--help Show usage information\n" "-n|--numa Show NUMA information\n" - "-p|--parameters Show global parameters\n" + "-r|--reduce Shrink slabs\n" "-v|--validate Validate slabs\n" "-t|--tracking Show alloc/free information\n" + "-T|--Totals Show summary information\n" "-s|--slabs Show slabs\n" + "-S|--Size Sort by size\n" "-z|--zero Include empty slabs\n" + "-f|--first-alias Show first alias\n" + "-i|--inverted Inverted list\n" + "-1|--1ref Single reference\n" ); } @@ -86,23 +123,32 @@ unsigned long get_obj(char *name) unsigned long get_obj_and_str(char *name, char **x) { unsigned long result = 0; + char *p; + + *x = NULL; if (!read_obj(name)) { x = NULL; return 0; } - result = strtoul(buffer, x, 10); - while (**x == ' ') - (*x)++; + result = strtoul(buffer, &p, 10); + while (*p == ' ') + p++; + if (*p) + *x = strdup(p); return result; } -void set_obj(char *name, int n) +void set_obj(struct slabinfo *s, char *name, int n) { - FILE *f = fopen(name, "w"); + char x[100]; + + sprintf(x, "%s/%s", s->name, name); + + FILE *f = fopen(x, "w"); if (!f) - fatal("Cannot write to %s\n", name); + fatal("Cannot write to %s\n", x); fprintf(f, "%d\n", n); fclose(f); @@ -143,167 +189,616 @@ int store_size(char *buffer, unsigned lo return n; } -void alias(const char *name) +void decode_numa_list(int *numa, char *t) { - int count; - char *p; - - if (!show_alias) - return; + int node; + int nr; - count = readlink(name, buffer, sizeof(buffer)); + memset(numa, 0, MAX_NODES * sizeof(int)); - if (count < 0) - return; + while (*t == 'N') { + t++; + node = strtoul(t, &t, 10); + if (*t == '=') { + t++; + nr = strtoul(t, &t, 10); + numa[node] = nr; + if (node > highest_node) + highest_node = node; + } + while (*t == ' ') + t++; + } +} - buffer[count] = 0; +char *hackname(struct slabinfo *s) +{ + char *n = s->name; - p = buffer + count; + if (n[0] == ':') { + char *nn = malloc(20); + char *p; + + strncpy(nn, n, 20); + n = nn; + p = n + 4; + while (*p && *p !=':') + p++; + *p = 0; + } + return n; +} - while (p > buffer && p[-1] != '/') - p--; - printf("%-20s -> %s\n", name, p); +void slab_validate(struct slabinfo *s) +{ + set_obj(s, "validate", 1); } -void slab_validate(char *name) +void slab_shrink(struct slabinfo *s) { - set_obj("validate", 1); + set_obj(s, 
"shrink", 1); } int line = 0; void first_line(void) { - printf("Name Objects Objsize Space " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); + printf("Name Objects Objsize Space " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); +} + +/* + * Find the shortest alias of a slab + */ +struct aliasinfo *find_one_alias(struct slabinfo *find) +{ + struct aliasinfo *a; + struct aliasinfo *best = NULL; + + for(a = aliasinfo;a < aliasinfo + aliases; a++) { + if (a->slab == find && + (!best || strlen(best->name) < strlen(a->name))) { + best = a; + if (strncmp(a->name,"kmall", 5) == 0) + return best; + } + } + if (best) + return best; + fatal("Cannot find alias for %s\n", find->name); + return NULL; } -void slab(const char *name) +unsigned long slab_size(struct slabinfo *s) +{ + return s->slabs * (page_size << s->order); +} + + +void slabcache(struct slabinfo *s) { - unsigned long aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; - unsigned long hwcache_align, object_size, objects, objs_per_slab; - unsigned long order, partial, poison, reclaim_account, red_zone; - unsigned long sanity_checks, slab_size, slabs, store_user, trace; char size_str[20]; char dist_str[40]; char flags[20]; char *p = flags; + char *n; - if (!show_slab) + if (skip_zero && !s->slabs) return; - aliases = get_obj("aliases"); - align = get_obj("align"); - cache_dma = get_obj("cache_dma"); - cpu_slabs = get_obj("cpu_slabs"); - destroy_by_rcu = get_obj("destroy_by_rcu"); - hwcache_align = get_obj("hwcache_align"); - object_size = get_obj("object_size"); - objects = get_obj("objects"); - objs_per_slab = get_obj("objs_per_slab"); - order = get_obj("order"); - partial = get_obj("partial"); - poison = get_obj("poison"); - reclaim_account = get_obj("reclaim_account"); - red_zone = get_obj("red_zone"); - sanity_checks = get_obj("sanity_checks"); - slab_size = get_obj("slab_size"); - slabs = get_obj("slabs"); - store_user = get_obj("store_user"); - trace = get_obj("trace"); - - if (skip_zero && !slabs) - return; - - store_size(size_str, slabs * page_size); - sprintf(dist_str,"%lu/%lu/%lu", slabs, partial, cpu_slabs); + store_size(size_str, slab_size(s)); + sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); if (!line++) first_line(); - if (aliases) + if (s->aliases) *p++ = '*'; - if (cache_dma) + if (s->cache_dma) *p++ = 'd'; - if (hwcache_align) + if (s->hwcache_align) *p++ = 'A'; - if (poison) + if (s->poison) *p++ = 'P'; - if (reclaim_account) + if (s->reclaim_account) *p++ = 'a'; - if (red_zone) + if (s->red_zone) *p++ = 'Z'; - if (sanity_checks) + if (s->sanity_checks) *p++ = 'F'; - if (store_user) + if (s->store_user) *p++ = 'U'; - if (trace) + if (s->trace) *p++ = 'T'; *p = 0; - printf("%-20s %8ld %7ld %8s %14s %3ld %1ld %3ld %3ld %s\n", - name, objects, object_size, size_str, dist_str, - objs_per_slab, order, - slabs ? (partial * 100) / slabs : 100, - slabs ? (objects * object_size * 100) / - (slabs * (page_size << order)) : 100, + n = hackname(s); + printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", + n, s->objects, s->object_size, size_str, dist_str, + s->objs_per_slab, s->order, + s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? 
(s->objects * s->object_size * 100) / + (s->slabs * (page_size << s->order)) : 100, flags); } -void slab_numa(const char *name) +void slab_numa(struct slabinfo *s) { - unsigned long slabs; - char *numainfo; + char *n; + int node; - slabs = get_obj_and_str("slabs", &numainfo); + if (!highest_node) + fatal("No NUMA information available.\n"); - if (skip_zero && !slabs) + if (skip_zero && !s->slabs) return; + n = hackname(s); - printf("%-20s %s", name, numainfo); -} + if (!line) { + printf("\nSlab Node "); + for(node = 0; node <= highest_node; node++) + printf(" %4d", node); + printf("\n----------------------"); + for(node = 0; node <= highest_node; node++) + printf("-----"); + printf("\n"); + } + printf("%-21s ", n); + for(node = 0; node <= highest_node; node++) { + char b[20]; -void parameter(const char *name) -{ - if (!show_parameters) - return; + store_size(b, s->numa[node]); + printf(" %4s", b); + } + printf("\n"); + line++; } -void show_tracking(const char *name) +void show_tracking(struct slabinfo *s) { - printf("\n%s: Calls to allocate a slab object\n", name); + printf("\n%s: Calls to allocate a slab object\n", s->name); printf("---------------------------------------------------\n"); if (read_obj("alloc_calls")) printf(buffer); - printf("%s: Calls to free a slab object\n", name); + printf("%s: Calls to free a slab object\n", s->name); printf("-----------------------------------------------\n"); if (read_obj("free_calls")) printf(buffer); } +void totals(void) +{ + struct slabinfo *s; + + int used_slabs = 0; + char b1[20], b2[20], b3[20], b4[20]; + unsigned long long min_objsize = 0, max_objsize = 0, avg_objsize; + unsigned long long min_partial = 0, max_partial = 0, avg_partial, total_partial = 0; + unsigned long long min_slabs = 0, max_slabs = 0, avg_slabs, total_slabs = 0; + unsigned long long min_size = 0, max_size = 0, avg_size, total_size = 0; + unsigned long long min_waste = 0, max_waste = 0, avg_waste, total_waste = 0; + unsigned long long min_objects = 0, max_objects = 0, avg_objects, total_objects = 0; + unsigned long long min_objwaste = 0, max_objwaste = 0, avg_objwaste; + unsigned long long min_used = 0, max_used = 0, avg_used, total_used = 0; + unsigned long min_ppart = 0, max_ppart = 0, avg_ppart, total_ppart = 0; + unsigned long min_partobj = 0, max_partobj = 0, avg_partobj; + unsigned long total_objects_in_partial = 0; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + unsigned long long size; + unsigned long partial; + unsigned long slabs; + unsigned long used; + unsigned long long wasted; + unsigned long long objwaste; + long long objects_in_partial; + unsigned long percentage_partial; + + if (!s->slabs || !s->objects) + continue; + + used_slabs++; + + size = slab_size(s); + partial = s->partial << s->order; + slabs = s->slabs << s->order; + used = s->objects * s->object_size; + wasted = size - used; + objwaste = wasted / s->objects; + + objects_in_partial = s->objects - (s->slabs - s->partial - s ->cpu_slabs) + * s->objs_per_slab; + + if (objects_in_partial < 0) + objects_in_partial = 0; + + percentage_partial = objects_in_partial * 100 / s->objects; + if (percentage_partial > 100) + percentage_partial = 100; + + if (s->object_size < min_objsize || !min_objsize) + min_objsize = s->object_size; + if (partial && (partial < min_partial || !min_partial)) + min_partial = partial; + if (slabs < min_slabs || !min_partial) + min_slabs = slabs; + if (size < min_size) + min_size = size; + if (wasted < min_waste && !min_waste) + min_waste = wasted; + if (objwaste < 
min_objwaste || !min_objwaste) + min_objwaste = objwaste; + if (s->objects < min_objects || !min_objects) + min_objects = s->objects; + if (used < min_used || !min_used) + min_used = used; + if (objects_in_partial < min_partobj || !min_partobj) + min_partobj = objects_in_partial; + if (percentage_partial < min_ppart || !min_ppart) + min_ppart = percentage_partial; + + if (s->object_size > max_objsize) + max_objsize = s->object_size; + if (partial > max_partial) + max_partial = partial; + if (slabs > max_slabs) + max_slabs = slabs; + if (size > max_size) + max_size = size; + if (wasted > max_waste) + max_waste = wasted; + if (objwaste > max_objwaste) + max_objwaste = objwaste; + if (s->objects > max_objects) + max_objects = s->objects; + if (used > max_used) + max_used = used; + if (objects_in_partial > max_partobj) + max_partobj = objects_in_partial; + if (percentage_partial > max_ppart) + max_ppart = percentage_partial; + + total_objects += s->objects; + total_partial += partial; + total_slabs += slabs; + total_used += used; + total_waste += wasted; + total_size += size; + total_ppart += percentage_partial; + total_objects_in_partial += objects_in_partial; + } + + if (!total_objects) { + printf("No objects\n"); + return; + } + if (!used_slabs) { + printf("No slabs\n"); + return; + } + avg_partial = total_partial / used_slabs; + avg_slabs = total_slabs / used_slabs; + avg_waste = total_waste / used_slabs; + avg_size = total_waste / used_slabs; + avg_objects = total_objects / used_slabs; + avg_used = total_used / used_slabs; + avg_ppart = total_ppart / used_slabs; + avg_partobj = total_objects_in_partial / used_slabs; + + avg_objsize = total_used / total_objects; + avg_objwaste = total_waste / total_objects; + + printf("Slabcache Totals\n"); + printf("----------------\n"); + printf("Slabcaches : %3d Aliases : %3d Active: %3d\n", + slabs, aliases, used_slabs); + + store_size(b1, total_used);store_size(b2, total_waste); + store_size(b3, total_waste * 100 / total_used); + printf("Memory used: %6s # Loss : %6s MRatio: %6s%%\n", b1, b2, b3); + + store_size(b1, total_objects);store_size(b2, total_objects_in_partial); + store_size(b3, total_objects_in_partial * 100 / total_objects); + printf("# Objects : %6s # PartObj: %6s ORatio: %6s%%\n", b1, b2, b3); + + printf("\n"); + printf("Per Cache Average Min Max Total\n"); + printf("---------------------------------------------------------\n"); + + store_size(b1, avg_objects);store_size(b2, min_objects); + store_size(b3, max_objects);store_size(b4, total_objects); + printf("# Objects %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, total_slabs); + printf("# Slabs %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_partial);store_size(b2, min_partial); + store_size(b3, max_partial);store_size(b4, total_partial); + printf("# Partial %10s %10s %10s %10s\n", + b1, b2, b3, b4); + store_size(b1, avg_ppart);store_size(b2, min_ppart); + store_size(b3, max_ppart); + printf("%% Partial %10s%% %10s%% %10s%%\n", + b1, b2, b3); + + store_size(b1, avg_size);store_size(b2, min_size); + store_size(b3, max_size);store_size(b4, total_size); + printf("Memory %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_used);store_size(b2, min_used); + store_size(b3, max_used);store_size(b4, total_used); + printf("Used %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, 
total_slabs); + printf("Waste %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + printf("\n"); + printf("Per Object Average Min Max\n"); + printf("---------------------------------------------\n"); + + store_size(b1, avg_objsize);store_size(b2, min_objsize); + store_size(b3, max_objsize); + printf("Size %10s %10s %10s\n", + b1, b2, b3); + + store_size(b1, avg_objwaste);store_size(b2, min_objwaste); + store_size(b3, max_objwaste); + printf("Loss %10s %10s %10s\n", + b1, b2, b3); +} + +void sort_slabs(void) +{ + struct slabinfo *s1,*s2; + + for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { + for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { + int result; + + if (sort_size) + result = slab_size(s1) < slab_size(s2); + else + result = strcasecmp(s1->name, s2->name); + + if (show_inverted) + result = -result; + + if (result > 0) { + struct slabinfo t; + + memcpy(&t, s1, sizeof(struct slabinfo)); + memcpy(s1, s2, sizeof(struct slabinfo)); + memcpy(s2, &t, sizeof(struct slabinfo)); + } + } + } +} + +void sort_aliases(void) +{ + struct aliasinfo *a1,*a2; + + for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { + for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { + char *n1, *n2; + + n1 = a1->name; + n2 = a2->name; + if (show_alias && !show_inverted) { + n1 = a1->ref; + n2 = a2->ref; + } + if (strcasecmp(n1, n2) > 0) { + struct aliasinfo t; + + memcpy(&t, a1, sizeof(struct aliasinfo)); + memcpy(a1, a2, sizeof(struct aliasinfo)); + memcpy(a2, &t, sizeof(struct aliasinfo)); + } + } + } +} + +void link_slabs(void) +{ + struct aliasinfo *a; + struct slabinfo *s; + + for (a = aliasinfo; a < aliasinfo + aliases; a++) { + + for(s = slabinfo; s < slabinfo + slabs; s++) + if (strcmp(a->ref, s->name) == 0) { + a->slab = s; + s->refs++; + break; + } + if (s == slabinfo + slabs) + fatal("Unresolved alias %s\n", a->ref); + } +} + +void alias(void) +{ + struct aliasinfo *a; + char *active = NULL; + + sort_aliases(); + link_slabs(); + + for(a = aliasinfo; a < aliasinfo + aliases; a++) { + + if (!show_single_ref && a->slab->refs == 1) + continue; + + if (!show_inverted) { + if (active) { + if (strcmp(a->slab->name, active) == 0) { + printf(" %s", a->name); + continue; + } + } + printf("\n%-20s <- %s", a->slab->name, a->name); + active = a->slab->name; + } + else + printf("%-20s -> %s\n", a->name, a->slab->name); + } + if (active) + printf("\n"); +} + + +void rename_slabs(void) +{ + struct slabinfo *s; + struct aliasinfo *a; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + if (*s->name != ':') + continue; + + if (s->refs > 1 && !show_first_alias) + continue; + + a = find_one_alias(s); + + s->name = a->name; + } +} + int slab_mismatch(char *slab) { return regexec(&pattern, slab, 0, NULL, 0); } +void read_slab_dir(void) +{ + DIR *dir; + struct dirent *de; + struct slabinfo *slab = slabinfo; + struct aliasinfo *alias = aliasinfo; + char *p; + char *t; + int count; + + dir = opendir("."); + while ((de = readdir(dir))) { + if (de->d_name[0] == '.' 
|| + slab_mismatch(de->d_name)) + continue; + switch (de->d_type) { + case DT_LNK: + alias->name = strdup(de->d_name); + count = readlink(de->d_name, buffer, sizeof(buffer)); + + if (count < 0) + fatal("Cannot read symlink %s\n", de->d_name); + + buffer[count] = 0; + p = buffer + count; + while (p > buffer && p[-1] != '/') + p--; + alias->ref = strdup(p); + alias++; + break; + case DT_DIR: + if (chdir(de->d_name)) + fatal("Unable to access slab %s\n", slab->name); + slab->name = strdup(de->d_name); + slab->alias = 0; + slab->refs = 0; + slab->aliases = get_obj("aliases"); + slab->align = get_obj("align"); + slab->cache_dma = get_obj("cache_dma"); + slab->cpu_slabs = get_obj("cpu_slabs"); + slab->destroy_by_rcu = get_obj("destroy_by_rcu"); + slab->hwcache_align = get_obj("hwcache_align"); + slab->object_size = get_obj("object_size"); + slab->objects = get_obj("objects"); + slab->objs_per_slab = get_obj("objs_per_slab"); + slab->order = get_obj("order"); + slab->partial = get_obj("partial"); + slab->partial = get_obj_and_str("partial", &t); + decode_numa_list(slab->numa_partial, t); + slab->poison = get_obj("poison"); + slab->reclaim_account = get_obj("reclaim_account"); + slab->red_zone = get_obj("red_zone"); + slab->sanity_checks = get_obj("sanity_checks"); + slab->slab_size = get_obj("slab_size"); + slab->slabs = get_obj_and_str("slabs", &t); + decode_numa_list(slab->numa, t); + slab->store_user = get_obj("store_user"); + slab->trace = get_obj("trace"); + chdir(".."); + slab++; + break; + default : + fatal("Unknown file type %lx\n", de->d_type); + } + } + closedir(dir); + slabs = slab - slabinfo; + aliases = alias - aliasinfo; + if (slabs > MAX_SLABS) + fatal("Too many slabs\n"); + if (aliases > MAX_ALIASES) + fatal("Too many aliases\n"); +} + +void output_slabs(void) +{ + struct slabinfo *slab; + + for (slab = slabinfo; slab < slabinfo + slabs; slab++) { + + if (slab->alias) + continue; + + + if (show_numa) + slab_numa(slab); + else + if (show_track) + show_tracking(slab); + else + if (validate) + slab_validate(slab); + else + if (shrink) + slab_shrink(slab); + else { + if (show_slab) + slabcache(slab); + } + } +} + struct option opts[] = { { "aliases", 0, NULL, 'a' }, { "slabs", 0, NULL, 's' }, { "numa", 0, NULL, 'n' }, - { "parameters", 0, NULL, 'p' }, { "zero", 0, NULL, 'z' }, { "help", 0, NULL, 'h' }, { "validate", 0, NULL, 'v' }, + { "first-alias", 0, NULL, 'f' }, + { "reduce", 0, NULL, 'r' }, { "track", 0, NULL, 't'}, + { "inverted", 0, NULL, 'i'}, + { "1ref", 0, NULL, '1'}, { NULL, 0, NULL, 0 } }; int main(int argc, char *argv[]) { - DIR *dir; - struct dirent *de; int c; int err; char *pattern_source; @@ -312,22 +807,31 @@ int main(int argc, char *argv[]) if (chdir("/sys/slab")) fatal("This kernel does not have SLUB support.\n"); - while ((c = getopt_long(argc, argv, "ahtvnpsz", opts, NULL)) != -1) + while ((c = getopt_long(argc, argv, "afhi1nprstvzTS", opts, NULL)) != -1) switch(c) { - case 's': - show_slab = 1; + case '1': + show_single_ref = 1; break; case 'a': show_alias = 1; break; + case 'f': + show_first_alias = 1; + break; + case 'h': + usage(); + return 0; + case 'i': + show_inverted = 1; + break; case 'n': show_numa = 1; break; - case 'p': - show_parameters = 1; + case 'r': + shrink = 1; break; - case 'z': - skip_zero = 0; + case 's': + show_slab = 1; break; case 't': show_track = 1; @@ -335,17 +839,23 @@ int main(int argc, char *argv[]) case 'v': validate = 1; break; - case 'h': - usage(); - return 0; + case 'z': + skip_zero = 0; + break; + case 'T': + show_totals = 1; 
+ break; + case 'S': + sort_size = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); } - if (!show_slab && !show_alias && !show_parameters && !show_track - && !validate) + if (!show_slab && !show_alias && !show_track + && !validate && !shrink) show_slab = 1; if (argc > optind) @@ -357,39 +867,17 @@ int main(int argc, char *argv[]) if (err) fatal("%s: Invalid pattern '%s' code %d\n", argv[0], pattern_source, err); - - dir = opendir("."); - while ((de = readdir(dir))) { - if (de->d_name[0] == '.' || - slab_mismatch(de->d_name)) - continue; - switch (de->d_type) { - case DT_LNK: - alias(de->d_name); - break; - case DT_DIR: - if (chdir(de->d_name)) - fatal("Unable to access slab %s\n", de->d_name); - - if (show_numa) - slab_numa(de->d_name); - else - if (show_track) - show_tracking(de->d_name); - else - if (validate) - slab_validate(de->d_name); - else - slab(de->d_name); - chdir(".."); - break; - case DT_REG: - parameter(de->d_name); - break; - default : - fatal("Unknown file type %lx\n", de->d_type); - } + read_slab_dir(); + if (show_alias) + alias(); + else + if (show_totals) + totals(); + else { + link_slabs(); + rename_slabs(); + sort_slabs(); + output_slabs(); } - closedir(dir); return 0; } -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042156.910075002@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:34 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 08/10] SLUB: Reduce the order of allocations to avoid fragmentation Content-Disposition: inline; filename=slub_i386_no_frag Seems that fragmentation is an important subject. So be safe. If an arch supports 4k page size then assume that defragmentation may be a major problem. Reduce the minimum number of objects in a slab and limit the order of slabs. Be a little bit more lenient for larger page sizes. Change the bootup message of SLUB to show the parameters so that difficulties due to fragmentation are detectable when the boot log is reviewed. Cc: Mel Gorman Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 21:01:28.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 21:02:01.000000000 -0700 @@ -109,6 +109,25 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +#if PAGE_SHIFT <= 12 + +/* + * Small page size. Make sure that we do not fragment memory + */ +#define DEFAULT_MAX_ORDER 1 +#define DEFAULT_MIN_OBJECTS 4 + +#else + +/* + * Large page machines are customarily able to handle larger + * page orders. + */ +#define DEFAULT_MAX_ORDER 2 +#define DEFAULT_MIN_OBJECTS 8 + +#endif + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -1437,13 +1456,13 @@ static struct page *get_object_page(cons * take the list_lock. */ static int slub_min_order; -static int slub_max_order = 4; +static int slub_max_order = DEFAULT_MAX_ORDER; /* * Minimum number of objects per slab. This is necessary in order to * reduce locking overhead. Similar to the queue size in SLAB. */ -static int slub_min_objects = 8; +static int slub_min_objects = DEFAULT_MIN_OBJECTS; /* * Merge control. If this is set then no merging of slab caches will occur. 
@@ -2338,9 +2357,10 @@ void __init kmem_cache_init(void) kmem_size = offsetof(struct kmem_cache, cpu_slab) + nr_cpu_ids * sizeof(struct page *); - printk(KERN_INFO "SLUB: General Slabs=%d, HW alignment=%d, " - "Processors=%d, Nodes=%d\n", + printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " Processors=%d, Nodes=%d\n", KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, + slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042157.173100785@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:35 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 09/10] SLUB: Exploit page mobility to increase allocation order Content-Disposition: inline; filename=slub_i386_mobility If there is page mobility then we can defragment memory. So its possible to use higher order of pages for slab allocations. If the defaults were not overridden set the max order to 4 and guarantee 16 objects per slab. This will put some stress on Mel's antifrag approaches. If these defaults are too large then they should be later reduced. Cc: Mel Gorman Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/include/linux/mmzone.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/linux/mmzone.h 2007-04-26 20:57:58.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/linux/mmzone.h 2007-04-26 21:05:48.000000000 -0700 @@ -25,6 +25,8 @@ #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +extern int page_group_by_mobility_disabled; + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 21:02:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 21:05:48.000000000 -0700 @@ -129,6 +129,13 @@ #endif /* + * If antifragmentation methods are in effect then increase the + * slab sizes to increase performance + */ +#define DEFAULT_ANTIFRAG_MAX_ORDER 4 +#define DEFAULT_ANTIFRAG_MIN_OBJECTS 16 + +/* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ @@ -1450,6 +1457,11 @@ static struct page *get_object_page(cons */ /* + * Set if the user has overridden any of the order related defaults. + */ +static int user_overide; + +/* * Mininum / Maximum order of slab pages. This influences locking overhead * and slab fragmentation. 
A higher order reduces the number of partial slabs * and increases the number of allocations possible without having to @@ -1985,7 +1997,7 @@ static struct kmem_cache *kmalloc_caches static int __init setup_slub_min_order(char *str) { get_option (&str, &slub_min_order); - + user_override = 1; return 1; } @@ -1994,7 +2006,7 @@ __setup("slub_min_order=", setup_slub_mi static int __init setup_slub_max_order(char *str) { get_option (&str, &slub_max_order); - + user_override = 1; return 1; } @@ -2003,7 +2015,7 @@ __setup("slub_max_order=", setup_slub_ma static int __init setup_slub_min_objects(char *str) { get_option (&str, &slub_min_objects); - + user_override = 1; return 1; } @@ -2319,6 +2331,15 @@ void __init kmem_cache_init(void) { int i; + if (!page_group_by_mobility_disabled && !user_override) { + /* + * Antifrag support available. Increase usable + * page order and generate slabs with more objects. + */ + slub_max_order = ANTIFRAG_ORDER; + slub_min_objects = ANTIFRAG_MIN_OBJECTS; + } + #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042157.419317176@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:36 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, William Lee Irwin III Subject: [patch 10/10] SLUB: i386 support Content-Disposition: inline; filename=slub_i386_pgd_slab SLUB cannot run on i386 at this point because i386 uses the page->private and page->index field of slab pages for the pgd cache. Make SLUB run on i386 by replacing the pgd slab cache with a quicklist. Limit the changes as much as possible Leave the improvised linked list in place etc etc. This has been working here for a couple of weeks now. 
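For readers unfamiliar with quicklists, the idea can be mocked up in user
space roughly as follows. This is only a conceptual sketch with invented
names (ql_alloc/ql_free/ql_trim); the calls actually used in the patch are
quicklist_alloc(), quicklist_free() and quicklist_trim(), and their
semantics differ in detail (per-CPU lists, page-sized objects).

#include <stdlib.h>

#define PGD_SIZE 4096

struct ql_node { struct ql_node *next; };

static struct ql_node *ql_head;
static int ql_count;

/* Pop a constructed object off the list, or allocate and construct a new one */
static void *ql_alloc(void (*ctor)(void *))
{
	struct ql_node *n = ql_head;

	if (n) {
		ql_head = n->next;
		ql_count--;
		return n;	/* still constructed from its last use */
	}
	n = malloc(PGD_SIZE);
	if (n && ctor)
		ctor(n);
	return n;
}

/* Return an object to the list without destructing it */
static void ql_free(void *p)
{
	struct ql_node *n = p;

	n->next = ql_head;
	ql_head = n;
	ql_count++;
}

/* Shrink the list when it grows too large, destructing what we drop */
static void ql_trim(int max, void (*dtor)(void *))
{
	while (ql_count > max) {
		struct ql_node *n = ql_head;

		ql_head = n->next;
		ql_count--;
		if (dtor)
			dtor(n);
		free(n);
	}
}

static void pgd_ctor_stub(void *p) { (void)p; /* would prefill kernel mappings */ }

int main(void)
{
	void *pgd = ql_alloc(pgd_ctor_stub);

	ql_free(pgd);		/* stays constructed on the free list */
	ql_trim(25, NULL);	/* analogous to check_pgt_cache() in the patch */
	return 0;
}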
Cc: William Lee Irwin III Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/arch/i386/Kconfig =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/Kconfig 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/Kconfig 2007-04-26 15:23:08.000000000 -0700 @@ -55,6 +55,10 @@ config ZONE_DMA bool default y +config QUICKLIST + bool + default y + config SBUS bool @@ -79,10 +83,6 @@ config ARCH_MAY_HAVE_PC_FDC bool default y -config ARCH_USES_SLAB_PAGE_STRUCT - bool - default y - config DMI bool default y Index: linux-2.6.21-rc7-mm2/arch/i386/kernel/process.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/kernel/process.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/kernel/process.c 2007-04-26 15:23:08.000000000 -0700 @@ -186,6 +186,7 @@ void cpu_idle(void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + check_pgt_cache(); rmb(); idle = pm_idle; Index: linux-2.6.21-rc7-mm2/arch/i386/kernel/smp.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/kernel/smp.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/kernel/smp.c 2007-04-26 15:23:08.000000000 -0700 @@ -429,7 +429,7 @@ void flush_tlb_mm (struct mm_struct * mm } if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - + check_pgt_cache(); preempt_enable(); } Index: linux-2.6.21-rc7-mm2/arch/i386/mm/init.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/mm/init.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/mm/init.c 2007-04-26 15:23:08.000000000 -0700 @@ -752,7 +752,6 @@ int remove_memory(u64 start, u64 size) EXPORT_SYMBOL_GPL(remove_memory); #endif -struct kmem_cache *pgd_cache; struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) @@ -776,12 +775,6 @@ void __init pgtable_cache_init(void) pgd_size = PAGE_SIZE; } } - pgd_cache = kmem_cache_create("pgd", - pgd_size, - pgd_size, - SLAB_PANIC, - pgd_ctor, - (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); } /* Index: linux-2.6.21-rc7-mm2/arch/i386/mm/pgtable.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/mm/pgtable.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/mm/pgtable.c 2007-04-26 15:37:29.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -205,8 +206,6 @@ void pmd_ctor(void *pmd, struct kmem_cac * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. 
* -- wli */ DEFINE_SPINLOCK(pgd_lock); @@ -232,9 +231,11 @@ static inline void pgd_list_del(pgd_t *p set_page_private(next, (unsigned long)pprev); } + + #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_ctor(void *pgd) { unsigned long flags; @@ -256,7 +257,7 @@ void pgd_ctor(void *pgd, struct kmem_cac } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -275,11 +276,12 @@ void pgd_ctor(void *pgd, struct kmem_cac } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ - BUG_ON(SHARED_KERNEL_PMD); + if (SHARED_KERNEL_PMD) + return; paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); @@ -321,7 +323,7 @@ static void pmd_cache_free(pmd_t *pmd, i pgd_t *pgd_alloc(struct mm_struct *mm) { int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); if (PTRS_PER_PMD == 1 || !pgd) return pgd; @@ -344,7 +346,7 @@ out_oom: paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); pmd_cache_free(pmd, i); } - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); return NULL; } @@ -361,5 +363,11 @@ void pgd_free(pgd_t *pgd) pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); } + +void check_pgt_cache(void) +{ + quicklist_trim(0, pgd_dtor, 25, 16); +} + Index: linux-2.6.21-rc7-mm2/include/asm-i386/pgalloc.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/asm-i386/pgalloc.h 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/asm-i386/pgalloc.h 2007-04-26 15:23:08.000000000 -0700 @@ -65,6 +65,4 @@ do { \ #define pud_populate(mm, pmd, pte) BUG() #endif -#define check_pgt_cache() do { } while (0) - #endif /* _I386_PGALLOC_H */ Index: linux-2.6.21-rc7-mm2/include/asm-i386/pgtable.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/asm-i386/pgtable.h 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/asm-i386/pgtable.h 2007-04-26 15:23:08.000000000 -0700 @@ -35,17 +35,16 @@ struct vm_area_struct; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; -extern struct kmem_cache *pgd_cache; extern struct kmem_cache *pmd_cache; extern spinlock_t pgd_lock; extern struct page *pgd_list; +void check_pgt_cache(void); void pmd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_dtor(void *, struct kmem_cache *, unsigned long); void pgtable_cache_init(void); void paging_init(void); + /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the --