From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:26 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 00/10] SLUB updates against 2.6.21-rc7-mm2

--

From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042155.150222454@sgi.com>
References: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:27 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 01/10] SLUB: Remove duplicate VM_BUG_ON
Content-Disposition: inline; filename=slub_duplicate

Somehow this duplicated check got in during the merge with -mm. Remove it.

Signed-off-by: Christoph Lameter

Index: linux-2.6.21-rc7-mm1/mm/slub.c
===================================================================
--- linux-2.6.21-rc7-mm1.orig/mm/slub.c	2007-04-25 09:48:40.000000000 -0700
+++ linux-2.6.21-rc7-mm1/mm/slub.c	2007-04-25 09:48:47.000000000 -0700
@@ -633,8 +633,6 @@ static void add_full(struct kmem_cache *
 
 	VM_BUG_ON(!irqs_disabled());
 
-	VM_BUG_ON(!irqs_disabled());
-
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
--

From clameter@sgi.com Thu Apr 26 21:21:55 2007
Message-Id: <20070427042155.479289095@sgi.com>
References: <20070427042126.299876478@sgi.com>
User-Agent: quilt/0.45-1
Date: Thu, 26 Apr 2007 21:21:28 -0700
From: clameter@sgi.com
To: akpm@linux-foundation.org
Cc: linux-mm@kvack.org
Subject: [patch 02/10] SLUB: Fix sysfs directory handling
Content-Disposition: inline; filename=slub_sysfs_dir_fix

This fixes SLUB's failure to track the names of aliased slabs by changing
the way SLUB manages the files in /sys/slab.

If the slab being operated on is not mergeable (usually the case if we are
debugging) then do not create any aliases. If a conflicting alias exists
then remove it before creating the directory for the unmergeable slab. If
a real slab cache (and not an alias) is already there then we fail, since
that is a genuine duplication of slab cache names. Debugging therefore
still detects duplicate slab names as usual.

If the slab is mergeable then we create a directory with a unique name
derived from the slab size, the slab options and the address of the
kmem_cache structure (for disambiguation). All names referring to the slab
are then created as symlinks to that unique name. The symlinks are not
removed on kmem_cache_destroy() since we only keep a counter of the number
of aliases. A newly created symlink may simply replace an existing one.

This means that any number of slabs can be created with the same name, as
long as they all refer to mergeable caches; doing so only increases the
alias count. So we may fail to detect duplicate slab names, which causes
no actual harm. The duplications are detected as soon as debugging is
enabled, because then we no longer generate symlinks and special unique
names.
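Purely for illustration, the naming scheme can be sketched in a few lines
of user-space C. The flag letters and the ":[flags-]size:0x<address>"
format follow the create_unique_id() added below; the standalone program
and its stand-in F_* flag constants are made up for the example and are
not part of the kernel code.

#include <stdio.h>

/* Illustrative stand-ins for the SLAB_* flags inspected by the real code */
#define F_CACHE_DMA		0x01	/* 'd' */
#define F_RECLAIM_ACCOUNT	0x02	/* 'a' */
#define F_DESTROY_BY_RCU	0x04	/* 'r' */
#define F_RED_ZONE		0x08	/* 'Z' */
#define F_POISON		0x10	/* 'P' */
#define F_STORE_USER		0x20	/* 'U' */

/* Build ":[flags-]size:0x<address>", the unique target name in /sys/slab */
static void unique_id(char *buf, unsigned long flags, int size, void *cache)
{
	char *p = buf;

	*p++ = ':';
	if (flags & F_CACHE_DMA)	*p++ = 'd';
	if (flags & F_RECLAIM_ACCOUNT)	*p++ = 'a';
	if (flags & F_DESTROY_BY_RCU)	*p++ = 'r';
	if (flags & F_RED_ZONE)		*p++ = 'Z';
	if (flags & F_POISON)		*p++ = 'P';
	if (flags & F_STORE_USER)	*p++ = 'U';
	if (p != buf + 1)
		*p++ = '-';
	sprintf(p, "%07d:0x%p", size, cache);
}

int main(void)
{
	char name[64];
	int dummy;

	unique_id(name, F_RED_ZONE | F_POISON, 192, &dummy);
	printf("%s\n", name);	/* e.g. ":ZP-0000192:0x7ffc..." */
	return 0;
}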
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:40:52.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:40:59.000000000 -0700 @@ -3298,16 +3298,68 @@ static struct kset_uevent_ops slab_ueven decl_subsys(slab, &slab_ktype, &slab_uevent_ops); +#define ID_STR_LENGTH 64 + +/* Create a unique string id for a slab cache: + * format + * :[flags-]size:[memory address of kmemcache] + */ +static char *create_unique_id(struct kmem_cache *s) +{ + char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL); + char *p = name; + + BUG_ON(!name); + + *p++ = ':'; + /* + * First flags affecting slabcache operations */ + if (s->flags & SLAB_CACHE_DMA) + *p++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *p++ = 'a'; + if (s->flags & SLAB_DESTROY_BY_RCU) + *p++ = 'r';\ + /* Debug flags */ + if (s->flags & SLAB_RED_ZONE) + *p++ = 'Z'; + if (s->flags & SLAB_POISON) + *p++ = 'P'; + if (s->flags & SLAB_STORE_USER) + *p++ = 'U'; + if (p != name + 1) + *p++ = '-'; + p += sprintf(p,"%07d:0x%p" ,s->size, s); + BUG_ON(p > name + ID_STR_LENGTH - 1); + return name; +} + static int sysfs_slab_add(struct kmem_cache *s) { int err; + const char *name; if (slab_state < SYSFS) /* Defer until later */ return 0; + if (s->flags & SLUB_NEVER_MERGE) { + /* + * Slabcache can never be merged so we can use the name proper. + * This is typically the case for debug situations. In that + * case we can catch duplicate names easily. + */ + sysfs_remove_link(&slab_subsys.kset.kobj, s->name); + name = s->name; + } else + /* + * Create a unique name for the slab as a target + * for the symlinks. + */ + name = create_unique_id(s); + kobj_set_kset_s(s, slab_subsys); - kobject_set_name(&s->kobj, s->name); + kobject_set_name(&s->kobj, name); kobject_init(&s->kobj); err = kobject_add(&s->kobj); if (err) @@ -3317,6 +3369,10 @@ static int sysfs_slab_add(struct kmem_ca if (err) return err; kobject_uevent(&s->kobj, KOBJ_ADD); + if (!(s->flags & SLUB_NEVER_MERGE)) { + sysfs_slab_alias(s, s->name); + kfree(name); + } return 0; } @@ -3342,9 +3398,14 @@ static int sysfs_slab_alias(struct kmem_ { struct saved_alias *al; - if (slab_state == SYSFS) + if (slab_state == SYSFS) { + /* + * If we have a leftover link then remove it. + */ + sysfs_remove_link(&slab_subsys.kset.kobj, name); return sysfs_create_link(&slab_subsys.kset.kobj, &s->kobj, name); + } al = kmalloc(sizeof(struct saved_alias), GFP_KERNEL); if (!al) -- From clameter@sgi.com Thu Apr 26 21:21:55 2007 Message-Id: <20070427042155.693131327@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:29 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 03/10] SLUB: debug printk cleanup Content-Disposition: inline; filename=slub_at_cleanup Set up a new function slab_err in order to report errors consistently. Consistently report corrective actions taken by SLUB by a printk starting with @@@. Fix locations where there is no 0x in front of %p. 
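As a side note, slab_err() is just the usual vsnprintf() pattern for a
printf-style wrapper. A minimal user-space sketch of that pattern follows;
report() and the use of stderr are illustrative only, the real function is
in the diff below.

#include <stdarg.h>
#include <stdio.h>

/* printf-style wrapper: format into a fixed buffer, then emit one line */
static void report(const char *cache, const char *fmt, ...)
{
	va_list args;
	char buf[100];

	va_start(args, fmt);
	vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);
	fprintf(stderr, "*** SLUB %s: %s\n", cache, buf);
}

int main(void)
{
	void *obj = (void *)0x1234;

	report("kmalloc-64", "Object 0x%p already free", obj);
	return 0;
}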
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 20:58:08.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 20:58:23.000000000 -0700 @@ -324,8 +324,8 @@ static void object_err(struct kmem_cache { u8 *addr = page_address(page); - printk(KERN_ERR "*** SLUB: %s in %s@0x%p slab 0x%p\n", - reason, s->name, object, page); + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", object - addr, page->flags, page->inuse, page->freelist); if (object > addr + 16) @@ -335,6 +335,19 @@ static void object_err(struct kmem_cache dump_stack(); } +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +{ + va_list args; + char buf[100]; + + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} + static void init_object(struct kmem_cache *s, void *object, int active) { u8 *p = object; @@ -412,7 +425,7 @@ static int check_valid_pointer(struct km static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { - printk(KERN_ERR "@@@ SLUB: %s Restoring %s (0x%x) from 0x%p-0x%p\n", + printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", s->name, message, data, from, to - 1); memset(from, data, to - from); } @@ -459,9 +472,7 @@ static int slab_pad_check(struct kmem_ca return 1; if (!check_bytes(p + length, POISON_INUSE, remainder)) { - printk(KERN_ERR "SLUB: %s slab 0x%p: Padding fails check\n", - s->name, p); - dump_stack(); + slab_err(s, page, "Padding check failed"); restore_bytes(s, "slab padding", POISON_INUSE, p + length, p + length + remainder); return 0; @@ -547,30 +558,25 @@ static int check_slab(struct kmem_cache VM_BUG_ON(!irqs_disabled()); if (!PageSlab(page)) { - printk(KERN_ERR "SLUB: %s Not a valid slab page @0x%p " - "flags=%lx mapping=0x%p count=%d \n", - s->name, page, page->flags, page->mapping, + slab_err(s, page, "Not a valid slab page flags=%lx " + "mapping=0x%p count=%d", page->flags, page->mapping, page_count(page)); return 0; } if (page->offset * sizeof(void *) != s->offset) { - printk(KERN_ERR "SLUB: %s Corrupted offset %lu in slab @0x%p" - " flags=0x%lx mapping=0x%p count=%d\n", - s->name, + slab_err(s, page, "Corrupted offset %lu flags=0x%lx " + "mapping=0x%p count=%d", (unsigned long)(page->offset * sizeof(void *)), - page, page->flags, page->mapping, page_count(page)); - dump_stack(); return 0; } if (page->inuse > s->objects) { - printk(KERN_ERR "SLUB: %s inuse %u > max %u in slab " - "page @0x%p flags=%lx mapping=0x%p count=%d\n", - s->name, page->inuse, s->objects, page, page->flags, + slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " + "mapping=0x%p count=%d", + s->name, page->inuse, s->objects, page->flags, page->mapping, page_count(page)); - dump_stack(); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -599,12 +605,13 @@ static int on_freelist(struct kmem_cache set_freepointer(s, object, NULL); break; } else { - printk(KERN_ERR "SLUB: %s slab 0x%p " - "freepointer 0x%p corrupted.\n", - s->name, page, fp); - dump_stack(); + slab_err(s, page, "Freepointer 0x%p corrupt", + fp); page->freelist = NULL; page->inuse = s->objects; + printk(KERN_ERR "@@@ SLUB %s: Freelist " + "cleared. 
Slab 0x%p\n", + s->name, page); return 0; } break; @@ -615,11 +622,12 @@ static int on_freelist(struct kmem_cache } if (page->inuse != s->objects - nr) { - printk(KERN_ERR "slab %s: page 0x%p wrong object count." - " counter is %d but counted were %d\n", - s->name, page, page->inuse, - s->objects - nr); + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", s, page, page->inuse, + s->objects - nr); page->inuse = s->objects - nr; + printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " + "Slab @0x%p\n", s->name, page); } return search == NULL; } @@ -663,10 +671,7 @@ static int alloc_object_checks(struct km goto bad; if (object && !on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s Object 0x%p@0x%p " - "already allocated.\n", - s->name, object, page); - dump_stack(); + slab_err(s, page, "Object 0x%p already allocated", object); goto bad; } @@ -706,15 +711,12 @@ static int free_object_checks(struct kme goto fail; if (!check_valid_pointer(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p invalid " - "object pointer 0x%p\n", - s->name, page, object); + slab_err(s, page, "Invalid object pointer 0x%p", object); goto fail; } if (on_freelist(s, page, object)) { - printk(KERN_ERR "SLUB: %s slab 0x%p object " - "0x%p already free.\n", s->name, page, object); + slab_err(s, page, "Object 0x%p already free", object); goto fail; } @@ -723,24 +725,22 @@ static int free_object_checks(struct kme if (unlikely(s != page->slab)) { if (!PageSlab(page)) - printk(KERN_ERR "slab_free %s size %d: attempt to" - "free object(0x%p) outside of slab.\n", - s->name, s->size, object); + slab_err(s, page, "Attempt to free object(0x%p) " + "outside of slab", object); else - if (!page->slab) + if (!page->slab) { printk(KERN_ERR - "slab_free : no slab(NULL) for object 0x%p.\n", + "SLUB : no slab for object 0x%p.\n", object); + dump_stack(); + } else - printk(KERN_ERR "slab_free %s(%d): object at 0x%p" - " belongs to slab %s(%d)\n", - s->name, s->size, object, - page->slab->name, page->slab->size); + slab_err(s, page, "object at 0x%p belongs " + "to slab %s", object, page->slab->name); goto fail; } return 1; fail: - dump_stack(); printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", s->name, page, object); return 0; @@ -2479,6 +2479,8 @@ __initcall(cpucache_init); #endif #ifdef SLUB_RESILIENCY_TEST +static unsigned long validate_slab_cache(struct kmem_cache *s); + static void resiliency_test(void) { u8 *p; @@ -2490,7 +2492,7 @@ static void resiliency_test(void) p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->%p\n\n", p + 16); + " 0x12->0x%p\n\n", p + 16); validate_slab_cache(kmalloc_caches + 4); @@ -2498,14 +2500,14 @@ static void resiliency_test(void) p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> %p\n", p); + " 0x34 -> -0x%p\n", p); printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 5); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->%p\n", + printk(KERN_ERR "\n3. 
kmalloc-64: corrupting random byte 0x56->0x%p\n", p); printk(KERN_ERR "If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches + 6); @@ -2514,19 +2516,19 @@ static void resiliency_test(void) p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->%p\n\n", p); + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 7); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->%p\n\n", p); + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 8); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->%p\n\n", p); + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches + 9); } #else @@ -2593,17 +2595,17 @@ static void validate_slab_slab(struct km validate_slab(s, page); slab_unlock(page); } else - printk(KERN_INFO "SLUB: %s Skipped busy slab %p\n", + printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); if (s->flags & DEBUG_DEFAULT_FLAGS) { if (!PageError(page)) - printk(KERN_ERR "SLUB: %s PageError not set " - "on slab %p\n", s->name, page); + printk(KERN_ERR "SLUB %s: PageError not set " + "on slab 0x%p\n", s->name, page); } else { if (PageError(page)) - printk(KERN_ERR "SLUB: %s PageError set on " - "slab %p\n", s->name, page); + printk(KERN_ERR "SLUB %s: PageError set on " + "slab 0x%p\n", s->name, page); } } @@ -2620,8 +2622,8 @@ static int validate_slab_node(struct kme count++; } if (count != n->nr_partial) - printk("SLUB: %s %ld partial slabs counted but counter=%ld\n", - s->name, count, n->nr_partial); + printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " + "counter=%ld\n", s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -2631,8 +2633,9 @@ static int validate_slab_node(struct kme count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk("SLUB: %s %ld slabs counted but counter=%ld\n", - s->name, count, atomic_long_read(&n->nr_slabs)); + printk(KERN_ERR "SLUB: %s %ld slabs counted but " + "counter=%ld\n", s->name, count, + atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042155.948218296@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:30 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 04/10] SLUB: Conform more to SLABs SLAB_HWCACHE_ALIGN behavior Content-Disposition: inline; filename=slub_hwalign Currently SLUB is using a strict L1_CACHE_BYTES alignment if SLAB_HWCACHE_ALIGN is specified. SLAB does not align to a cacheline if the object is smaller than half of a cacheline. Small objects are then aligned by SLAB to a fraction of a cacheline. Make SLUB just forget about the alignment requirement if the object size is less than L1_CACHE_BYTES. It seems that fractional alignments are no good because they grow the object and reduce the object density in a cache line needlessly causing additional cache line fetches. If we are already throwing the user suggestion of a cache line alignment away then lets do the best we can. 
Maybe SLAB_HWCACHE_ALIGN also needs to be tossed given its wishy-washy handling but doing so would require an audit of all kmem_cache_allocs throughout the kernel source. In any case one needs to explictly specify an alignment during kmem_cache_create to either slab allocator in order to ensure that the objects are cacheline aligned. [Patch has a nice memory compaction effect on 32 bit platforms] Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:41:15.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:41:43.000000000 -0700 @@ -1483,9 +1483,19 @@ static int calculate_order(int size) * various ways of specifying it. */ static unsigned long calculate_alignment(unsigned long flags, - unsigned long align) + unsigned long align, unsigned long size) { - if (flags & SLAB_HWCACHE_ALIGN) + /* + * If the user wants hardware cache aligned objects then + * follow that suggestion if the object is sufficiently + * large. + * + * The hardware cache alignment cannot override the + * specified alignment though. If that is greater + * then use it. + */ + if ((flags & SLAB_HWCACHE_ALIGN) && + size > L1_CACHE_BYTES / 2) return max_t(unsigned long, align, L1_CACHE_BYTES); if (align < ARCH_SLAB_MINALIGN) @@ -1674,7 +1684,7 @@ static int calculate_sizes(struct kmem_c * user specified (this is unecessarily complex due to the attempt * to be compatible with SLAB. Should be cleaned up some day). */ - align = calculate_alignment(flags, align); + align = calculate_alignment(flags, align, s->objsize); /* * SLUB stores one object immediately after another beginning from @@ -2251,7 +2261,7 @@ static struct kmem_cache *find_mergeable return NULL; size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align); + align = calculate_alignment(flags, align, size); size = ALIGN(size, align); list_for_each(h, &slab_caches) { -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.196011848@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:31 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 05/10] SLUB: Add MIN_PARTIAL Content-Disposition: inline; filename=slab_partial We leave a mininum of partial slabs on nodes when we search for partial slabs on other node. Define a constant for that value. Then modify slub to keep MIN_PARTIAL slabs around. This avoids bad situations where a function frees the last object in a slab (which results in the page being returned to the page allocator) only to then allocate one again (which requires getting a page back from the page allocator if the partial list was empty). Keeping a couple of slabs on the partial list reduces overhead. Empty slabs are added to the end of the partial list to insure that partially allocated slabs are consumed first (defragmentation). 
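Roughly, the put-back decision then looks like this. This is a simplified
user-space model with made-up toy types, not the kernel code; the real
logic is in putback_slab() in the diff below.

#include <stdbool.h>
#include <stdio.h>

#define MIN_PARTIAL 2	/* minimum number of partial slabs kept per node */

struct toy_slab { int inuse; bool has_free; };
struct toy_node { int nr_partial; };

/* Where does a slab go when the allocator is done with it? */
static const char *putback(struct toy_node *n, struct toy_slab *slab)
{
	if (slab->inuse) {
		if (slab->has_free)
			return "head of partial list";	/* partially full */
		return "full list (when debugging)";
	}
	if (n->nr_partial < MIN_PARTIAL) {
		n->nr_partial++;
		return "tail of partial list";	/* keep empty slab in reserve */
	}
	return "back to the page allocator";
}

int main(void)
{
	struct toy_node node = { .nr_partial = 1 };
	struct toy_slab empty = { .inuse = 0, .has_free = true };

	printf("%s\n", putback(&node, &empty));	/* tail of partial list */
	return 0;
}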
Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 11:41:43.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 11:41:54.000000000 -0700 @@ -109,6 +109,9 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +/* Mininum number of partial slabs */ +#define MIN_PARTIAL 2 + #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -635,16 +638,8 @@ static int on_freelist(struct kmem_cache /* * Tracking of fully allocated slabs for debugging */ -static void add_full(struct kmem_cache *s, struct page *page) +static void add_full(struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n; - - VM_BUG_ON(!irqs_disabled()); - - if (!(s->flags & SLAB_STORE_USER)) - return; - - n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); list_add(&page->lru, &n->full); spin_unlock(&n->list_lock); @@ -923,10 +918,16 @@ static __always_inline int slab_trylock( /* * Management of partially allocated slabs */ -static void add_partial(struct kmem_cache *s, struct page *page) +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ spin_lock(&n->list_lock); n->nr_partial++; list_add(&page->lru, &n->partial); @@ -1026,7 +1027,7 @@ static struct page *get_any_partial(stru n = get_node(s, zone_to_nid(*z)); if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > 2) { + n->nr_partial > MIN_PARTIAL) { page = get_partial_node(n); if (page) return page; @@ -1060,15 +1061,31 @@ static struct page *get_partial(struct k */ static void putback_slab(struct kmem_cache *s, struct page *page) { + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + if (page->inuse) { + if (page->freelist) - add_partial(s, page); - else if (PageError(page)) - add_full(s, page); + add_partial(n, page); + else if (PageError(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); slab_unlock(page); + } else { - slab_unlock(page); - discard_slab(s, page); + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty page to the partial slabs in order + * to avoid page allocator overhead. This page needs to + * come after all the others that are not fully empty + * in order to make sure that we do maximum + * defragmentation. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } } } @@ -1326,7 +1343,7 @@ checks_ok: * then add it. 
*/ if (unlikely(!prior)) - add_partial(s, page); + add_partial(get_node(s, page_to_nid(page)), page); out_unlock: slab_unlock(page); @@ -1542,7 +1559,7 @@ static struct kmem_cache_node * __init e kmalloc_caches->node[node] = n; init_kmem_cache_node(n); atomic_long_inc(&n->nr_slabs); - add_partial(kmalloc_caches, page); + add_partial(n, page); return n; } -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.441242745@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:32 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 06/10] SLUB: Free slabs and sort partial slab lists in kmem_cache_shrink Content-Disposition: inline; filename=slab_shrink_cache At kmem_cache_shrink check if we have any empty slabs on the partial if so then remove them. Also--as an anti-fragmentation measure--sort the partial slabs so that the most fully allocated ones come first and the least allocated last. The next allocations may fill up the nearly full slabs. Having the least allocated slabs last gives them the maximum chance that their remaining objects may be freed. Thus we can hopefully minimize the partial slabs. I think this is the best one can do in terms antifragmentation measures. Real defragmentation (meaning moving objects out of slabs with the least free objects to those that are almost full) can be implemted by reverse scanning through the list produced here but that would mean that we need to provide a callback at slab cache creation that allows the deletion or moving of an object. This will involve slab API changes, so defer for now. Cc: Mel Gorman Signed-off-by: Christoph Lameter --- mm/slub.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 104 insertions(+), 14 deletions(-) Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 20:59:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 20:59:21.000000000 -0700 @@ -109,9 +109,19 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST -/* Mininum number of partial slabs */ +/* + * Mininum number of partial slabs. These will be left on the partial + * lists even if they are empty. kmem_cache_shrink may reclaim them. + */ #define MIN_PARTIAL 2 +/* + * Maximum number of desirable partial slabs. + * The existence of more partial slabs makes kmem_cache_shrink + * sort the partial list by the number of objects in the. + */ +#define MAX_PARTIAL 10 + #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ SLAB_POISON | SLAB_STORE_USER) /* @@ -1915,7 +1925,7 @@ static int kmem_cache_close(struct kmem_ for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); - free_list(s, n, &n->partial); + n->nr_partial -= free_list(s, n, &n->partial); if (atomic_long_read(&n->nr_slabs)) return 1; } @@ -2164,6 +2174,79 @@ void kfree(const void *x) } EXPORT_SYMBOL(kfree); +/* + * kmem_cache_shrink removes empty slabs from the partial lists + * and then sorts the partially allocated slabs by the number + * of items in use. The slabs with the most items in use + * come first. New allocations will remove these from the + * partial list because they are full. The slabs with the + * least items are placed last. If it happens that the objects + * are freed then the page can be returned to the page allocator. 
+ */ +int kmem_cache_shrink(struct kmem_cache *s) +{ + int node; + int i; + struct kmem_cache_node *n; + struct page *page; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_online_node(node) { + n = get_node(s, node); + + if (n->nr_partial <= MIN_PARTIAL) + continue; + + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in + * each slab or free slabs if empty. + * + * Note that concurrent frees may occur while + * we hold the list_lock. page->inuse here is + * the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + if (!page->inuse) { + list_del(&page->lru); + n->nr_partial--; + discard_slab(s, page); + } else + if (n->nr_partial > MAX_PARTIAL) + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + + if (n->nr_partial <= MAX_PARTIAL) + goto out; + + /* + * Rebuild the partial list with the slabs filled up + * most first and the least used slabs at the end. + */ + for (i = s->objects - 1; i > 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); + + out: + spin_unlock_irqrestore(&n->list_lock, flags); + } + + kfree(slabs_by_inuse); + return 0; +} +EXPORT_SYMBOL(kmem_cache_shrink); + /** * krealloc - reallocate memory. The contents will remain unchanged. * @@ -2409,17 +2492,6 @@ static struct notifier_block __cpuinitda #endif -/*************************************************************** - * Compatiblility definitions - **************************************************************/ - -int kmem_cache_shrink(struct kmem_cache *s) -{ - flush_all(s); - return 0; -} -EXPORT_SYMBOL(kmem_cache_shrink); - #ifdef CONFIG_NUMA /***************************************************************** @@ -3195,6 +3267,25 @@ static ssize_t validate_store(struct kme } SLAB_ATTR(validate); +static ssize_t shrink_show(struct kmem_cache *s, char *buf) +{ + return 0; +} + +static ssize_t shrink_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (buf[0] == '1') { + int rc = kmem_cache_shrink(s); + + if (rc) + return rc; + } else + return -EINVAL; + return length; +} +SLAB_ATTR(shrink); + static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) { if (!(s->flags & SLAB_STORE_USER)) @@ -3251,6 +3342,7 @@ static struct attribute * slab_attrs[] = &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, + &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, #ifdef CONFIG_ZONE_DMA -- From clameter@sgi.com Thu Apr 26 21:21:56 2007 Message-Id: <20070427042156.668869489@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:33 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org Subject: [patch 07/10] SLUB: Major slabinfo update Content-Disposition: inline; filename=slub_slabinfo_update Enhancement to slabinfo - Support for slab shrinking (-r option) - Slab summary showing system totals (-T option) - Sync with new form of alias handling - Sort by size, reverse sorting etc (-S -i option) - Alias lookups (-a) - NUMA allocation tables table output (-n option) Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/Documentation/vm/slabinfo.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/Documentation/vm/slabinfo.c 2007-04-26 
20:58:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/Documentation/vm/slabinfo.c 2007-04-26 21:00:24.000000000 -0700 @@ -3,7 +3,7 @@ * * (C) 2007 sgi, Christoph Lameter * - * Compile by doing: + * Compile by: * * gcc -o slabinfo slabinfo.c */ @@ -17,15 +17,47 @@ #include #include +#define MAX_SLABS 500 +#define MAX_ALIASES 500 +#define MAX_NODES 1024 + +struct slabinfo { + char *name; + int alias; + int refs; + int aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; + int hwcache_align, object_size, objs_per_slab; + int sanity_checks, slab_size, store_user, trace; + int order, poison, reclaim_account, red_zone; + unsigned long partial, objects, slabs; + int numa[MAX_NODES]; + int numa_partial[MAX_NODES]; +} slabinfo[MAX_SLABS]; + +struct aliasinfo { + char *name; + char *ref; + struct slabinfo *slab; +} aliasinfo[MAX_ALIASES]; + +int slabs = 0; +int aliases = 0; +int highest_node = 0; + char buffer[4096]; int show_alias = 0; int show_slab = 0; -int show_parameters = 0; int skip_zero = 1; int show_numa = 0; int show_track = 0; +int show_first_alias = 0; int validate = 0; +int shrink = 0; +int show_inverted = 0; +int show_single_ref = 0; +int show_totals = 0; +int sort_size = 0; int page_size; @@ -47,11 +79,16 @@ void usage(void) "-a|--aliases Show aliases\n" "-h|--help Show usage information\n" "-n|--numa Show NUMA information\n" - "-p|--parameters Show global parameters\n" + "-r|--reduce Shrink slabs\n" "-v|--validate Validate slabs\n" "-t|--tracking Show alloc/free information\n" + "-T|--Totals Show summary information\n" "-s|--slabs Show slabs\n" + "-S|--Size Sort by size\n" "-z|--zero Include empty slabs\n" + "-f|--first-alias Show first alias\n" + "-i|--inverted Inverted list\n" + "-1|--1ref Single reference\n" ); } @@ -86,23 +123,32 @@ unsigned long get_obj(char *name) unsigned long get_obj_and_str(char *name, char **x) { unsigned long result = 0; + char *p; + + *x = NULL; if (!read_obj(name)) { x = NULL; return 0; } - result = strtoul(buffer, x, 10); - while (**x == ' ') - (*x)++; + result = strtoul(buffer, &p, 10); + while (*p == ' ') + p++; + if (*p) + *x = strdup(p); return result; } -void set_obj(char *name, int n) +void set_obj(struct slabinfo *s, char *name, int n) { - FILE *f = fopen(name, "w"); + char x[100]; + + sprintf(x, "%s/%s", s->name, name); + + FILE *f = fopen(x, "w"); if (!f) - fatal("Cannot write to %s\n", name); + fatal("Cannot write to %s\n", x); fprintf(f, "%d\n", n); fclose(f); @@ -143,167 +189,616 @@ int store_size(char *buffer, unsigned lo return n; } -void alias(const char *name) +void decode_numa_list(int *numa, char *t) { - int count; - char *p; - - if (!show_alias) - return; + int node; + int nr; - count = readlink(name, buffer, sizeof(buffer)); + memset(numa, 0, MAX_NODES * sizeof(int)); - if (count < 0) - return; + while (*t == 'N') { + t++; + node = strtoul(t, &t, 10); + if (*t == '=') { + t++; + nr = strtoul(t, &t, 10); + numa[node] = nr; + if (node > highest_node) + highest_node = node; + } + while (*t == ' ') + t++; + } +} - buffer[count] = 0; +char *hackname(struct slabinfo *s) +{ + char *n = s->name; - p = buffer + count; + if (n[0] == ':') { + char *nn = malloc(20); + char *p; + + strncpy(nn, n, 20); + n = nn; + p = n + 4; + while (*p && *p !=':') + p++; + *p = 0; + } + return n; +} - while (p > buffer && p[-1] != '/') - p--; - printf("%-20s -> %s\n", name, p); +void slab_validate(struct slabinfo *s) +{ + set_obj(s, "validate", 1); } -void slab_validate(char *name) +void slab_shrink(struct slabinfo *s) { - set_obj("validate", 1); + set_obj(s, 
"shrink", 1); } int line = 0; void first_line(void) { - printf("Name Objects Objsize Space " - "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); + printf("Name Objects Objsize Space " + "Slabs/Part/Cpu O/S O %%Fr %%Ef Flg\n"); +} + +/* + * Find the shortest alias of a slab + */ +struct aliasinfo *find_one_alias(struct slabinfo *find) +{ + struct aliasinfo *a; + struct aliasinfo *best = NULL; + + for(a = aliasinfo;a < aliasinfo + aliases; a++) { + if (a->slab == find && + (!best || strlen(best->name) < strlen(a->name))) { + best = a; + if (strncmp(a->name,"kmall", 5) == 0) + return best; + } + } + if (best) + return best; + fatal("Cannot find alias for %s\n", find->name); + return NULL; } -void slab(const char *name) +unsigned long slab_size(struct slabinfo *s) +{ + return s->slabs * (page_size << s->order); +} + + +void slabcache(struct slabinfo *s) { - unsigned long aliases, align, cache_dma, cpu_slabs, destroy_by_rcu; - unsigned long hwcache_align, object_size, objects, objs_per_slab; - unsigned long order, partial, poison, reclaim_account, red_zone; - unsigned long sanity_checks, slab_size, slabs, store_user, trace; char size_str[20]; char dist_str[40]; char flags[20]; char *p = flags; + char *n; - if (!show_slab) + if (skip_zero && !s->slabs) return; - aliases = get_obj("aliases"); - align = get_obj("align"); - cache_dma = get_obj("cache_dma"); - cpu_slabs = get_obj("cpu_slabs"); - destroy_by_rcu = get_obj("destroy_by_rcu"); - hwcache_align = get_obj("hwcache_align"); - object_size = get_obj("object_size"); - objects = get_obj("objects"); - objs_per_slab = get_obj("objs_per_slab"); - order = get_obj("order"); - partial = get_obj("partial"); - poison = get_obj("poison"); - reclaim_account = get_obj("reclaim_account"); - red_zone = get_obj("red_zone"); - sanity_checks = get_obj("sanity_checks"); - slab_size = get_obj("slab_size"); - slabs = get_obj("slabs"); - store_user = get_obj("store_user"); - trace = get_obj("trace"); - - if (skip_zero && !slabs) - return; - - store_size(size_str, slabs * page_size); - sprintf(dist_str,"%lu/%lu/%lu", slabs, partial, cpu_slabs); + store_size(size_str, slab_size(s)); + sprintf(dist_str,"%lu/%lu/%d", s->slabs, s->partial, s->cpu_slabs); if (!line++) first_line(); - if (aliases) + if (s->aliases) *p++ = '*'; - if (cache_dma) + if (s->cache_dma) *p++ = 'd'; - if (hwcache_align) + if (s->hwcache_align) *p++ = 'A'; - if (poison) + if (s->poison) *p++ = 'P'; - if (reclaim_account) + if (s->reclaim_account) *p++ = 'a'; - if (red_zone) + if (s->red_zone) *p++ = 'Z'; - if (sanity_checks) + if (s->sanity_checks) *p++ = 'F'; - if (store_user) + if (s->store_user) *p++ = 'U'; - if (trace) + if (s->trace) *p++ = 'T'; *p = 0; - printf("%-20s %8ld %7ld %8s %14s %3ld %1ld %3ld %3ld %s\n", - name, objects, object_size, size_str, dist_str, - objs_per_slab, order, - slabs ? (partial * 100) / slabs : 100, - slabs ? (objects * object_size * 100) / - (slabs * (page_size << order)) : 100, + n = hackname(s); + printf("%-21s %8ld %7d %8s %14s %4d %1d %3ld %3ld %s\n", + n, s->objects, s->object_size, size_str, dist_str, + s->objs_per_slab, s->order, + s->slabs ? (s->partial * 100) / s->slabs : 100, + s->slabs ? 
(s->objects * s->object_size * 100) / + (s->slabs * (page_size << s->order)) : 100, flags); } -void slab_numa(const char *name) +void slab_numa(struct slabinfo *s) { - unsigned long slabs; - char *numainfo; + char *n; + int node; - slabs = get_obj_and_str("slabs", &numainfo); + if (!highest_node) + fatal("No NUMA information available.\n"); - if (skip_zero && !slabs) + if (skip_zero && !s->slabs) return; + n = hackname(s); - printf("%-20s %s", name, numainfo); -} + if (!line) { + printf("\nSlab Node "); + for(node = 0; node <= highest_node; node++) + printf(" %4d", node); + printf("\n----------------------"); + for(node = 0; node <= highest_node; node++) + printf("-----"); + printf("\n"); + } + printf("%-21s ", n); + for(node = 0; node <= highest_node; node++) { + char b[20]; -void parameter(const char *name) -{ - if (!show_parameters) - return; + store_size(b, s->numa[node]); + printf(" %4s", b); + } + printf("\n"); + line++; } -void show_tracking(const char *name) +void show_tracking(struct slabinfo *s) { - printf("\n%s: Calls to allocate a slab object\n", name); + printf("\n%s: Calls to allocate a slab object\n", s->name); printf("---------------------------------------------------\n"); if (read_obj("alloc_calls")) printf(buffer); - printf("%s: Calls to free a slab object\n", name); + printf("%s: Calls to free a slab object\n", s->name); printf("-----------------------------------------------\n"); if (read_obj("free_calls")) printf(buffer); } +void totals(void) +{ + struct slabinfo *s; + + int used_slabs = 0; + char b1[20], b2[20], b3[20], b4[20]; + unsigned long long min_objsize = 0, max_objsize = 0, avg_objsize; + unsigned long long min_partial = 0, max_partial = 0, avg_partial, total_partial = 0; + unsigned long long min_slabs = 0, max_slabs = 0, avg_slabs, total_slabs = 0; + unsigned long long min_size = 0, max_size = 0, avg_size, total_size = 0; + unsigned long long min_waste = 0, max_waste = 0, avg_waste, total_waste = 0; + unsigned long long min_objects = 0, max_objects = 0, avg_objects, total_objects = 0; + unsigned long long min_objwaste = 0, max_objwaste = 0, avg_objwaste; + unsigned long long min_used = 0, max_used = 0, avg_used, total_used = 0; + unsigned long min_ppart = 0, max_ppart = 0, avg_ppart, total_ppart = 0; + unsigned long min_partobj = 0, max_partobj = 0, avg_partobj; + unsigned long total_objects_in_partial = 0; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + unsigned long long size; + unsigned long partial; + unsigned long slabs; + unsigned long used; + unsigned long long wasted; + unsigned long long objwaste; + long long objects_in_partial; + unsigned long percentage_partial; + + if (!s->slabs || !s->objects) + continue; + + used_slabs++; + + size = slab_size(s); + partial = s->partial << s->order; + slabs = s->slabs << s->order; + used = s->objects * s->object_size; + wasted = size - used; + objwaste = wasted / s->objects; + + objects_in_partial = s->objects - (s->slabs - s->partial - s ->cpu_slabs) + * s->objs_per_slab; + + if (objects_in_partial < 0) + objects_in_partial = 0; + + percentage_partial = objects_in_partial * 100 / s->objects; + if (percentage_partial > 100) + percentage_partial = 100; + + if (s->object_size < min_objsize || !min_objsize) + min_objsize = s->object_size; + if (partial && (partial < min_partial || !min_partial)) + min_partial = partial; + if (slabs < min_slabs || !min_partial) + min_slabs = slabs; + if (size < min_size) + min_size = size; + if (wasted < min_waste && !min_waste) + min_waste = wasted; + if (objwaste < 
min_objwaste || !min_objwaste) + min_objwaste = objwaste; + if (s->objects < min_objects || !min_objects) + min_objects = s->objects; + if (used < min_used || !min_used) + min_used = used; + if (objects_in_partial < min_partobj || !min_partobj) + min_partobj = objects_in_partial; + if (percentage_partial < min_ppart || !min_ppart) + min_ppart = percentage_partial; + + if (s->object_size > max_objsize) + max_objsize = s->object_size; + if (partial > max_partial) + max_partial = partial; + if (slabs > max_slabs) + max_slabs = slabs; + if (size > max_size) + max_size = size; + if (wasted > max_waste) + max_waste = wasted; + if (objwaste > max_objwaste) + max_objwaste = objwaste; + if (s->objects > max_objects) + max_objects = s->objects; + if (used > max_used) + max_used = used; + if (objects_in_partial > max_partobj) + max_partobj = objects_in_partial; + if (percentage_partial > max_ppart) + max_ppart = percentage_partial; + + total_objects += s->objects; + total_partial += partial; + total_slabs += slabs; + total_used += used; + total_waste += wasted; + total_size += size; + total_ppart += percentage_partial; + total_objects_in_partial += objects_in_partial; + } + + if (!total_objects) { + printf("No objects\n"); + return; + } + if (!used_slabs) { + printf("No slabs\n"); + return; + } + avg_partial = total_partial / used_slabs; + avg_slabs = total_slabs / used_slabs; + avg_waste = total_waste / used_slabs; + avg_size = total_waste / used_slabs; + avg_objects = total_objects / used_slabs; + avg_used = total_used / used_slabs; + avg_ppart = total_ppart / used_slabs; + avg_partobj = total_objects_in_partial / used_slabs; + + avg_objsize = total_used / total_objects; + avg_objwaste = total_waste / total_objects; + + printf("Slabcache Totals\n"); + printf("----------------\n"); + printf("Slabcaches : %3d Aliases : %3d Active: %3d\n", + slabs, aliases, used_slabs); + + store_size(b1, total_used);store_size(b2, total_waste); + store_size(b3, total_waste * 100 / total_used); + printf("Memory used: %6s # Loss : %6s MRatio: %6s%%\n", b1, b2, b3); + + store_size(b1, total_objects);store_size(b2, total_objects_in_partial); + store_size(b3, total_objects_in_partial * 100 / total_objects); + printf("# Objects : %6s # PartObj: %6s ORatio: %6s%%\n", b1, b2, b3); + + printf("\n"); + printf("Per Cache Average Min Max Total\n"); + printf("---------------------------------------------------------\n"); + + store_size(b1, avg_objects);store_size(b2, min_objects); + store_size(b3, max_objects);store_size(b4, total_objects); + printf("# Objects %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, total_slabs); + printf("# Slabs %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_partial);store_size(b2, min_partial); + store_size(b3, max_partial);store_size(b4, total_partial); + printf("# Partial %10s %10s %10s %10s\n", + b1, b2, b3, b4); + store_size(b1, avg_ppart);store_size(b2, min_ppart); + store_size(b3, max_ppart); + printf("%% Partial %10s%% %10s%% %10s%%\n", + b1, b2, b3); + + store_size(b1, avg_size);store_size(b2, min_size); + store_size(b3, max_size);store_size(b4, total_size); + printf("Memory %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_used);store_size(b2, min_used); + store_size(b3, max_used);store_size(b4, total_used); + printf("Used %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + store_size(b1, avg_slabs);store_size(b2, min_slabs); + store_size(b3, max_slabs);store_size(b4, 
total_slabs); + printf("Waste %10s %10s %10s %10s\n", + b1, b2, b3, b4); + + printf("\n"); + printf("Per Object Average Min Max\n"); + printf("---------------------------------------------\n"); + + store_size(b1, avg_objsize);store_size(b2, min_objsize); + store_size(b3, max_objsize); + printf("Size %10s %10s %10s\n", + b1, b2, b3); + + store_size(b1, avg_objwaste);store_size(b2, min_objwaste); + store_size(b3, max_objwaste); + printf("Loss %10s %10s %10s\n", + b1, b2, b3); +} + +void sort_slabs(void) +{ + struct slabinfo *s1,*s2; + + for (s1 = slabinfo; s1 < slabinfo + slabs; s1++) { + for (s2 = s1 + 1; s2 < slabinfo + slabs; s2++) { + int result; + + if (sort_size) + result = slab_size(s1) < slab_size(s2); + else + result = strcasecmp(s1->name, s2->name); + + if (show_inverted) + result = -result; + + if (result > 0) { + struct slabinfo t; + + memcpy(&t, s1, sizeof(struct slabinfo)); + memcpy(s1, s2, sizeof(struct slabinfo)); + memcpy(s2, &t, sizeof(struct slabinfo)); + } + } + } +} + +void sort_aliases(void) +{ + struct aliasinfo *a1,*a2; + + for (a1 = aliasinfo; a1 < aliasinfo + aliases; a1++) { + for (a2 = a1 + 1; a2 < aliasinfo + aliases; a2++) { + char *n1, *n2; + + n1 = a1->name; + n2 = a2->name; + if (show_alias && !show_inverted) { + n1 = a1->ref; + n2 = a2->ref; + } + if (strcasecmp(n1, n2) > 0) { + struct aliasinfo t; + + memcpy(&t, a1, sizeof(struct aliasinfo)); + memcpy(a1, a2, sizeof(struct aliasinfo)); + memcpy(a2, &t, sizeof(struct aliasinfo)); + } + } + } +} + +void link_slabs(void) +{ + struct aliasinfo *a; + struct slabinfo *s; + + for (a = aliasinfo; a < aliasinfo + aliases; a++) { + + for(s = slabinfo; s < slabinfo + slabs; s++) + if (strcmp(a->ref, s->name) == 0) { + a->slab = s; + s->refs++; + break; + } + if (s == slabinfo + slabs) + fatal("Unresolved alias %s\n", a->ref); + } +} + +void alias(void) +{ + struct aliasinfo *a; + char *active = NULL; + + sort_aliases(); + link_slabs(); + + for(a = aliasinfo; a < aliasinfo + aliases; a++) { + + if (!show_single_ref && a->slab->refs == 1) + continue; + + if (!show_inverted) { + if (active) { + if (strcmp(a->slab->name, active) == 0) { + printf(" %s", a->name); + continue; + } + } + printf("\n%-20s <- %s", a->slab->name, a->name); + active = a->slab->name; + } + else + printf("%-20s -> %s\n", a->name, a->slab->name); + } + if (active) + printf("\n"); +} + + +void rename_slabs(void) +{ + struct slabinfo *s; + struct aliasinfo *a; + + for (s = slabinfo; s < slabinfo + slabs; s++) { + if (*s->name != ':') + continue; + + if (s->refs > 1 && !show_first_alias) + continue; + + a = find_one_alias(s); + + s->name = a->name; + } +} + int slab_mismatch(char *slab) { return regexec(&pattern, slab, 0, NULL, 0); } +void read_slab_dir(void) +{ + DIR *dir; + struct dirent *de; + struct slabinfo *slab = slabinfo; + struct aliasinfo *alias = aliasinfo; + char *p; + char *t; + int count; + + dir = opendir("."); + while ((de = readdir(dir))) { + if (de->d_name[0] == '.' 
|| + slab_mismatch(de->d_name)) + continue; + switch (de->d_type) { + case DT_LNK: + alias->name = strdup(de->d_name); + count = readlink(de->d_name, buffer, sizeof(buffer)); + + if (count < 0) + fatal("Cannot read symlink %s\n", de->d_name); + + buffer[count] = 0; + p = buffer + count; + while (p > buffer && p[-1] != '/') + p--; + alias->ref = strdup(p); + alias++; + break; + case DT_DIR: + if (chdir(de->d_name)) + fatal("Unable to access slab %s\n", slab->name); + slab->name = strdup(de->d_name); + slab->alias = 0; + slab->refs = 0; + slab->aliases = get_obj("aliases"); + slab->align = get_obj("align"); + slab->cache_dma = get_obj("cache_dma"); + slab->cpu_slabs = get_obj("cpu_slabs"); + slab->destroy_by_rcu = get_obj("destroy_by_rcu"); + slab->hwcache_align = get_obj("hwcache_align"); + slab->object_size = get_obj("object_size"); + slab->objects = get_obj("objects"); + slab->objs_per_slab = get_obj("objs_per_slab"); + slab->order = get_obj("order"); + slab->partial = get_obj("partial"); + slab->partial = get_obj_and_str("partial", &t); + decode_numa_list(slab->numa_partial, t); + slab->poison = get_obj("poison"); + slab->reclaim_account = get_obj("reclaim_account"); + slab->red_zone = get_obj("red_zone"); + slab->sanity_checks = get_obj("sanity_checks"); + slab->slab_size = get_obj("slab_size"); + slab->slabs = get_obj_and_str("slabs", &t); + decode_numa_list(slab->numa, t); + slab->store_user = get_obj("store_user"); + slab->trace = get_obj("trace"); + chdir(".."); + slab++; + break; + default : + fatal("Unknown file type %lx\n", de->d_type); + } + } + closedir(dir); + slabs = slab - slabinfo; + aliases = alias - aliasinfo; + if (slabs > MAX_SLABS) + fatal("Too many slabs\n"); + if (aliases > MAX_ALIASES) + fatal("Too many aliases\n"); +} + +void output_slabs(void) +{ + struct slabinfo *slab; + + for (slab = slabinfo; slab < slabinfo + slabs; slab++) { + + if (slab->alias) + continue; + + + if (show_numa) + slab_numa(slab); + else + if (show_track) + show_tracking(slab); + else + if (validate) + slab_validate(slab); + else + if (shrink) + slab_shrink(slab); + else { + if (show_slab) + slabcache(slab); + } + } +} + struct option opts[] = { { "aliases", 0, NULL, 'a' }, { "slabs", 0, NULL, 's' }, { "numa", 0, NULL, 'n' }, - { "parameters", 0, NULL, 'p' }, { "zero", 0, NULL, 'z' }, { "help", 0, NULL, 'h' }, { "validate", 0, NULL, 'v' }, + { "first-alias", 0, NULL, 'f' }, + { "reduce", 0, NULL, 'r' }, { "track", 0, NULL, 't'}, + { "inverted", 0, NULL, 'i'}, + { "1ref", 0, NULL, '1'}, { NULL, 0, NULL, 0 } }; int main(int argc, char *argv[]) { - DIR *dir; - struct dirent *de; int c; int err; char *pattern_source; @@ -312,22 +807,31 @@ int main(int argc, char *argv[]) if (chdir("/sys/slab")) fatal("This kernel does not have SLUB support.\n"); - while ((c = getopt_long(argc, argv, "ahtvnpsz", opts, NULL)) != -1) + while ((c = getopt_long(argc, argv, "afhi1nprstvzTS", opts, NULL)) != -1) switch(c) { - case 's': - show_slab = 1; + case '1': + show_single_ref = 1; break; case 'a': show_alias = 1; break; + case 'f': + show_first_alias = 1; + break; + case 'h': + usage(); + return 0; + case 'i': + show_inverted = 1; + break; case 'n': show_numa = 1; break; - case 'p': - show_parameters = 1; + case 'r': + shrink = 1; break; - case 'z': - skip_zero = 0; + case 's': + show_slab = 1; break; case 't': show_track = 1; @@ -335,17 +839,23 @@ int main(int argc, char *argv[]) case 'v': validate = 1; break; - case 'h': - usage(); - return 0; + case 'z': + skip_zero = 0; + break; + case 'T': + show_totals = 1; 
+ break; + case 'S': + sort_size = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); } - if (!show_slab && !show_alias && !show_parameters && !show_track - && !validate) + if (!show_slab && !show_alias && !show_track + && !validate && !shrink) show_slab = 1; if (argc > optind) @@ -357,39 +867,17 @@ int main(int argc, char *argv[]) if (err) fatal("%s: Invalid pattern '%s' code %d\n", argv[0], pattern_source, err); - - dir = opendir("."); - while ((de = readdir(dir))) { - if (de->d_name[0] == '.' || - slab_mismatch(de->d_name)) - continue; - switch (de->d_type) { - case DT_LNK: - alias(de->d_name); - break; - case DT_DIR: - if (chdir(de->d_name)) - fatal("Unable to access slab %s\n", de->d_name); - - if (show_numa) - slab_numa(de->d_name); - else - if (show_track) - show_tracking(de->d_name); - else - if (validate) - slab_validate(de->d_name); - else - slab(de->d_name); - chdir(".."); - break; - case DT_REG: - parameter(de->d_name); - break; - default : - fatal("Unknown file type %lx\n", de->d_type); - } + read_slab_dir(); + if (show_alias) + alias(); + else + if (show_totals) + totals(); + else { + link_slabs(); + rename_slabs(); + sort_slabs(); + output_slabs(); } - closedir(dir); return 0; } -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042156.910075002@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:34 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 08/10] SLUB: Reduce the order of allocations to avoid fragmentation Content-Disposition: inline; filename=slub_i386_no_frag Seems that fragmentation is an important subject. So be safe. If an arch supports 4k page size then assume that defragmentation may be a major problem. Reduce the minimum number of objects in a slab and limit the order of slabs. Be a little bit more lenient for larger page sizes. Change the bootup message of SLUB to show the parameters so that difficulties due to fragmentation are detectable when the boot log is reviewed. Cc: Mel Gorman Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 21:01:28.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 21:02:01.000000000 -0700 @@ -109,6 +109,25 @@ /* Enable to test recovery from slab corruption on boot */ #undef SLUB_RESILIENCY_TEST +#if PAGE_SHIFT <= 12 + +/* + * Small page size. Make sure that we do not fragment memory + */ +#define DEFAULT_MAX_ORDER 1 +#define DEFAULT_MIN_OBJECTS 4 + +#else + +/* + * Large page machines are customarily able to handle larger + * page orders. + */ +#define DEFAULT_MAX_ORDER 2 +#define DEFAULT_MIN_OBJECTS 8 + +#endif + /* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. @@ -1437,13 +1456,13 @@ static struct page *get_object_page(cons * take the list_lock. */ static int slub_min_order; -static int slub_max_order = 4; +static int slub_max_order = DEFAULT_MAX_ORDER; /* * Minimum number of objects per slab. This is necessary in order to * reduce locking overhead. Similar to the queue size in SLAB. */ -static int slub_min_objects = 8; +static int slub_min_objects = DEFAULT_MIN_OBJECTS; /* * Merge control. If this is set then no merging of slab caches will occur. 
@@ -2338,9 +2357,10 @@ void __init kmem_cache_init(void) kmem_size = offsetof(struct kmem_cache, cpu_slab) + nr_cpu_ids * sizeof(struct page *); - printk(KERN_INFO "SLUB: General Slabs=%d, HW alignment=%d, " - "Processors=%d, Nodes=%d\n", + printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " Processors=%d, Nodes=%d\n", KMALLOC_SHIFT_HIGH, L1_CACHE_BYTES, + slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); } -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042157.173100785@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:35 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, Mel Gorman Subject: [patch 09/10] SLUB: Exploit page mobility to increase allocation order Content-Disposition: inline; filename=slub_i386_mobility If there is page mobility then we can defragment memory. So its possible to use higher order of pages for slab allocations. If the defaults were not overridden set the max order to 4 and guarantee 16 objects per slab. This will put some stress on Mel's antifrag approaches. If these defaults are too large then they should be later reduced. Cc: Mel Gorman Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/include/linux/mmzone.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/linux/mmzone.h 2007-04-26 20:57:58.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/linux/mmzone.h 2007-04-26 21:05:48.000000000 -0700 @@ -25,6 +25,8 @@ #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) +extern int page_group_by_mobility_disabled; + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should Index: linux-2.6.21-rc7-mm2/mm/slub.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/mm/slub.c 2007-04-26 21:02:01.000000000 -0700 +++ linux-2.6.21-rc7-mm2/mm/slub.c 2007-04-26 21:05:48.000000000 -0700 @@ -129,6 +129,13 @@ #endif /* + * If antifragmentation methods are in effect then increase the + * slab sizes to increase performance + */ +#define DEFAULT_ANTIFRAG_MAX_ORDER 4 +#define DEFAULT_ANTIFRAG_MIN_OBJECTS 16 + +/* * Mininum number of partial slabs. These will be left on the partial * lists even if they are empty. kmem_cache_shrink may reclaim them. */ @@ -1450,6 +1457,11 @@ static struct page *get_object_page(cons */ /* + * Set if the user has overridden any of the order related defaults. + */ +static int user_overide; + +/* * Mininum / Maximum order of slab pages. This influences locking overhead * and slab fragmentation. 
A higher order reduces the number of partial slabs * and increases the number of allocations possible without having to @@ -1985,7 +1997,7 @@ static struct kmem_cache *kmalloc_caches static int __init setup_slub_min_order(char *str) { get_option (&str, &slub_min_order); - + user_override = 1; return 1; } @@ -1994,7 +2006,7 @@ __setup("slub_min_order=", setup_slub_mi static int __init setup_slub_max_order(char *str) { get_option (&str, &slub_max_order); - + user_override = 1; return 1; } @@ -2003,7 +2015,7 @@ __setup("slub_max_order=", setup_slub_ma static int __init setup_slub_min_objects(char *str) { get_option (&str, &slub_min_objects); - + user_override = 1; return 1; } @@ -2319,6 +2331,15 @@ void __init kmem_cache_init(void) { int i; + if (!page_group_by_mobility_disabled && !user_override) { + /* + * Antifrag support available. Increase usable + * page order and generate slabs with more objects. + */ + slub_max_order = ANTIFRAG_ORDER; + slub_min_objects = ANTIFRAG_MIN_OBJECTS; + } + #ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the -- From clameter@sgi.com Thu Apr 26 21:21:57 2007 Message-Id: <20070427042157.419317176@sgi.com> References: <20070427042126.299876478@sgi.com> User-Agent: quilt/0.45-1 Date: Thu, 26 Apr 2007 21:21:36 -0700 From: clameter@sgi.com To: akpm@linux-foundation.org Cc: linux-mm@kvack.org, William Lee Irwin III Subject: [patch 10/10] SLUB: i386 support Content-Disposition: inline; filename=slub_i386_pgd_slab SLUB cannot run on i386 at this point because i386 uses the page->private and page->index field of slab pages for the pgd cache. Make SLUB run on i386 by replacing the pgd slab cache with a quicklist. Limit the changes as much as possible Leave the improvised linked list in place etc etc. This has been working here for a couple of weeks now. 
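For readers unfamiliar with quicklists, the idea can be mocked up in user
space roughly as follows. This is only a conceptual sketch with invented
names (ql_alloc/ql_free/ql_trim); the calls actually used in the patch are
quicklist_alloc(), quicklist_free() and quicklist_trim(), and their
semantics differ in detail (per-CPU lists, page-sized objects).

#include <stdlib.h>

#define PGD_SIZE 4096

struct ql_node { struct ql_node *next; };

static struct ql_node *ql_head;
static int ql_count;

/* Pop a constructed object off the list, or allocate and construct a new one */
static void *ql_alloc(void (*ctor)(void *))
{
	struct ql_node *n = ql_head;

	if (n) {
		ql_head = n->next;
		ql_count--;
		return n;	/* still constructed from its last use */
	}
	n = malloc(PGD_SIZE);
	if (n && ctor)
		ctor(n);
	return n;
}

/* Return an object to the list without destructing it */
static void ql_free(void *p)
{
	struct ql_node *n = p;

	n->next = ql_head;
	ql_head = n;
	ql_count++;
}

/* Shrink the list when it grows too large, destructing what we drop */
static void ql_trim(int max, void (*dtor)(void *))
{
	while (ql_count > max) {
		struct ql_node *n = ql_head;

		ql_head = n->next;
		ql_count--;
		if (dtor)
			dtor(n);
		free(n);
	}
}

static void pgd_ctor_stub(void *p) { (void)p; /* would prefill kernel mappings */ }

int main(void)
{
	void *pgd = ql_alloc(pgd_ctor_stub);

	ql_free(pgd);		/* stays constructed on the free list */
	ql_trim(25, NULL);	/* analogous to check_pgt_cache() in the patch */
	return 0;
}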
Cc: William Lee Irwin III Signed-off-by: Christoph Lameter Index: linux-2.6.21-rc7-mm2/arch/i386/Kconfig =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/Kconfig 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/Kconfig 2007-04-26 15:23:08.000000000 -0700 @@ -55,6 +55,10 @@ config ZONE_DMA bool default y +config QUICKLIST + bool + default y + config SBUS bool @@ -79,10 +83,6 @@ config ARCH_MAY_HAVE_PC_FDC bool default y -config ARCH_USES_SLAB_PAGE_STRUCT - bool - default y - config DMI bool default y Index: linux-2.6.21-rc7-mm2/arch/i386/kernel/process.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/kernel/process.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/kernel/process.c 2007-04-26 15:23:08.000000000 -0700 @@ -186,6 +186,7 @@ void cpu_idle(void) if (__get_cpu_var(cpu_idle_state)) __get_cpu_var(cpu_idle_state) = 0; + check_pgt_cache(); rmb(); idle = pm_idle; Index: linux-2.6.21-rc7-mm2/arch/i386/kernel/smp.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/kernel/smp.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/kernel/smp.c 2007-04-26 15:23:08.000000000 -0700 @@ -429,7 +429,7 @@ void flush_tlb_mm (struct mm_struct * mm } if (!cpus_empty(cpu_mask)) flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); - + check_pgt_cache(); preempt_enable(); } Index: linux-2.6.21-rc7-mm2/arch/i386/mm/init.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/mm/init.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/mm/init.c 2007-04-26 15:23:08.000000000 -0700 @@ -752,7 +752,6 @@ int remove_memory(u64 start, u64 size) EXPORT_SYMBOL_GPL(remove_memory); #endif -struct kmem_cache *pgd_cache; struct kmem_cache *pmd_cache; void __init pgtable_cache_init(void) @@ -776,12 +775,6 @@ void __init pgtable_cache_init(void) pgd_size = PAGE_SIZE; } } - pgd_cache = kmem_cache_create("pgd", - pgd_size, - pgd_size, - SLAB_PANIC, - pgd_ctor, - (!SHARED_KERNEL_PMD) ? pgd_dtor : NULL); } /* Index: linux-2.6.21-rc7-mm2/arch/i386/mm/pgtable.c =================================================================== --- linux-2.6.21-rc7-mm2.orig/arch/i386/mm/pgtable.c 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/arch/i386/mm/pgtable.c 2007-04-26 15:37:29.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -205,8 +206,6 @@ void pmd_ctor(void *pmd, struct kmem_cac * against pageattr.c; it is the unique case in which a valid change * of kernel pagetables can't be lazily synchronized by vmalloc faults. * vmalloc faults work because attached pagetables are never freed. - * The locking scheme was chosen on the basis of manfred's - * recommendations and having no core impact whatsoever. 
* -- wli */ DEFINE_SPINLOCK(pgd_lock); @@ -232,9 +231,11 @@ static inline void pgd_list_del(pgd_t *p set_page_private(next, (unsigned long)pprev); } + + #if (PTRS_PER_PMD == 1) /* Non-PAE pgd constructor */ -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_ctor(void *pgd) { unsigned long flags; @@ -256,7 +257,7 @@ void pgd_ctor(void *pgd, struct kmem_cac } #else /* PTRS_PER_PMD > 1 */ /* PAE pgd constructor */ -void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_ctor(void *pgd) { /* PAE, kernel PMD may be shared */ @@ -275,11 +276,12 @@ void pgd_ctor(void *pgd, struct kmem_cac } #endif /* PTRS_PER_PMD */ -void pgd_dtor(void *pgd, struct kmem_cache *cache, unsigned long unused) +void pgd_dtor(void *pgd) { unsigned long flags; /* can be called from interrupt context */ - BUG_ON(SHARED_KERNEL_PMD); + if (SHARED_KERNEL_PMD) + return; paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); spin_lock_irqsave(&pgd_lock, flags); @@ -321,7 +323,7 @@ static void pmd_cache_free(pmd_t *pmd, i pgd_t *pgd_alloc(struct mm_struct *mm) { int i; - pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); + pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); if (PTRS_PER_PMD == 1 || !pgd) return pgd; @@ -344,7 +346,7 @@ out_oom: paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); pmd_cache_free(pmd, i); } - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); return NULL; } @@ -361,5 +363,11 @@ void pgd_free(pgd_t *pgd) pmd_cache_free(pmd, i); } /* in the non-PAE case, free_pgtables() clears user pgd entries */ - kmem_cache_free(pgd_cache, pgd); + quicklist_free(0, pgd_dtor, pgd); } + +void check_pgt_cache(void) +{ + quicklist_trim(0, pgd_dtor, 25, 16); +} + Index: linux-2.6.21-rc7-mm2/include/asm-i386/pgalloc.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/asm-i386/pgalloc.h 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/asm-i386/pgalloc.h 2007-04-26 15:23:08.000000000 -0700 @@ -65,6 +65,4 @@ do { \ #define pud_populate(mm, pmd, pte) BUG() #endif -#define check_pgt_cache() do { } while (0) - #endif /* _I386_PGALLOC_H */ Index: linux-2.6.21-rc7-mm2/include/asm-i386/pgtable.h =================================================================== --- linux-2.6.21-rc7-mm2.orig/include/asm-i386/pgtable.h 2007-04-26 15:18:04.000000000 -0700 +++ linux-2.6.21-rc7-mm2/include/asm-i386/pgtable.h 2007-04-26 15:23:08.000000000 -0700 @@ -35,17 +35,16 @@ struct vm_area_struct; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; -extern struct kmem_cache *pgd_cache; extern struct kmem_cache *pmd_cache; extern spinlock_t pgd_lock; extern struct page *pgd_list; +void check_pgt_cache(void); void pmd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_ctor(void *, struct kmem_cache *, unsigned long); -void pgd_dtor(void *, struct kmem_cache *, unsigned long); void pgtable_cache_init(void); void paging_init(void); + /* * The Linux x86 paging architecture is 'compile-time dual-mode', it * implements both the traditional 2-level x86 page tables and the --