SLUB: Optimize cacheline use for zeroing

We touch a cacheline in the kmem_cache structure for zeroing to get the
size. However, the hot paths in slab_alloc and slab_free do not reference
any other fields in kmem_cache.

Add a new field to kmem_cache_cpu that contains the object size. That
cacheline is already touched in the hot paths, so we save one cacheline
access on every slab_alloc.

We need to update the kmem_cache_cpu object size if an aliasing operation
changes the objsize of a non-debug slab.

Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    1
 mm/slub.c                |   14 ++-
 slub.c                   |  192 +++++++++++++++++++++++++++--------------------
 3 files changed, 124 insertions(+), 83 deletions(-)

Index: linux-2.6.22-rc6-mm1/include/linux/slub_def.h
===================================================================
--- linux-2.6.22-rc6-mm1.orig/include/linux/slub_def.h	2007-07-11 22:04:59.000000000 -0700
+++ linux-2.6.22-rc6-mm1/include/linux/slub_def.h	2007-07-11 22:05:03.000000000 -0700
@@ -16,6 +16,7 @@ struct kmem_cache_cpu {
 	struct page *page;
 	int node;
 	unsigned int offset;
+	unsigned int objsize;
 };
 
 struct kmem_cache_node {
Index: linux-2.6.22-rc6-mm1/mm/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/mm/slub.c	2007-07-11 22:04:59.000000000 -0700
+++ linux-2.6.22-rc6-mm1/mm/slub.c	2007-07-11 22:05:03.000000000 -0700
@@ -1571,7 +1571,7 @@ static void __always_inline *slab_alloc(
 	local_irq_restore(flags);
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 }
@@ -1864,8 +1864,9 @@ static void init_kmem_cache_cpu(struct k
 {
 	c->page = NULL;
 	c->freelist = NULL;
-	c->offset = s->offset / sizeof(void *);
 	c->node = 0;
+	c->offset = s->offset / sizeof(void *);
+	c->objsize = s->objsize;
 }
 
 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -3173,12 +3174,21 @@ struct kmem_cache *kmem_cache_create(con
 	down_write(&slub_lock);
 	s = find_mergeable(size, align, flags, ctor, ops);
 	if (s) {
+		int cpu;
+
 		s->refcount++;
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
 		 */
 		s->objsize = max(s->objsize, (int)size);
+
+		/*
+		 * And then we need to update the object size in the
+		 * per cpu structures
+		 */
+		for_each_online_cpu(cpu)
+			get_cpu_slab(s, cpu)->objsize = s->objsize;
 		s->inuse = max_t(int, s->inuse,
 				ALIGN(size, sizeof(void *)));
 		up_write(&slub_lock);
Index: linux-2.6.22-rc6-mm1/slub.c
===================================================================
--- linux-2.6.22-rc6-mm1.orig/slub.c	2007-07-11 21:55:22.000000000 -0700
+++ linux-2.6.22-rc6-mm1/slub.c	2007-07-11 22:05:03.000000000 -0700
@@ -138,26 +138,6 @@ static inline void ClearSlabDebug(struct
 }
 
 /*
- * Special value to put onto the freelist to signal
- * that it is not to be used.
- */
-#define CPU_FREELIST_OFF (void **)(16)
-
-static inline int freelist_off(void **x)
-{
-	return x == CPU_FREELIST_OFF;
-}
-
-static inline int freelist_off_or_empty(void *x)
-{
-	return (unsigned long)x <= (unsigned long)CPU_FREELIST_OFF;
-}
-
-static inline void **freelist_get_and_clear(void ***x)
-{
-	return xchg(x, CPU_FREELIST_OFF);
-}
-/*
  * Issues still to be resolved:
  *
  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
@@ -252,6 +232,9 @@ static enum {
 static DECLARE_RWSEM(slub_lock);
 LIST_HEAD(slab_caches);
 
+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects = 0;
+
 /*
  * Tracking user of a slab.
  */
@@ -1398,7 +1381,7 @@ static void deactivate_slab(struct kmem_
 	 * because both freelists are empty. So this is unlikely
 	 * to occur.
 	 */
-	while (unlikely(!freelist_off_or_empty(freelist))) {
+	while (unlikely(freelist)) {
 		void **object;
 
 		/* Retrieve object from cpu_freelist */
@@ -1415,7 +1398,7 @@ static void deactivate_slab(struct kmem_
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
-	void **freelist = freelist_get_and_clear(&c->freelist);
+	void **freelist = xchg(&c->freelist, NULL);
 
 	slab_lock(c->page);
 	deactivate_slab(s, c, freelist);
@@ -1489,24 +1472,26 @@ static void *__slab_alloc(struct kmem_ca
 	void **object;
 	struct page *new;
 	struct kmem_cache_cpu *c;
-	void **cpu_freelist = NULL;
+	void **freelist = NULL;
 	unsigned long flags;
 
 	local_irq_save(flags);
-
 	c = get_cpu_slab(s, smp_processor_id());
-
 	if (!c->page)
 		/* Slab was flushed */
 		goto new_slab;
 
-	cpu_freelist = freelist_get_and_clear(&c->freelist);
+	freelist = xchg(&c->freelist, NULL);
 	slab_lock(c->page);
-	if (unlikely(!freelist_off_or_empty(cpu_freelist)))
-		goto mess;
 
 	if (unlikely(!node_match(c, node)))
 		goto another_slab;
+
+	if (unlikely(freelist)) {
+		object = freelist;
+		goto out_object;
+	}
+
 load_freelist:
 	object = c->page->freelist;
 	if (unlikely(!object))
@@ -1515,19 +1500,20 @@ load_freelist:
 	object = c->page->freelist;
-	c->freelist = object[c->offset];
 	c->page->inuse = s->objects;
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
+out_object:
+	c->freelist = object[c->offset];
 	slab_unlock(c->page);
 out:
 	local_irq_restore(flags);
 
 	if (unlikely((gfpflags & __GFP_ZERO)))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 
 another_slab:
-	deactivate_slab(s, c, cpu_freelist);
+	deactivate_slab(s, c, freelist);
 
 new_slab:
 	new = get_partial(s, gfpflags, node);
@@ -1576,21 +1562,6 @@ debug:
 	c->node = page_to_nid(c->page);
 	slab_unlock(c->page);
 	goto out;
-
-mess:
-	/*
-	 * We have switched off the cpu freelist but found that
-	 * a racing process has freed additional objects in the
-	 * meantime.
-	 *
-	 * So leave the cpu freelist off. Return the newly freed
-	 * object and deactivate the slab. This may have just been
-	 * a single object. In that case we return the freed object
-	 * and can release a full slab.
-	 */
-	object = cpu_freelist;
-	deactivate_slab(s, c, object[c->offset]);
-	goto out;
 }
 
 /*
@@ -1613,15 +1584,18 @@ redo:
 
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	object = c->freelist;
-	if (unlikely(freelist_off_or_empty(object) || !node_match(c, node)))
+	if (unlikely(!object))
 		goto slow;
 
-	if (cmpxchg(&c->freelist, object,
-			object[c->offset]) != object)
+	if (unlikely(!node_match(c, node)))
+		goto slow;
+
+	if (unlikely(cmpxchg(&c->freelist, object,
+			object[c->offset]) != object))
 		goto redo;
 
 	if (unlikely((gfpflags & __GFP_ZERO)))
-		memset(object, 0, s->objsize);
+		memset(object, 0, c->objsize);
 
 	return object;
 
@@ -1652,7 +1626,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
  * handling required then we can return immediately.
  */
 static void __slab_free(struct kmem_cache *s, struct page *page,
-				void *x, void *addr, int offset)
+				void *x, void *addr, unsigned int offset)
 {
 	void *prior;
 	void **object = (void *)x;
@@ -1725,15 +1699,24 @@ static void __always_inline slab_free(st
 
 redo:
 	c = get_cpu_slab(s, raw_smp_processor_id());
-	freelist = c->freelist;
-	if (unlikely(page != c->page))
-		goto slow;
-	if (freelist_off(freelist))
+	freelist = c->freelist;
+	/*
+	 * Must read c->freelist before c->page. If the page is
+	 * later changed then the freelist also changes which
+	 * will make the cmpxchg() fail.
+	 *
+	 * deactivate_slab() sets c->page to NULL while taking
+	 * the slab lock which provides the corresponding
+	 * smp_wmb() barriers.
+	 */
+	smp_rmb();
+	if (unlikely(c->page != page))
 		goto slow;
-
 	object[c->offset] = freelist;
+	if (unlikely(!freelist))
+		goto slow;
 
 	if (unlikely(cmpxchg(&c->freelist, freelist, object) != freelist))
 		goto redo;
@@ -1930,9 +1913,10 @@ static void init_kmem_cache_cpu(struct k
 			struct kmem_cache_cpu *c)
 {
 	c->page = NULL;
-	c->freelist = CPU_FREELIST_OFF;
-	c->offset = s->offset / sizeof(void *);
+	c->freelist = NULL;
 	c->node = 0;
+	c->offset = s->offset / sizeof(void *);
+	c->objsize = s->objsize;
 }
 
 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -1941,7 +1925,9 @@ static void init_kmem_cache_node(struct
 	atomic_long_set(&n->nr_slabs, 0);
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
+#ifdef CONFIG_SLUB_DEBUG
 	INIT_LIST_HEAD(&n->full);
+#endif
 }
 
 #ifdef CONFIG_SMP
@@ -1996,7 +1982,7 @@ static void free_kmem_cache_cpu(struct k
 		return;
 	}
 	c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu);
-	per_cpu(kmem_cache_cpu_free, cpu) = c;
+	per_cpu(kmem_cache_cpu_free, cpu) = c;
 }
 
 static void free_kmem_cache_cpus(struct kmem_cache *s)
@@ -2050,7 +2036,7 @@ static void __init init_alloc_cpu(void)
 
 	for_each_online_cpu(cpu)
 		init_alloc_cpu_cpu(cpu);
-}
+ }
 
 #else
 static inline void free_kmem_cache_cpus(struct kmem_cache *s) {}
@@ -2058,13 +2044,12 @@ static inline void init_alloc_cpu(void)
 
 static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
 {
-	init_kmem_cache_cpu(s, &s->cpu_slab);
+	init_kmem_cache_cpu(s, &s->cpu_slab);
 	return 1;
 }
 #endif
 
 #ifdef CONFIG_NUMA
-
 /*
  * No kmalloc_node yet so do it by hand. We know that this is the first
  * slab on the node for this slabcache. There are no concurrent accesses
@@ -2092,8 +2077,10 @@ static struct kmem_cache_node * __init e
 	page->freelist = get_freepointer(kmalloc_caches, n);
 	page->inuse++;
 	kmalloc_caches->node[node] = n;
+#ifdef CONFIG_SLUB_DEBUG
 	init_object(kmalloc_caches, n, 1);
 	init_tracking(kmalloc_caches, n);
+#endif
 	init_kmem_cache_node(n);
 	atomic_long_inc(&n->nr_slabs);
 	add_partial(n, page);
@@ -2756,7 +2743,9 @@ static unsigned long sort_partial_list(s
 	 * list_lock. page->inuse here is the upper limit.
 	 */
 	list_for_each_entry_safe(page, t, &n->partial, lru) {
-		if (!page->inuse && slab_trylock(page)) {
+		int inuse = page->inuse;
+
+		if (!inuse && slab_trylock(page)) {
 			/*
 			 * Must hold slab lock here because slab_free
 			 * may have freed the last object and be
@@ -2769,7 +2758,7 @@ static unsigned long sort_partial_list(s
 			freed++;
 		} else {
 			list_move(&page->lru,
-				slabs_by_inuse + page->inuse);
+				slabs_by_inuse + inuse);
 		}
 	}
 
@@ -2783,6 +2772,8 @@ static unsigned long sort_partial_list(s
 	return freed;
 }
 
+#define NR_INUSE 40
+
 /*
  * Shrink the slab cache on a particular node of the cache
  */
@@ -2793,6 +2784,9 @@ static unsigned long __kmem_cache_shrink
 	struct page *page, *page2;
 	LIST_HEAD(zaplist);
 	int freed;
+	int inuse;
+	int nr[NR_INUSE] = { 0, };
+	int i;
 
 	spin_lock_irqsave(&n->list_lock, flags);
 	freed = sort_partial_list(s, n, scratch);
@@ -2813,12 +2807,13 @@ static unsigned long __kmem_cache_shrink
 	 */
 	while (n->nr_partial > MAX_PARTIAL) {
 		page = container_of(n->partial.prev, struct page, lru);
+		inuse = page->inuse;
 
 		/*
 		 * We are holding the list_lock so we can only
 		 * trylock the slab
 		 */
-		if (page->inuse > s->objects / 4)
+		if (inuse > s->objects / 4)
 			break;
 
 		if (!slab_trylock(page))
@@ -2828,6 +2823,8 @@ static unsigned long __kmem_cache_shrink
 		n->nr_partial--;
 		SetSlabFrozen(page);
 		slab_unlock(page);
+		if (inuse < NR_INUSE)
+			nr[inuse]++;
 	}
 
 	spin_unlock_irqrestore(&n->list_lock, flags);
@@ -2841,6 +2838,13 @@ static unsigned long __kmem_cache_shrink
 		if (__kmem_cache_vacate(s, page, flags, scratch) == 0)
 			freed++;
 	}
 
+	printk(KERN_INFO "Slab %s: Defrag freed %d pages. PartSlab config=",
+		s->name, freed << s->order);
+
+	for (i = 0; i < NR_INUSE; i++)
+		if (nr[i])
+			printk(" %d=%d", i, nr[i]);
+	printk("\n");
 	return freed;
 }
@@ -2960,7 +2964,7 @@ static unsigned long __kmem_cache_defrag
 						void *scratch)
 {
 	unsigned long capacity;
-	unsigned long objects;
+	unsigned long objects_in_full_slabs;
 	unsigned long ratio;
 	struct kmem_cache_node *n = get_node(s, node);
 
@@ -2971,17 +2975,26 @@ static unsigned long __kmem_cache_defrag
 	if (n->nr_partial <= MAX_PARTIAL)
 		return 0;
 
-	/*
-	 * Calculate usage ratio
-	 */
 	capacity = atomic_long_read(&n->nr_slabs) * s->objects;
-	objects = capacity - n->nr_partial * s->objects + count_partial(n);
-	ratio = objects * 100 / capacity;
+	objects_in_full_slabs =
+		(atomic_long_read(&n->nr_slabs) - n->nr_partial)
+							* s->objects;
+	/*
+	 * Worst case calculation: If we would be over the ratio
+	 * even if all partial slabs would only have one object
+	 * then we can skip the further test that would require a scan
+	 * through all the partial page structs to sum up the actual
+	 * number of objects in the partial slabs.
+	 */
+	ratio = (objects_in_full_slabs + 1 * n->nr_partial) * 100 / capacity;
+	if (ratio > s->defrag_ratio)
+		return 0;
 
 	/*
-	 * If usage ratio is more than required then no
-	 * defragmentation
+	 * Now for the real calculation. If usage ratio is more than required
+	 * then no defragmentation
 	 */
+	ratio = (objects_in_full_slabs + count_partial(n)) * 100 / capacity;
 	if (ratio > s->defrag_ratio)
 		return 0;
 
@@ -2998,6 +3011,11 @@ int kmem_cache_defrag(int node)
 	unsigned long pages = 0;
 	void *scratch;
 
+	scratch = kmalloc(sizeof(struct list_head) * max_defrag_slab_objects,
+								GFP_KERNEL);
+	if (!scratch)
+		return 0;
+
 	/*
 	 * kmem_cache_defrag may be called from the reclaim path which may be
 	 * called for any page allocator alloc. So there is the danger that we
@@ -3016,16 +3034,17 @@ int kmem_cache_defrag(int node)
 
 		if (!s->ops->kick)
 			break;
-		scratch = kmalloc(sizeof(struct list_head) * s->objects,
-								GFP_KERNEL);
+
 		if (node == -1) {
-			for_each_online_node(node)
-				pages += __kmem_cache_defrag(s, node, scratch);
+			int nid;
+
+			for_each_online_node(nid)
+				pages += __kmem_cache_defrag(s, nid, scratch);
 		} else
 			pages += __kmem_cache_defrag(s, node, scratch);
-		kfree(scratch);
 	}
 	up_read(&slub_lock);
+	kfree(scratch);
 	return pages;
 }
 EXPORT_SYMBOL(kmem_cache_defrag);
@@ -3205,12 +3224,21 @@ struct kmem_cache *kmem_cache_create(con
 	down_write(&slub_lock);
 	s = find_mergeable(size, align, flags, ctor, ops);
 	if (s) {
+		int cpu;
+
 		s->refcount++;
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
 		 */
 		s->objsize = max(s->objsize, (int)size);
+
+		/*
+		 * And then we need to update the object size in the
+		 * per cpu structures
+		 */
+		for_each_online_cpu(cpu)
+			get_cpu_slab(s, cpu)->objsize = s->objsize;
 		s->inuse = max_t(int, s->inuse,
 				ALIGN(size, sizeof(void *)));
 		up_write(&slub_lock);
@@ -3229,9 +3257,11 @@ struct kmem_cache *kmem_cache_create(con
 		 * Reclaimable slabs first because we may have
 		 * to scan them repeatedly.
 		 */
-		if (ops->kick)
+		if (ops->kick) {
 			list_add(&s->list, &slab_caches);
-		else
+			if (s->objects > max_defrag_slab_objects)
+				max_defrag_slab_objects = s->objects;
+		} else
 			list_add_tail(&s->list, &slab_caches);
 
 		up_write(&slub_lock);
@@ -3743,8 +3773,8 @@ static unsigned long slab_objects(struct
 	per_cpu = nodes + nr_node_ids;
 
 	for_each_possible_cpu(cpu) {
-		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 		struct page *page;
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 
 		if (!c)
 			continue;
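For readers who want the point of the changelog without the rest of SLUB around it, here is a minimal, self-contained userspace sketch of the idea. It is not kernel code, and none of the names below (toy_cache, toy_cache_cpu, toy_cache_adjust_size, toy_alloc_zeroed) are SLUB APIs; they only mirror the scheme under the assumption of a single "CPU": the zeroing hot path reads the object size from the per-cpu structure it already touches, and an aliasing size adjustment has to propagate the new size into that per-cpu copy, as the for_each_online_cpu() loop in kmem_cache_create() does above.

/*
 * Illustrative userspace sketch only -- not SLUB code. Build with
 * e.g. "gcc -O2 demo.c". All names here are invented for the demo.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_cache_cpu {
	void **freelist;	/* hot: touched on every alloc/free */
	unsigned int offset;
	unsigned int objsize;	/* per-cpu copy of the object size */
};

struct toy_cache {
	unsigned int objsize;		/* authoritative size, cold for the hot path */
	struct toy_cache_cpu cpu_slab;	/* one instance per cpu in the real thing */
};

/* Mirrors the aliasing case: grow objsize and resync the per-cpu copy. */
static void toy_cache_adjust_size(struct toy_cache *s, unsigned int size)
{
	if (size > s->objsize)
		s->objsize = size;
	s->cpu_slab.objsize = s->objsize;
}

/* Zeroing uses c->objsize only, so the toy_cache cacheline stays untouched. */
static void *toy_alloc_zeroed(struct toy_cache_cpu *c)
{
	void *object = malloc(c->objsize);	/* stand-in for the freelist pop */

	if (object)
		memset(object, 0, c->objsize);
	return object;
}

int main(void)
{
	struct toy_cache s = { .objsize = 32 };
	void *p;

	s.cpu_slab.objsize = s.objsize;
	toy_cache_adjust_size(&s, 48);	/* an alias asked for 48 bytes */

	p = toy_alloc_zeroed(&s.cpu_slab);
	printf("zeroed %u bytes via the per-cpu copy\n", s.cpu_slab.objsize);
	free(p);
	return 0;
}

The trade-off is the one the changelog describes: the per-cpu copy can go stale, so any place that changes kmem_cache.objsize for a mergeable, non-debug slab must also update every kmem_cache_cpu, which is why the patch touches kmem_cache_create().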