From: Christoph Lameter Subject: SLUB: Experimental new fastpath w/o interrupt disable This is a bit of a different tack on things than the last version provided by Mathieu. Instead of using a cmpxchg we keep a state variable in the per cpu structure that is incremented when we enter the hot path. We can then detect that a thread is in the fastpath and fall back to an alternate allocation / free technique that bypasses fastpath caching. V1->V2: - Use safe preempt_enable / disable. - Enable preempt before calling into the page allocator - Check up on hotpath activity changes when returning from page allocator. - Add barriers. Todo: - Verify that this is really safe - Is this a benefit? Cc: Mathieu Desnoyers Cc: Pekka Enberg Signed-off-by: Christoph Lameter --- include/linux/slub_def.h | 1 mm/slub.c | 112 +++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 96 insertions(+), 17 deletions(-) Index: linux-2.6/include/linux/slub_def.h =================================================================== --- linux-2.6.orig/include/linux/slub_def.h 2009-10-08 11:35:59.000000000 -0500 +++ linux-2.6/include/linux/slub_def.h 2009-10-08 11:35:59.000000000 -0500 @@ -38,6 +38,7 @@ struct kmem_cache_cpu { void **freelist; /* Pointer to first free per cpu object */ struct page *page; /* The slab from which we are allocating */ int node; /* The node of the page (or -1 for debug) */ + int active; /* Active fastpaths */ #ifdef CONFIG_SLUB_STATS unsigned stat[NR_SLUB_STAT_ITEMS]; #endif Index: linux-2.6/mm/slub.c =================================================================== --- linux-2.6.orig/mm/slub.c 2009-10-08 11:35:59.000000000 -0500 +++ linux-2.6/mm/slub.c 2009-10-08 14:03:22.000000000 -0500 @@ -1606,7 +1606,14 @@ static void *__slab_alloc(struct kmem_ca unsigned long addr) { void **object; - struct page *page = __this_cpu_read(s->cpu_slab->page); + struct page *page; + unsigned long flags; + int hotpath; + + local_irq_save(flags); + hotpath = 
__this_cpu_read(s->cpu_slab->active) != 0; + __this_cpu_dec(s->cpu_slab->active); /* Drop count from hotpath */ + page = __this_cpu_read(s->cpu_slab->page); /* We handle __GFP_ZERO in the caller */ gfpflags &= ~__GFP_ZERO; @@ -1627,12 +1634,24 @@ load_freelist: if (unlikely(SLABDEBUG && PageSlubDebug(page))) goto debug; - __this_cpu_write(s->cpu_slab->node, page_to_nid(page)); - __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object)); - page->inuse = page->objects; - page->freelist = NULL; + if (unlikely(hotpath)) { + /* Object on page free list available and hotpath busy */ + page->inuse++; + page->freelist = get_freepointer(s, object); + + } else { + + /* Prepare new list of objects for hotpath */ + __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object)); + page->inuse = page->objects; + page->freelist = NULL; + __this_cpu_write(s->cpu_slab->node, page_to_nid(page)); + + } + unlock_out: slab_unlock(page); + local_irq_restore(flags); stat(s, ALLOC_SLOWPATH); return object; @@ -1642,21 +1661,38 @@ another_slab: new_slab: page = get_partial(s, gfpflags, node); if (page) { - __this_cpu_write(s->cpu_slab->page, page); stat(s, ALLOC_FROM_PARTIAL); + + if (hotpath) + goto hot_lock; + + __this_cpu_write(s->cpu_slab->page, page); goto load_freelist; } if (gfpflags & __GFP_WAIT) local_irq_enable(); + preempt_enable(); page = new_slab(s, gfpflags, node); + preempt_disable(); + + /* + * We have already decremented our count. Someone else + * could be running right now or we were moved to a + * processor that is in the hotpath. So check against -1. 
+ */ + hotpath = __this_cpu_read(s->cpu_slab->active) != -1; if (gfpflags & __GFP_WAIT) local_irq_disable(); if (page) { stat(s, ALLOC_SLAB); + + if (hotpath) + goto hot_no_lock; + if (__this_cpu_read(s->cpu_slab->page)) flush_slab(s, __this_cpu_ptr(s->cpu_slab)); slab_lock(page); @@ -1664,9 +1700,13 @@ new_slab: __this_cpu_write(s->cpu_slab->page, page); goto load_freelist; } + + local_irq_restore(flags); + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) slab_out_of_memory(s, gfpflags, node); return NULL; + debug: if (!alloc_debug_processing(s, page, object, addr)) goto another_slab; @@ -1675,6 +1715,20 @@ debug: page->freelist = get_freepointer(s, object); __this_cpu_write(s->cpu_slab->node, -1); goto unlock_out; + + /* + * Hotpath is busy and we need to avoid touching + * hotpath variables + */ +hot_no_lock: + slab_lock(page); + +hot_lock: + __ClearPageSlubFrozen(page); + if (get_freepointer(s, page->freelist)) + /* Cannot put page into the hotpath. Instead back to partial */ + add_partial(get_node(s, page_to_nid(page)), page, 0); + goto load_freelist; } /* @@ -1691,7 +1745,6 @@ static __always_inline void *slab_alloc( gfp_t gfpflags, int node, unsigned long addr) { void **object; - unsigned long flags; gfpflags &= gfp_allowed_mask; @@ -1701,18 +1754,27 @@ static __always_inline void *slab_alloc( if (should_failslab(s->objsize, gfpflags)) return NULL; - local_irq_save(flags); + preempt_disable(); + + irqsafe_cpu_inc(s->cpu_slab->active); + barrier(); object = __this_cpu_read(s->cpu_slab->freelist); - if (unlikely(!object || !node_match(s, node))) + if (unlikely(!object || !node_match(s, node) || + __this_cpu_read(s->cpu_slab->active))) object = __slab_alloc(s, gfpflags, node, addr); else { + __this_cpu_write(s->cpu_slab->freelist, get_freepointer(s, object)); + barrier(); + irqsafe_cpu_dec(s->cpu_slab->active); stat(s, ALLOC_FASTPATH); + } - local_irq_restore(flags); + + preempt_enable(); if (unlikely((gfpflags & __GFP_ZERO) && object)) memset(object, 0, 
s->objsize); @@ -1777,7 +1839,9 @@ static void __slab_free(struct kmem_cach { void *prior; void **object = (void *)x; + unsigned long flags; + local_irq_save(flags); stat(s, FREE_SLOWPATH); slab_lock(page); @@ -1809,6 +1873,7 @@ checks_ok: out_unlock: slab_unlock(page); + local_irq_restore(flags); return; slab_empty: @@ -1820,6 +1885,7 @@ slab_empty: stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); + local_irq_restore(flags); stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1845,24 +1911,31 @@ static __always_inline void slab_free(st struct page *page, void *x, unsigned long addr) { void **object = (void *)x; - unsigned long flags; kmemleak_free_recursive(x, s->flags); - local_irq_save(flags); + preempt_disable(); kmemcheck_slab_free(s, object, s->objsize); debug_check_no_locks_freed(object, s->objsize); if (!(s->flags & SLAB_DEBUG_OBJECTS)) debug_check_no_obj_freed(object, s->objsize); + irqsafe_cpu_inc(s->cpu_slab->active); + barrier(); if (likely(page == __this_cpu_read(s->cpu_slab->page) && - __this_cpu_read(s->cpu_slab->node) >= 0)) { - set_freepointer(s, object, __this_cpu_read(s->cpu_slab->freelist)); + __this_cpu_read(s->cpu_slab->node) >= 0) && + !__this_cpu_read(s->cpu_slab->active)) { + set_freepointer(s, object, + __this_cpu_read(s->cpu_slab->freelist)); __this_cpu_write(s->cpu_slab->freelist, object); + barrier(); + irqsafe_cpu_dec(s->cpu_slab->active); + preempt_enable(); stat(s, FREE_FASTPATH); - } else + } else { + irqsafe_cpu_dec(s->cpu_slab->active); + preempt_enable(); __slab_free(s, page, x, addr); - - local_irq_restore(flags); + } } void kmem_cache_free(struct kmem_cache *s, void *x) @@ -2064,6 +2137,8 @@ static DEFINE_PER_CPU(struct kmem_cache_ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) { + int cpu; + if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches) /* * Boot time creation of the kmalloc array. 
Use static per cpu data @@ -2073,6 +2148,9 @@ static inline int alloc_kmem_cache_cpus( else s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); + for_each_possible_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->active = -1; + if (!s->cpu_slab) return 0;