From f645ac4ca5000838d73d165fd7c9cedc1985861f Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers
Date: Thu, 6 Mar 2008 21:44:48 -0500
Subject: [PATCH] Implement slub fastpath with sequence number

Changelog:
- Fix slab_alloc end of slab check
- Remove unneeded bit shift ops on fast path by changing freebase (LSBs -> MSBs)

This version does not use the slab END value. We'll have to see which
approach is best. Here, I changed slab_alloc to test the whole pointer
against NULL before branching over __slab_alloc.

---
 include/linux/slub_def.h |   10 ++
 mm/slub.c                |  209 ++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 207 insertions(+), 12 deletions(-)

Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2008-03-07 00:18:15.217079441 -0800
+++ linux-2.6/include/linux/slub_def.h	2008-03-07 00:22:55.035941344 -0800
@@ -32,9 +32,19 @@ enum stat_item {
 	ORDER_FALLBACK,		/* Number of times fallback was necessary */
 	NR_SLUB_STAT_ITEMS };
 
+/*
+ * Currently the fastpath is not supported if preemption is enabled.
+ */
+#if defined(CONFIG_FAST_CMPXCHG_LOCAL) && !defined(CONFIG_PREEMPT)
+#define SLUB_FASTPATH
+#endif
+
 struct kmem_cache_cpu {
 	void **freelist;	/* Pointer to first free per cpu object */
 	struct page *page;	/* The slab from which we are allocating */
+#ifdef SLUB_FASTPATH
+	void *base;		/* Base for fastpath. */
+#endif
 	int node;		/* The node of the page (or -1 for debug) */
 	unsigned int offset;	/* Freepointer offset (in word units) */
 	unsigned int objsize;	/* Size of an object (from kmem_cache) */
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-03-07 00:18:15.229079641 -0800
+++ linux-2.6/mm/slub.c	2008-03-07 00:54:48.259152279 -0800
@@ -138,6 +138,72 @@ static inline void ClearSlabDebug(struct
 	page->flags &= ~SLABDEBUG;
 }
 
+#ifdef SLUB_FASTPATH
+/*
+ * The fastpath divides the free pointer into two halves.
+ *
+ * The lower bits provide an offset relative to the base pointer (that is
+ * kept in a different field). The upper bits provide a sequence number so
+ * that we can decide if a given pointer is still valid through a cmpxchg.
+ */
+#define HALF_BITS_PER_LONG (BITS_PER_LONG / 2)
+#define HALF_MASK ((1UL << HALF_BITS_PER_LONG) - 1)
+#define END ((void *)1)
+
+static inline int is_end(const void *x)
+{
+	return (unsigned long)x & 1;
+}
+
+static inline unsigned long sequence(const void *x)
+{
+	return (unsigned long)x >> HALF_BITS_PER_LONG;
+}
+
+static inline unsigned offset(const void *x)
+{
+	return (unsigned long)x & HALF_MASK;
+}
+
+static inline void *make_version(unsigned long sequence, void *object)
+{
+	return (void *)((sequence << HALF_BITS_PER_LONG) |
+		((unsigned long)object & HALF_MASK));
+}
+
+static inline void **get_freelist_ptr(struct kmem_cache_cpu *c)
+{
+	VM_BUG_ON(!irqs_disabled());
+	return c->base + offset(c->freelist);
+}
+
+static inline void set_freelist_ptr(struct kmem_cache_cpu *c, void **freelist)
+{
+	VM_BUG_ON(!irqs_disabled());
+	c->base = (void *)((unsigned long)freelist & ~HALF_MASK);
+	c->freelist = make_version(sequence(c->freelist) + 1, freelist);
+}
+
+#else
+#define END NULL
+
+static inline int is_end(const void *x)
+{
+	return x == NULL;
+}
+
+static inline void **get_freelist_ptr(struct kmem_cache_cpu *c)
+{
+	return c->freelist;
+}
+
+static inline void set_freelist_ptr(struct kmem_cache_cpu *c, void **freelist)
+{
+	c->freelist = freelist;
+}
+
+#endif
+
 /*
  * Issues still to be resolved:
  *
@@ -273,13 +339,6 @@ static inline struct kmem_cache_cpu *get
 #endif
 }
 
-#define END NULL
-
-static inline int is_end(const void *x)
-{
-	return x == NULL;
-}
-
 /* Determine the maximum number of objects that a slab page can hold */
 static inline unsigned long slab_objects(struct kmem_cache *s, struct page *page)
 {
@@ -1406,13 +1465,14 @@ static void deactivate_slab(struct kmem_
 	if (page->freelist)
 		stat(c, DEACTIVATE_REMOTE_FREES);
+
 	/*
 	 * Merge cpu freelist into slab freelist. Typically we get here
 	 * because both freelists are empty. So this is unlikely
 	 * to occur.
 	 */
-	freelist = c->freelist;
-	while (unlikely(freelist)) {
+	freelist = get_freelist_ptr(c);
+	while (unlikely(!is_end(freelist))) {
 		void **object;
 
 		tail = 0;	/* Hot objects. Put the slab first */
 
@@ -1427,7 +1487,7 @@ static void deactivate_slab(struct kmem_
 		page->inuse--;
 	}
 	c->page = NULL;
-	c->freelist = NULL;
+	set_freelist_ptr(c, END);
 	unfreeze_slab(s, page, tail);
 }
 
@@ -1507,7 +1567,11 @@ static void *__slab_alloc(struct kmem_ca
 {
 	void **object;
 	struct page *new;
+#ifdef SLUB_FASTPATH
+	unsigned long flags;
 
+	local_irq_save(flags);
+#endif
 	if (!c->page)
 		goto new_slab;
 
@@ -1524,13 +1588,16 @@ load_freelist:
 	if (unlikely(SlabDebug(c->page)))
 		goto debug;
 
-	c->freelist = object[c->offset];
+	set_freelist_ptr(c, object[c->offset]);
 	c->page->inuse = slab_objects(s, c->page);
 	c->page->freelist = END;
 	c->node = page_to_nid(c->page);
 unlock_out:
 	slab_unlock(c->page);
 	stat(c, ALLOC_SLOWPATH);
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return object;
 
 another_slab:
@@ -1562,7 +1629,9 @@ new_slab:
 		c->page = new;
 		goto load_freelist;
 	}
-
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return NULL;
 debug:
 	if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1589,6 +1658,62 @@ static __always_inline void *slab_alloc(
 {
 	void **object;
 	struct kmem_cache_cpu *c;
+
+/*
+ * The SLUB_FASTPATH path is provisional and is currently disabled if the
+ * kernel is compiled with preemption or if the arch does not support
+ * fast cmpxchg operations. There are a couple of coming changes that will
+ * simplify matters and allow preemption. Ultimately we may end up making
+ * SLUB_FASTPATH the default.
+ *
+ * 1. The introduction of the per cpu allocator will avoid array lookups
+ *    through get_cpu_slab(). A special register can be used instead.
+ *
+ * 2. The introduction of per cpu atomic operations (cpu_ops) means that
+ *    we can realize the logic here entirely with per cpu atomics. The
+ *    per cpu atomic ops will take care of the preemption issues.
+ */
+
+#ifdef SLUB_FASTPATH
+	void *old, *new, *result;
+
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	do {
+		old = c->freelist;
+		/*
+		 * Whenever c->base is changed, the sequence number
+		 * _must_ be incremented. This barrier ensures we read
+		 * the version before c->base wrt interrupts.
+		 */
+		barrier();
+		object = c->base + offset(old);
+		/*
+		 * Must check base here, otherwise we may form an invalid
+		 * address.
+		 */
+		if (unlikely(!c->base || is_end(object) || !node_match(c, node))) {
+			object = __slab_alloc(s, gfpflags, node, addr, c);
+			break;
+		}
+		stat(c, ALLOC_FASTPATH);
+		/* No need to increment the MSB counter here, because only
+		 * object free will lead to counter re-use. */
+		new = make_version(sequence(old), object[c->offset]);
+		result = cmpxchg_local(&c->freelist, old, new);
+#ifdef CONFIG_DEBUG_VM
+		/*
+		 * Just to be paranoid: warn if we detect that enough free or
+		 * slow paths nested on top of us to get the counter to go
+		 * half-way to overflow. It would be insane to do that many
+		 * allocations/frees in interrupt handlers, but check it anyway.
+		 * Mask out the LSBs because the alloc fast path does not increment
+		 * the sequence number, which may cause the overall values to go
+		 * backward.
+		 */
+		WARN_ON(sequence(result) - sequence(old) > HALF_MASK);
+#endif
+	} while (result != old);
+#else
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -1603,6 +1728,7 @@ static __always_inline void *slab_alloc(
 		stat(c, ALLOC_FASTPATH);
 	}
 	local_irq_restore(flags);
+#endif
 
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, c->objsize);
@@ -1639,6 +1765,11 @@ static void __slab_free(struct kmem_cach
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
 
+#ifdef SLUB_FASTPATH
+	unsigned long flags;
+
+	local_irq_save(flags);
+#endif
 	c = get_cpu_slab(s, raw_smp_processor_id());
 	stat(c, FREE_SLOWPATH);
 	slab_lock(page);
@@ -1670,6 +1801,9 @@ checks_ok:
 
 out_unlock:
 	slab_unlock(page);
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return;
 
 slab_empty:
@@ -1683,6 +1817,9 @@ slab_empty:
 	slab_unlock(page);
 	stat(c, FREE_SLAB);
 	discard_slab(s, page);
+#ifdef SLUB_FASTPATH
+	local_irq_restore(flags);
+#endif
 	return;
 
 debug:
@@ -1707,6 +1844,48 @@ static __always_inline void slab_free(st
 {
 	void **object = (void *)x;
 	struct kmem_cache_cpu *c;
+
+#ifdef SLUB_FASTPATH
+	void *old, *new, *result;
+
+	c = get_cpu_slab(s, raw_smp_processor_id());
+	debug_check_no_locks_freed(object, s->objsize);
+	do {
+		old = c->freelist;
+		barrier();
+		/*
+		 * If the compiler were to reorder the retrieval of c->page and
+		 * c->base to come before the version in c->freelist, an interrupt
+		 * could change the cpu slab before we retrieve the version.
+		 * We could be matching on a page no longer active and put the
+		 * object onto the freelist of the wrong slab.
+		 *
+		 * On the other hand, if we already have the version,
+		 * then any change of cpu_slab will cause the cmpxchg to fail,
+		 * since the freelist pointers are unique per slab.
+		 */
+		if (unlikely(page != c->page || c->node < 0)) {
+			__slab_free(s, page, x, addr, c->offset);
+			break;
+		}
+		object[c->offset] = c->base + offset(old);
+		stat(c, FREE_FASTPATH);
+		new = make_version(sequence(old) + 1, object);
+		result = cmpxchg_local(&c->freelist, old, new);
+#ifdef CONFIG_DEBUG_VM
+		/*
+		 * Just to be paranoid: warn if we detect that enough free or
+		 * slow paths nested on top of us to get the counter to go
+		 * half-way to overflow. It would be insane to do that many
+		 * allocations/frees in interrupt handlers, but check it anyway.
+		 * Mask out the LSBs because the alloc fast path does not increment
+		 * the sequence number, which may cause the overall values to go
+		 * backward.
+		 */
+		WARN_ON(sequence(result) - sequence(old) > HALF_MASK);
+#endif
+	} while (result != old);
+#else
 	unsigned long flags;
 
 	local_irq_save(flags);
@@ -1720,6 +1899,7 @@ static __always_inline void slab_free(st
 		__slab_free(s, page, x, addr, c->offset);
 
 	local_irq_restore(flags);
+#endif
 }
 
 void kmem_cache_free(struct kmem_cache *s, void *x)
@@ -1897,7 +2077,12 @@ static void init_kmem_cache_cpu(struct k
 			struct kmem_cache_cpu *c)
 {
 	c->page = NULL;
+#ifdef SLUB_FASTPATH
+	c->freelist = make_version(1, END);
+	c->base = NULL;
+#else
 	c->freelist = END;
+#endif
 	c->node = 0;
 	c->offset = s->offset / sizeof(void *);
 	c->objsize = s->objsize;
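
For illustration only (this sketch is not part of the patch, and names such as
demo_obj, demo_pop and demo_push are made up): the user-space C11 program below
shows the same idea in a self-contained form. One word holds a sequence number
in its upper half and an object identifier in its lower half, and the whole
word is updated with a compare-and-exchange, so a stale snapshot of the list
head is detected instead of silently corrupting the freelist. Two
simplifications versus the kernel code: an array index stands in for the byte
offset from c->base, and C11 atomics stand in for cmpxchg_local() on a per-cpu
structure.

/* demo_freelist.c - illustrative sketch of the sequence-number scheme. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <limits.h>

#define HALF_BITS (sizeof(uintptr_t) * CHAR_BIT / 2)
#define HALF_MASK (((uintptr_t)1 << HALF_BITS) - 1)
#define DEMO_END  HALF_MASK		/* lower-half value meaning "list empty" */
#define POOL_SIZE 8

struct demo_obj {
	uintptr_t next;			/* index of next free object, or DEMO_END */
	int payload;
};

static struct demo_obj pool[POOL_SIZE];
/* head word: sequence number in the upper half, free index in the lower half */
static _Atomic uintptr_t head;

static uintptr_t pack(uintptr_t seq, uintptr_t idx)
{
	return (seq << HALF_BITS) | (idx & HALF_MASK);
}

static uintptr_t seq_of(uintptr_t w) { return w >> HALF_BITS; }
static uintptr_t idx_of(uintptr_t w) { return w & HALF_MASK; }

static void demo_init(void)
{
	for (size_t i = 0; i < POOL_SIZE; i++)
		pool[i].next = (i + 1 < POOL_SIZE) ? i + 1 : DEMO_END;
	atomic_store(&head, pack(1, 0));
}

/* Pop: analogous to the slab_alloc() fastpath; the sequence is left alone. */
static struct demo_obj *demo_pop(void)
{
	uintptr_t old = atomic_load(&head);
	uintptr_t new;

	do {
		if (idx_of(old) == DEMO_END)
			return NULL;
		new = pack(seq_of(old), pool[idx_of(old)].next);
	} while (!atomic_compare_exchange_weak(&head, &old, new));

	return &pool[idx_of(old)];
}

/* Push: analogous to the slab_free() fastpath; the sequence is bumped. */
static void demo_push(struct demo_obj *obj)
{
	uintptr_t idx = (uintptr_t)(obj - pool);
	uintptr_t old = atomic_load(&head);
	uintptr_t new;

	do {
		obj->next = idx_of(old);
		new = pack(seq_of(old) + 1, idx);
	} while (!atomic_compare_exchange_weak(&head, &old, new));
}

int main(void)
{
	demo_init();
	struct demo_obj *a = demo_pop();
	struct demo_obj *b = demo_pop();

	demo_push(a);
	printf("popped %td and %td, pushed %td back; head index now %zu\n",
	       a - pool, b - pool, a - pool,
	       (size_t)idx_of(atomic_load(&head)));
	return 0;
}

As in the patch, only the push (free) side bumps the sequence number; a pop can
leave it alone because an object can only return to the head of the list
through a later free, which already changes the sequence and therefore makes
any stale compare-and-exchange fail.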