SLUB: Support per cpu allocation list of objects independent of slabs.

This patch changes the way we manage per cpu objects: we now manage linked
lists of objects, and these objects are no longer restricted to coming from
a single slab.

This has the following benefits:

- The fastpath for slab_free() can be taken regularly (vs. about 20%-50%
  of the time with the old approach).

- It is possible to put more objects than fit into a single slab onto the
  per cpu allocation list. This allows more frequent use of the slab_alloc
  fast path without the need for larger pages. Fetches of objects via the
  partial lists can be batched, which reduces locking overhead.

- We can manage list sizes per slab cache on SLUB like SLAB does.

Issues that arise because of this:

- The alloc/free fastpaths get slower since we need to update a counter.
  The impact is likely minimal since we increment a counter in an already
  cache hot cacheline. (A simplified sketch of the resulting fast paths
  follows after this list.)

- Further optimizations of the alloc/free paths are not possible since we
  now have to update two variables in the per cpu structure
  (pointer + counter). Mathieu's cmpxchg_local optimization, which brings
  us around 30-50% speed gains in the fast paths for slab_alloc and
  slab_free, will no longer be possible.

- The per slab control of slowpath/fastpath to support container/cgroup
  accounting and access control is also no longer possible. We will likely
  have to add code to the fastpath to check for a container/cgroup
  restriction. With the current approach we can switch individual slabs to
  slow path mode and then put the logic only into the slow path. I favor
  that approach since it is likely that future enhancements may require
  more specialized object handling, and the existing approach allows us to
  keep additional logic out of the fast path for good.

- NUMA localization control is getting problematic. If slab objects only
  come from a specific page then we can check locality against the node
  that we calculated when activating a cpu slab. This no longer works if
  random objects can show up on the per cpu freelists. For NUMA we have to
  add a number of calls to virt_to_page() and page_to_nid() to determine
  NUMA locality. (See the second sketch after this list.)
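To make the pointer + counter fast path concrete, here is a small,
self-contained userspace C model. It is only a sketch of the idea, not the
patch code: struct kmem_cache_cpu_model, alloc_fast() and free_fast() are
made-up names, and locking, interrupt handling and the real slow paths are
omitted.

#include <stdio.h>

/* Illustrative stand-in for the per cpu structure; not the kernel type. */
struct kmem_cache_cpu_model {
	void **freelist;	/* linked list of free objects */
	int objects;		/* number of objects on the list */
	int refill;		/* slow path refills up to this many objects */
	int max;		/* free path drains once this many are queued */
	unsigned int offset;	/* word offset of the link pointer in an object */
};

/* Alloc fast path: pop the first object and decrement the counter. */
static void *alloc_fast(struct kmem_cache_cpu_model *c)
{
	void **object = c->freelist;

	if (!object)
		return NULL;	/* real code would refill via the slow path */

	c->freelist = object[c->offset];
	c->objects--;
	return object;
}

/* Free fast path: push the object, bump the counter, report if a drain is due. */
static int free_fast(struct kmem_cache_cpu_model *c, void *x)
{
	void **object = x;

	object[c->offset] = c->freelist;
	c->freelist = object;
	c->objects++;
	return c->objects >= c->max;	/* real code would drain down to c->refill */
}

int main(void)
{
	static void *space[3][4];	/* three fake objects, each large enough for a pointer */
	struct kmem_cache_cpu_model c = {
		.freelist = NULL, .objects = 0, .refill = 1, .max = 8, .offset = 0,
	};
	int i;

	for (i = 0; i < 3; i++)
		free_fast(&c, space[i]);

	while (c.freelist) {
		void *p = alloc_fast(&c);

		printf("allocated %p, %d objects left\n", p, c.objects);
	}
	return 0;
}

The point of the model is that both fast paths touch only the freelist
pointer and the adjacent counter, which sit in the same cacheline of the
per cpu structure.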
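For the NUMA point, the locality check changes shape: instead of comparing
the requested node against a cached c->node, the node has to be derived
from the object itself (the patch uses page_to_nid(virt_to_page(object))).
A minimal model of that shape, with a made-up my_object_to_nid() standing
in for the kernel helpers:

#include <stdbool.h>
#include <stdio.h>

/* Made-up stand-in for page_to_nid(virt_to_page(object)). */
static int my_object_to_nid(const void *object)
{
	(void)object;
	return 0;	/* pretend every object lives on node 0 */
}

/*
 * Per-object locality test: node == -1 means "any node is acceptable",
 * otherwise the node is derived from the object rather than from a per
 * cpu field, since the list may now mix objects from many slabs.
 */
static bool node_match_model(const void *object, int node)
{
	return node == -1 || my_object_to_nid(object) == node;
}

int main(void)
{
	int x;

	printf("match any node: %d\n", node_match_model(&x, -1));
	printf("match node 1:   %d\n", node_match_model(&x, 1));
	return 0;
}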
Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    5
 mm/slub.c                |  539 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 352 insertions(+), 192 deletions(-)

Index: linux-2.6.23-rc8-mm2/include/linux/slub_def.h
===================================================================
--- linux-2.6.23-rc8-mm2.orig/include/linux/slub_def.h	2007-10-09 13:44:48.000000000 -0700
+++ linux-2.6.23-rc8-mm2/include/linux/slub_def.h	2007-10-09 13:44:56.000000000 -0700
@@ -13,8 +13,9 @@
 struct kmem_cache_cpu {
         void **freelist;
-        struct page *page;
-        int node;
+        int objects;
+        int refill;
+        int max;
         unsigned int offset;
         unsigned int objsize;
 };

Index: linux-2.6.23-rc8-mm2/mm/slub.c
===================================================================
--- linux-2.6.23-rc8-mm2.orig/mm/slub.c	2007-10-09 13:44:48.000000000 -0700
+++ linux-2.6.23-rc8-mm2/mm/slub.c	2007-10-10 10:47:02.000000000 -0700
@@ -877,9 +877,11 @@ bad:
         return 0;
 }

-static int free_debug_processing(struct kmem_cache *s, struct page *page,
+static int free_debug_processing(struct kmem_cache *s,
                                         void *object, void *addr)
 {
+        struct page *page = virt_to_head_page(object);
+
         if (!check_slab(s, page))
                 goto fail;

@@ -1022,10 +1024,10 @@ static inline void setup_object_debug(st
                         struct page *page, void *object) {}

 static inline int alloc_debug_processing(struct kmem_cache *s,
-        struct page *page, void *object, void *addr) { return 0; }
+        void *object, void *addr) { return 0; }

 static inline int free_debug_processing(struct kmem_cache *s,
-        struct page *page, void *object, void *addr) { return 0; }
+        void *object, void *addr) { return 0; }

 static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
                         { return 1; }
@@ -1173,6 +1175,8 @@ static void discard_slab(struct kmem_cac
 {
         struct kmem_cache_node *n = get_node(s, page_to_nid(page));

+        WARN_ON(!atomic_long_read(&n->nr_slabs));
+
         atomic_long_dec(&n->nr_slabs);
         reset_page_mapcount(page);
         __ClearPageSlab(page);
@@ -1376,36 +1380,106 @@ static void unfreeze_slab(struct kmem_ca
 }

 /*
- * Remove the cpu slab
+ * Object count was reduced. Unlock page and make sure that the page is on the
+ * right list
  */
-static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+static void putback_slab(struct kmem_cache *s, struct page *page, void *prior)
 {
-        struct page *page = c->page;
+        if (unlikely(SlabFrozen(page)))
+                goto out_unlock;
+
+        if (unlikely(!page->inuse)) {
+                /* Page is now empty */
+                if (prior)
+                        /*
+                         * Slab still on the partial list.
+                         */
+                        remove_partial(s, page);
+
+                slab_unlock(page);
+                discard_slab(s, page);
+                return;
+        }
+
         /*
-         * Merge cpu freelist into freelist. Typically we get here
-         * because both freelists are empty. So this is unlikely
-         * to occur.
+         * Objects left in the slab. If it
+         * was not on the partial list before
+         * then add it.
          */
-        while (unlikely(c->freelist)) {
-                void **object;
+        if (unlikely(!prior))
+                add_partial(get_node(s, page_to_nid(page)), page);

-                /* Retrieve object from cpu_freelist */
-                object = c->freelist;
-                c->freelist = c->freelist[c->offset];
+out_unlock:
+        slab_unlock(page);
+}
+
+/*
+ * Free a list of objects and attempt to minimize the lock overhead
+ * while doing so.
+ *
+ * Interrupts are disabled
+ */
+static void free_objects(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                                                        void **list)
+{
+        struct page *page = NULL;
+        struct page *npage;
+        void **object;
+        void **prior = NULL;
+
+        while (list) {
+                object = list;
+                list = object[c->offset];
+
+                npage = virt_to_head_page(object);
+                if (npage != page) {
+                        if (page)
+                                putback_slab(s, page, prior);
+                        /* finish off the old one !*/
+                        page = npage;
+                        slab_lock(page);
+                        prior = page->freelist;
+                }

-                /* And put onto the regular freelist */
                 object[c->offset] = page->freelist;
                 page->freelist = object;
                 page->inuse--;
         }
-        c->page = NULL;
-        unfreeze_slab(s, page);
+
+        if (page)
+                putback_slab(s, page, prior);
+}
+
+/*
+ * Reduce number of objects in a cpu structure
+ */
+static void drain(struct kmem_cache *s, struct kmem_cache_cpu *c, int objects)
+{
+        void **free;
+
+        if (c->objects <= objects)
+                return;
+
+        c->objects = objects;
+        free = c->freelist;
+        if (!objects)
+                c->freelist = NULL;
+        else {
+                /* Keep the first hot objects */
+                void **x = free;
+
+                while (--objects > 0)
+                        x = x[c->offset];
+
+                free = x[c->offset];
+                x[c->offset] = NULL;
+        }
+        free_objects(s, c, free);
 }

 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
-        slab_lock(c->page);
-        deactivate_slab(s, c);
+        drain(s, c, 0);
 }

 /*
@@ -1416,7 +1490,7 @@ static inline void __flush_cpu_slab(stru
 {
         struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

-        if (likely(c && c->page))
+        if (likely(c && c->freelist))
                 flush_slab(s, c);
 }

@@ -1444,16 +1518,131 @@ static void flush_all(struct kmem_cache
  * Check if the objects in a per cpu structure fit numa
  * locality expectations.
  */
-static inline int node_match(struct kmem_cache_cpu *c, int node)
+static inline int node_match(void *object, int node)
 {
 #ifdef CONFIG_NUMA
-        if (node != -1 && c->node != node)
+        if (node != -1 && page_to_nid(virt_to_page(object)) != node)
                 return 0;
 #endif
         return 1;
 }

 /*
+ * Find a new slab.
+ *
+ * Find new slab may reenable interrupts. Thus there is no guarantee that
+ * we are on the same cpu as before when this function returns.
+ */
+static struct page *find_new_slab(struct kmem_cache *s, gfp_t flags, int node)
+{
+        struct page *page;
+
+        page = get_partial(s, flags, node);
+        if (page)
+                return page;
+
+        if (flags & __GFP_WAIT)
+                local_irq_enable();
+
+        page = new_slab(s, flags, node);
+
+        if (flags & __GFP_WAIT)
+                local_irq_disable();
+
+        if (page)
+                slab_lock(page);
+        return page;
+}
+
+/*
+ * Drain objects that are not on the specified node until we find a fitting
+ * one.
+ */
+void numa_drain(struct kmem_cache *s, struct kmem_cache_cpu *c, int node)
+{
+#ifdef CONFIG_NUMA
+        if (node >= 0) {
+                /* NUMA case. Throw the wrong node objects away */
+                void **free = NULL;
+
+                while (c->freelist) {
+                        void **object = c->freelist;
+
+                        if (page_to_nid(virt_to_page(object)) == node)
+                                break;
+
+                        c->freelist = object[c->offset];
+                        object[c->offset] = free;
+                        free = object;
+                        c->objects--;
+                }
+                if (free)
+                        free_objects(s, c, free);
+        }
+#endif
+}
+
+static struct kmem_cache_cpu * fillup(struct kmem_cache *s,
+        struct kmem_cache_cpu *c, gfp_t gfpflags, int node, int refill)
+{
+        struct page *page;
+
+        while (c->objects < refill) {
+                int new_objects;
+                void **new_freelist;
+
+                page = find_new_slab(s, gfpflags, node);
+
+                /*
+                 * We may have switched cpus in find_new_slab. We merge the
+                 * new objects into whatever processor per cpu list we are on
+                 * right now. The bad side effects are limited to getting too
+                 * many objects onto a per cpu list.
+                 */
+                c = get_cpu_slab(s, smp_processor_id());
+
+                if (!page)
+                        return c;
+
+                /* Extract objects and dispose of the slab */
+                new_objects = s->objects - page->inuse;
+                new_freelist = page->freelist;
+                page->freelist = NULL;
+                page->inuse = s->objects;
+                unfreeze_slab(s, page);
+
+                /*
+                 * We may have to merge two linked lists.
+                 * Find the smaller one.
+                 */
+                if (unlikely(c->objects > new_objects)) {
+                        void **temp = c->freelist;
+
+                        c->freelist = new_freelist;
+                        new_freelist = temp;
+                }
+
+                /*
+                 * If there are objects already in the per cpu list
+                 * then we need to find the end of that list and
+                 * attach the new objects to the end of that list.
+                 */
+                if (unlikely(c->objects)) {
+                        void **p = c->freelist;
+
+                        while (p[c->offset])
+                                p = p[c->offset];
+
+                        p[c->offset] = new_freelist;
+                } else
+                        c->freelist = new_freelist;
+
+                c->objects += new_objects;
+        }
+        return c;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
@@ -1470,88 +1659,59 @@ static inline int node_match(struct kmem
  * And if we were unable to get a new slab from the partial slab lists then
  * we need to allocate a new slab. This is slowest path since we may sleep.
  */
-static void *__slab_alloc(struct kmem_cache *s,
-                gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+static void *__slab_alloc(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                gfp_t gfpflags, int node, void *addr)
 {
+        int refill = c->refill;
         void **object;
-        struct page *new;
-
-        if (!c->page)
-                goto new_slab;
-
-        slab_lock(c->page);
-        if (unlikely(!node_match(c, node)))
-                goto another_slab;
-load_freelist:
-        object = c->page->freelist;
-        if (unlikely(!object))
-                goto another_slab;
-        if (unlikely(SlabDebug(c->page)))
-                goto debug;
-
-        object = c->page->freelist;
-        c->freelist = object[c->offset];
-        c->page->inuse = s->objects;
-        c->page->freelist = NULL;
-        c->node = page_to_nid(c->page);
-        slab_unlock(c->page);
-        return object;
-
-another_slab:
-        deactivate_slab(s, c);
-
-new_slab:
-        new = get_partial(s, gfpflags, node);
-        if (new) {
-                c->page = new;
-                goto load_freelist;
+        /*
+         * If we got here in a NUMA system then the first object on the
+         * freelist does not have the right node.
+         *
+         * Simple approach: Dump all objects from the freelist that
+         * do not fit the requirements. If any are left then use that.
+         */
+        if (unlikely(node >= 0 && c->objects)) {
+                numa_drain(s, c, node);
+                if (c->objects)
+                        goto out;
+                refill = 1;
         }

-        if (gfpflags & __GFP_WAIT)
-                local_irq_enable();
+        /*
+         * Check for unusual situations. These are not bugs but may not
+         * have been tested well yet.
+         */
+        if (c->objects || c->freelist)
+                printk("refill(%s, %p, %x, %d, %p) have=%d freelist=%p\n",
+                        s->name, c, gfpflags, node, addr, c->objects, c->freelist);

-        new = new_slab(s, gfpflags, node);
+        /*
+         * Check for debug mode. If we are in a debug mode then the slab
+         * must be processed one object at a time.
+         */
+        if (unlikely(c->max < 0)) {
+                struct page * page;

-        if (gfpflags & __GFP_WAIT)
-                local_irq_disable();
+                do {
+                        page = find_new_slab(s, gfpflags, node);
+                        object = page->freelist;
+                } while (!alloc_debug_processing(s, page, object, addr));

-        if (new) {
-                c = get_cpu_slab(s, smp_processor_id());
-                if (c->page) {
-                        /*
-                         * Someone else populated the cpu_slab while we
-                         * enabled interrupts, or we have gotten scheduled
-                         * on another cpu. The page may not be on the
-                         * requested node even if __GFP_THISNODE was
-                         * specified. So we need to recheck.
-                         */
-                        if (node_match(c, node)) {
-                                /*
-                                 * Current cpuslab is acceptable and we
-                                 * want the current one since its cache hot
-                                 */
-                                discard_slab(s, new);
-                                slab_lock(c->page);
-                                goto load_freelist;
-                        }
-                        /* New slab does not fit our expectations */
-                        flush_slab(s, c);
-                }
-                slab_lock(new);
-                SetSlabFrozen(new);
-                c->page = new;
-                goto load_freelist;
+                page->inuse++;
+                page->freelist = object[c->offset];
+                unfreeze_slab(s, page);
+                return object;
+        }
+
+        c = fillup(s, c, gfpflags, node, refill);
+out:
+        object = c->freelist;
+        if (object) {
+                c->freelist = c->freelist[c->offset];
+                c->objects--;
         }
-        return NULL;
-debug:
-        object = c->page->freelist;
-        if (!alloc_debug_processing(s, c->page, object, addr))
-                goto another_slab;
-
-        c->page->inuse++;
-        c->page->freelist = object[c->offset];
-        c->node = -1;
-        slab_unlock(c->page);
         return object;
 }

@@ -1574,17 +1734,17 @@ static void __always_inline *slab_alloc(

         local_irq_save(flags);
         c = get_cpu_slab(s, smp_processor_id());
-        if (unlikely(!c->freelist || !node_match(c, node)))
-
-                object = __slab_alloc(s, gfpflags, node, addr, c);
-
+        if (unlikely(!c->freelist || !node_match(c->freelist, node)))
+                object = __slab_alloc(s, c, gfpflags, node, addr);
         else {
+                object = c->freelist;
                 c->freelist = object[c->offset];
+                c->objects--;
         }
         local_irq_restore(flags);

-        if (unlikely((gfpflags & __GFP_ZERO) && object))
+        if (unlikely(gfpflags & __GFP_ZERO) && object)
                 memset(object, 0, c->objsize);

         return object;

@@ -1605,61 +1765,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_node);
 #endif

 /*
- * Slow patch handling. This may still be called frequently since objects
- * have a longer lifetime than the cpu slabs in most processing loads.
- *
- * So we still attempt to reduce cache line usage. Just take the slab
- * lock and free the item. If there is no additional partial page
- * handling required then we can return immediately.
+ * Slow path of __slab_free. Implements debugging and draining.
  */
-static void __slab_free(struct kmem_cache *s, struct page *page,
-                                void *x, void *addr, unsigned int offset)
+static void __slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c,
+                                void *addr)
 {
-        void *prior;
-        void **object = (void *)x;
-
-        slab_lock(page);
-
-        if (unlikely(SlabDebug(page)))
-                goto debug;
-checks_ok:
-        prior = object[offset] = page->freelist;
-        page->freelist = object;
-        page->inuse--;
-
-        if (unlikely(SlabFrozen(page)))
-                goto out_unlock;
-
-        if (unlikely(!page->inuse))
-                goto slab_empty;
-
-        /*
-         * Objects left in the slab. If it
-         * was not on the partial list before
-         * then add it.
-         */
-        if (unlikely(!prior))
-                add_partial(get_node(s, page_to_nid(page)), page);
-
-out_unlock:
-        slab_unlock(page);
-        return;
-
-slab_empty:
-        if (prior)
-                /*
-                 * Slab still on the partial list.
-                 */
-                remove_partial(s, page);
-
-        slab_unlock(page);
-        discard_slab(s, page);
-        return;
-
-debug:
-        if (!free_debug_processing(s, page, x, addr))
-                goto out_unlock;
-        goto checks_ok;
+        if (unlikely(c->max < 0)) {
+                if (!free_debug_processing(s, c->freelist, addr))
+                        return;
+                drain(s, c, 0);
+        } else
+                drain(s, c, c->refill);
 }

 /*
@@ -1673,8 +1789,8 @@ debug:
  * If fastpath is not possible then fall back to __slab_free where we deal
  * with all sorts of special processing.
  */
-static void __always_inline slab_free(struct kmem_cache *s,
-                        struct page *page, void *x, void *addr)
+static void __always_inline slab_free(struct kmem_cache *s, void *x,
+                        void *addr)
 {
         void **object = (void *)x;
         unsigned long flags;

@@ -1683,22 +1799,18 @@ static void __always_inline slab_free(st
         local_irq_save(flags);
         debug_check_no_locks_freed(object, s->objsize);
         c = get_cpu_slab(s, smp_processor_id());
-        if (likely(page == c->page && c->node >= 0)) {
-                object[c->offset] = c->freelist;
-                c->freelist = object;
-        } else
-                __slab_free(s, page, x, addr, c->offset);
+        object[c->offset] = c->freelist;
+        c->freelist = object;
+        c->objects++;
+        if (c->objects >= c->max)
+                __slab_free(s, c, addr);

         local_irq_restore(flags);
 }

 void kmem_cache_free(struct kmem_cache *s, void *x)
 {
-        struct page *page;
-
-        page = virt_to_head_page(x);
-
-        slab_free(s, page, x, __builtin_return_address(0));
+        slab_free(s, x, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kmem_cache_free);

@@ -1870,11 +1982,18 @@ static unsigned long calculate_alignment
 static void init_kmem_cache_cpu(struct kmem_cache *s,
                         struct kmem_cache_cpu *c)
 {
-        c->page = NULL;
         c->freelist = NULL;
-        c->node = 0;
+        c->objects = 0;
         c->offset = s->offset / sizeof(void *);
         c->objsize = s->objsize;
+        if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE |
+                        SLAB_POISON | SLAB_STORE_USER | SLAB_TRACE)) {
+                c->refill = 1;
+                c->max = -1;
+        } else {
+                c->refill = max(s->objects / 2, 10);
+                c->max = max(4 * s->objects, 20);
+        }
 }

 static void init_kmem_cache_node(struct kmem_cache_node *n)
@@ -2622,7 +2741,7 @@ void kfree(const void *x)
                 put_page(page);
                 return;
         }
-        slab_free(page->slab, page, (void *)x, __builtin_return_address(0));
+        slab_free(page->slab, (void *)x, __builtin_return_address(0));
 }
 EXPORT_SYMBOL(kfree);

@@ -3410,29 +3529,23 @@ static unsigned long slab_objects(struct
         nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL);
         per_cpu = nodes + nr_node_ids;

-        for_each_possible_cpu(cpu) {
-                struct page *page;
+        for_each_online_cpu(cpu) {
                 int node;
                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

                 if (!c)
                         continue;

-                page = c->page;
-                node = c->node;
-                if (node < 0)
-                        continue;
-                if (page) {
-                        if (flags & SO_CPU) {
-                                int x = 0;
-
-                                if (flags & SO_OBJECTS)
-                                        x = page->inuse;
-                                else
-                                        x = 1;
-                                total += x;
-                                nodes[node] += x;
-                        }
+                node = cpu_to_node(cpu);
+                if (flags & SO_CPU) {
+                        int x = 0;
+
+                        if (flags & SO_OBJECTS)
+                                x = c->objects;
+                        else
+                                x = 0;
+                        total += x;
+                        nodes[node] += x;
                         per_cpu[node]++;
                 }
         }
@@ -3482,7 +3595,7 @@ static int any_slab_objects(struct kmem_
         for_each_possible_cpu(cpu) {
                 struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

-                if (c && c->page)
+                if (c && c->freelist)
                         return 1;
         }

@@ -3538,6 +3651,50 @@ static ssize_t objs_per_slab_show(struct
 }
 SLAB_ATTR_RO(objs_per_slab);

+static ssize_t refill_show(struct kmem_cache *s, char *buf)
+{
+        int refill = get_cpu_slab(s, 0)->refill;
+
+        return sprintf(buf, "%d\n", refill);
+}
+static ssize_t refill_store(struct kmem_cache *s,
+                        const char *buf, size_t length)
+{
+        int n = simple_strtoul(buf, NULL, 10);
+        int cpu;
+
+        if (n < 1 || n > 200)
+                return -EINVAL;
+
+        for_each_online_cpu(cpu)
+                get_cpu_slab(s, cpu)->refill = n;
+
+        return length;
+}
+SLAB_ATTR(refill);
+
+static ssize_t max_show(struct kmem_cache *s, char *buf)
+{
+        int max = get_cpu_slab(s, 0)->max;
+
+        return sprintf(buf, "%d\n", max);
+}
+static ssize_t max_store(struct kmem_cache *s,
+                        const char *buf, size_t length)
+{
+        int n = simple_strtoul(buf, NULL, 10);
+        int cpu;
+
+        if (n < 1 || n > 10000)
+                return -EINVAL;
+
+        for_each_online_cpu(cpu)
+                get_cpu_slab(s, cpu)->max = n;
+
+        return length;
+}
+SLAB_ATTR(max);
+
 static ssize_t order_show(struct kmem_cache *s, char *buf)
 {
         return sprintf(buf, "%d\n", s->order);
@@ -3573,11 +3730,11 @@ static ssize_t partial_show(struct kmem_
 }
 SLAB_ATTR_RO(partial);

-static ssize_t cpu_slabs_show(struct kmem_cache *s, char *buf)
+static ssize_t cpu_objects_show(struct kmem_cache *s, char *buf)
 {
-        return slab_objects(s, buf, SO_CPU);
+        return slab_objects(s, buf, SO_CPU|SO_OBJECTS);
 }
-SLAB_ATTR_RO(cpu_slabs);
+SLAB_ATTR_RO(cpu_objects);

 static ssize_t objects_show(struct kmem_cache *s, char *buf)
 {
@@ -3785,9 +3942,11 @@ static struct attribute * slab_attrs[] =
         &objs_per_slab_attr.attr,
         &order_attr.attr,
         &objects_attr.attr,
+        &refill_attr.attr,
+        &max_attr.attr,
         &slabs_attr.attr,
         &partial_attr.attr,
-        &cpu_slabs_attr.attr,
+        &cpu_objects_attr.attr,
         &ctor_attr.attr,
         &aliases_attr.attr,
         &align_attr.attr,