Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-04-09 22:01:41.000000000 -0700
+++ linux-2.6/mm/slub.c	2008-04-10 10:18:19.000000000 -0700
@@ -5,7 +5,7 @@
  * The allocator synchronizes using per slab locks and only
  * uses a centralized lock to manage a pool of partial slabs.
  *
- * (C) 2007 SGI, Christoph Lameter
+ * (C) 2007, 2008 SGI, Christoph Lameter
  */

 #include <linux/mm.h>
@@ -101,6 +101,7 @@
  */

 #define FROZEN (1 << PG_active)
+#define KICKABLE (1 << PG_dirty)

 #ifdef CONFIG_SLUB_DEBUG
 #define SLABDEBUG (1 << PG_error)
@@ -138,6 +139,21 @@
 	page->flags &= ~SLABDEBUG;
 }

+static inline int SlabKickable(struct page *page)
+{
+	return page->flags & KICKABLE;
+}
+
+static inline void SetSlabKickable(struct page *page)
+{
+	page->flags |= KICKABLE;
+}
+
+static inline void ClearSlabKickable(struct page *page)
+{
+	page->flags &= ~KICKABLE;
+}
+
 /*
  * Issues still to be resolved:
  *
@@ -149,25 +165,6 @@
 /* Enable to test recovery from slab corruption on boot */
 #undef SLUB_RESILIENCY_TEST

-#if PAGE_SHIFT <= 12
-
-/*
- * Small page size. Make sure that we do not fragment memory
- */
-#define DEFAULT_MAX_ORDER 1
-#define DEFAULT_MIN_OBJECTS 4
-
-#else
-
-/*
- * Large page machines are customarily able to handle larger
- * page orders.
- */
-#define DEFAULT_MAX_ORDER 2
-#define DEFAULT_MIN_OBJECTS 8
-
-#endif
-
 /*
  * Mininum number of partial slabs. These will be left on the partial
  * lists even if they are empty. kmem_cache_shrink may reclaim them.
  */
@@ -176,10 +173,10 @@

 /*
  * Maximum number of desirable partial slabs.
- * The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in the.
+ * More slabs cause kmem_cache_shrink to sort the slabs by objects
+ * and triggers slab defragmentation.
  */
-#define MAX_PARTIAL 10
+#define MAX_PARTIAL 20

 #define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
 		SLAB_POISON | SLAB_STORE_USER)
@@ -204,8 +201,6 @@
 /* Internal SLUB flags */
 #define __OBJECT_POISON 0x80000000 /* Poison object */
 #define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
-#define __KMALLOC_CACHE 0x20000000 /* objects freed using kfree */
-#define __PAGE_ALLOC_FALLBACK 0x10000000 /* Allow fallback to page alloc */

 /* Not all arches define cache_line_size */
 #ifndef cache_line_size
@@ -229,6 +224,9 @@
 static DECLARE_RWSEM(slub_lock);
 static LIST_HEAD(slab_caches);

+/* Maximum objects in defragmentable slabs */
+static unsigned int max_defrag_slab_objects = 0;
+
 /*
  * Tracking user of a slab.
 */
@@ -301,7 +299,7 @@
 		return 1;

 	base = page_address(page);
-	if (object < base || object >= base + s->objects * s->size ||
+	if (object < base || object >= base + page->objects * s->size ||
 		(object - base) % s->size) {
 		return 0;
 	}
@@ -327,8 +325,8 @@
 }

 /* Loop over all objects in a slab */
-#define for_each_object(__p, __s, __addr) \
-	for (__p = (__addr); __p < (__addr) + (__s)->objects * (__s)->size;\
+#define for_each_object(__p, __s, __addr, __objects) \
+	for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
 			__p += (__s)->size)

 /* Scan freelist */
@@ -341,6 +339,25 @@
 	return (p - addr) / s->size;
 }

+static inline struct kmem_cache_order_objects oo_make(int order,
+						unsigned long size)
+{
+	struct kmem_cache_order_objects x =
+		{ (order << 16) + (PAGE_SIZE << order) / size };
+
+	return x;
+}
+
+static inline int oo_order(struct kmem_cache_order_objects x)
+{
+	return x.x >> 16;
+}
+
+static inline int oo_objects(struct kmem_cache_order_objects x)
+{
+	return x.x & ((1 << 16) - 1);
+}
+
 #ifdef CONFIG_SLUB_DEBUG
 /*
  * Debug settings:
@@ -451,8 +468,8 @@

 static void print_page_info(struct page *page)
 {
-	printk(KERN_ERR "INFO: Slab 0x%p used=%u fp=0x%p flags=0x%04lx\n",
-		page, page->inuse, page->freelist, page->flags);
+	printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n",
+		page, page->objects, page->inuse, page->freelist, page->flags);

 }
@@ -652,6 +669,7 @@
 		p + off, POISON_INUSE, s->size - off);
 }

+/* Check the pad bytes at the end of a slab page */
 static int slab_pad_check(struct kmem_cache *s, struct page *page)
 {
 	u8 *start;
@@ -664,20 +682,20 @@
 		return 1;

 	start = page_address(page);
-	end = start + (PAGE_SIZE << s->order);
-	length = s->objects * s->size;
-	remainder = end - (start + length);
+	length = (PAGE_SIZE << compound_order(page));
+	end = start + length;
+	remainder = length % s->size;
 	if (!remainder)
 		return 1;

-	fault = check_bytes(start + length, POISON_INUSE, remainder);
+	fault = check_bytes(end - remainder, POISON_INUSE, remainder);
 	if (!fault)
 		return 1;
 	while (end > fault && end[-1] == POISON_INUSE)
 		end--;

 	slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
-	print_section("Padding", start, length);
+	print_section("Padding", end - remainder, remainder);

 	restore_bytes(s, "slab padding", POISON_INUSE, start, end);
 	return 0;
@@ -739,15 +757,24 @@

 static int check_slab(struct kmem_cache *s, struct page *page)
 {
+	int maxobj;
+
 	VM_BUG_ON(!irqs_disabled());

 	if (!PageSlab(page)) {
 		slab_err(s, page, "Not a valid slab page");
 		return 0;
 	}
-	if (page->inuse > s->objects) {
+
+	maxobj = (PAGE_SIZE << compound_order(page)) / s->size;
+	if (page->objects > maxobj) {
+		slab_err(s, page, "objects %u > max %u",
+			s->name, page->objects, maxobj);
+		return 0;
+	}
+	if (page->inuse > page->objects) {
 		slab_err(s, page, "inuse %u > max %u",
-			s->name, page->inuse, s->objects);
+			s->name, page->inuse, page->objects);
 		return 0;
 	}
 	/* Slab_pad_check fixes things up after itself */
@@ -764,8 +791,9 @@
 	int nr = 0;
 	void *fp = page->freelist;
 	void *object = NULL;
+	unsigned long max_objects;

-	while (fp && nr <= s->objects) {
+	while (fp && nr <= page->objects) {
 		if (fp == search)
 			return 1;
 		if (!check_valid_pointer(s, page, fp)) {
@@ -777,7 +805,7 @@
 		} else {
 			slab_err(s, page, "Freepointer corrupt");
 			page->freelist = NULL;
-			page->inuse = s->objects;
+			page->inuse = page->objects;
 			slab_fix(s, "Freelist cleared");
 			return 0;
 		}
@@ -788,10 +816,20 @@
 		nr++;
 	}

-	if (page->inuse != s->objects - nr) {
+	max_objects = (PAGE_SIZE << compound_order(page)) / s->size;
+	if (max_objects > 65535)
+		max_objects = 65535;
+
+	if (page->objects != max_objects) {
+		slab_err(s, page, "Wrong number of objects. Found %d but "
+			"should be %d", page->objects, max_objects);
+		page->objects = max_objects;
+		slab_fix(s, "Number of objects adjusted.");
+	}
+	if (page->inuse != page->objects - nr) {
 		slab_err(s, page, "Wrong object count. Counter is %d but "
-			"counted were %d", page->inuse, s->objects - nr);
-		page->inuse = s->objects - nr;
+			"counted were %d", page->inuse, page->objects - nr);
+		page->inuse = page->objects - nr;
 		slab_fix(s, "Object count adjusted.");
 	}
 	return search == NULL;
@@ -837,6 +875,38 @@
 	spin_unlock(&n->list_lock);
 }

+/* Tracking of the number of slabs for debugging purposes */
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	return atomic_long_read(&n->nr_slabs);
+}
+
+static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	/*
+	 * May be called early in order to allocate a slab for the
+	 * kmem_cache_node structure. Solve the chicken-egg
+	 * dilemma by deferring the increment of the count during
+	 * bootstrap (see early_kmem_cache_node_alloc).
+	 */
+	if (!NUMA_BUILD || n) {
+		atomic_long_inc(&n->nr_slabs);
+		atomic_long_add(objects, &n->total_objects);
+	}
+}
+static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
+{
+	struct kmem_cache_node *n = get_node(s, node);
+
+	atomic_long_dec(&n->nr_slabs);
+	atomic_long_sub(objects, &n->total_objects);
+}
+
+/* Object debug checks for alloc/free paths */
 static void setup_object_debug(struct kmem_cache *s, struct page *page,
 								void *object)
 {
@@ -881,7 +951,7 @@
 		 * as used avoids touching the remaining objects.
 		 */
 		slab_fix(s, "Marking all objects used");
-		page->inuse = s->objects;
+		page->inuse = page->objects;
 		page->freelist = NULL;
 	}
 	return 0;
@@ -1028,29 +1098,55 @@
 	return flags;
 }
 #define slub_debug 0
+
+static inline unsigned long slabs_node(struct kmem_cache *s, int node)
+							{ return 0; }
+static inline void inc_slabs_node(struct kmem_cache *s, int node,
+							int objects) {}
+static inline void dec_slabs_node(struct kmem_cache *s, int node,
+							int objects) {}
 #endif
+
 /*
  * Slab allocation and freeing
  */
+static inline struct page *alloc_slab_page(gfp_t flags, int node,
+					struct kmem_cache_order_objects oo)
+{
+	int order = oo_order(oo);
+
+	if (node == -1)
+		return alloc_pages(flags, order);
+	else
+		return alloc_pages_node(node, flags, order);
+}
+
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
-	int pages = 1 << s->order;
+	struct kmem_cache_order_objects oo = s->oo;

 	flags |= s->allocflags;

-	if (node == -1)
-		page = alloc_pages(flags, s->order);
-	else
-		page = alloc_pages_node(node, flags, s->order);
-
-	if (!page)
-		return NULL;
+	page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node,
+									oo);
+	if (unlikely(!page)) {
+		oo = s->min;
+		/*
+		 * Allocation may have failed due to fragmentation.
+		 * Try a lower order alloc if possible
+		 */
+		page = alloc_slab_page(flags, node, oo);
+		if (!page)
+			return NULL;
+		stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK);
+	}
+	page->objects = oo_objects(oo);
 	mod_zone_page_state(page_zone(page),
 		(s->flags & SLAB_RECLAIM_ACCOUNT) ?
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
-		pages);
+		1 << oo_order(oo));

 	return page;
 }
@@ -1066,10 +1162,10 @@
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
-	struct kmem_cache_node *n;
 	void *start;
 	void *last;
 	void *p;
+	struct kmem_cache_order_objects oo = s->oo;

 	BUG_ON(flags & GFP_SLAB_BUG_MASK);
@@ -1078,22 +1174,23 @@
 	if (!page)
 		goto out;

-	n = get_node(s, page_to_nid(page));
-	if (n)
-		atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(s, node, page->objects);
 	page->slab = s;
 	page->flags |= 1 << PG_slab;
 	if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON |
 			SLAB_STORE_USER | SLAB_TRACE))
 		SetSlabDebug(page);

+	if (s->kick)
+		SetSlabKickable(page);
+
 	start = page_address(page);
 	if (unlikely(s->flags & SLAB_POISON))
-		memset(start, POISON_INUSE, PAGE_SIZE << s->order);
+		memset(start, POISON_INUSE, PAGE_SIZE << oo_order(oo));

 	last = start;
-	for_each_object(p, s, start) {
+	for_each_object(p, s, start, page->objects) {
 		setup_object(s, page, last);
 		set_freepointer(s, last, p);
 		last = p;
@@ -1109,13 +1206,15 @@

 static void __free_slab(struct kmem_cache *s, struct page *page)
 {
-	int pages = 1 << s->order;
+	int order = compound_order(page);
+	int pages = 1 << order;

 	if (unlikely(SlabDebug(page))) {
 		void *p;

 		slab_pad_check(s, page);
-		for_each_object(p, s, page_address(page))
+		for_each_object(p, s, page_address(page),
+						page->objects)
 			check_object(s, page, p, 0);
 		ClearSlabDebug(page);
 	}
@@ -1125,7 +1224,10 @@
 		NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
 		-pages);

-	__free_pages(page, s->order);
+	ClearSlabKickable(page);
+	__ClearPageSlab(page);
+	reset_page_mapcount(page);
+	__free_pages(page, order);
 }

 static void rcu_free_slab(struct rcu_head *h)
@@ -1151,11 +1253,7 @@

 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
-	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
-
-	atomic_long_dec(&n->nr_slabs);
-	reset_page_mapcount(page);
-	__ClearPageSlab(page);
+	dec_slabs_node(s, page_to_nid(page), page->objects);
 	free_slab(s, page);
 }
@@ -1335,6 +1433,8 @@
 			stat(c, DEACTIVATE_FULL);
 			if (SlabDebug(page) && (s->flags & SLAB_STORE_USER))
 				add_full(n, page);
+			if (s->kick)
+				SetSlabKickable(page);
 		}
 		slab_unlock(page);
 	} else {
@@ -1470,9 +1570,6 @@
 	void **object;
 	struct page *new;

-	/* We handle __GFP_ZERO in the caller */
-	gfpflags &= ~__GFP_ZERO;
-
 	if (!c->page)
 		goto new_slab;
@@ -1490,7 +1587,7 @@
 		goto debug;

 	c->freelist = object[c->offset];
-	c->page->inuse = s->objects;
+	c->page->inuse = c->page->objects;
 	c->page->freelist = NULL;
 	c->node = page_to_nid(c->page);
 unlock_out:
@@ -1527,27 +1624,6 @@
 		c->page = new;
 		goto load_freelist;
 	}
-
-	/*
-	 * No memory available.
-	 *
-	 * If the slab uses higher order allocs but the object is
-	 * smaller than a page size then we can fallback in emergencies
-	 * to the page allocator via kmalloc_large. The page allocator may
-	 * have failed to obtain a higher order page and we can try to
-	 * allocate a single page if the object fits into a single page.
-	 * That is only possible if certain conditions are met that are being
-	 * checked when a slab is created.
-	 */
-	if (!(gfpflags & __GFP_NORETRY) &&
-				(s->flags & __PAGE_ALLOC_FALLBACK)) {
-		if (gfpflags & __GFP_WAIT)
-			local_irq_enable();
-		object = kmalloc_large(s->objsize, gfpflags);
-		if (gfpflags & __GFP_WAIT)
-			local_irq_disable();
-		return object;
-	}
 	return NULL;
 debug:
 	if (!alloc_debug_processing(s, c->page, object, addr))
@@ -1748,8 +1824,8 @@
  * take the list_lock.
  */
 static int slub_min_order;
-static int slub_max_order = DEFAULT_MAX_ORDER;
-static int slub_min_objects = DEFAULT_MIN_OBJECTS;
+static int slub_max_order = PAGE_ALLOC_COSTLY_ORDER;
+static int slub_min_objects = 0;

 /*
  * Merge control. If this is set then no merging of slab caches will occur.
@@ -1764,7 +1840,7 @@
  * system components. Generally order 0 allocations should be preferred since
  * order 0 does not cause fragmentation in the page allocator. Larger objects
  * be problematic to put into order 0 slabs because there may be too much
- * unused space left. We go to a higher order if more than 1/8th of the slab
+ * unused space left. We go to a higher order if more than 1/16th of the slab
  * would be wasted.
 *
 * In order to reach satisfactory performance we must ensure that a minimum
@@ -1789,6 +1865,9 @@
 	int rem;
 	int min_order = slub_min_order;

+	if ((PAGE_SIZE << min_order) / size > 65535)
+		return get_order(size * 65535) - 1;
+
 	for (order = max(min_order,
 				fls(min_objects * size - 1) - PAGE_SHIFT);
 			order <= max_order; order++) {
@@ -1823,8 +1902,10 @@
 	 * we reduce the minimum objects required in a slab.
 	 */
 	min_objects = slub_min_objects;
+	if (!min_objects)
+		min_objects = 4 * fls(nr_cpu_ids);
 	while (min_objects > 1) {
-		fraction = 8;
+		fraction = 16;
 		while (fraction >= 4) {
 			order = slab_order(size, min_objects,
 						slub_max_order, fraction);
@@ -1886,15 +1967,18 @@
 	c->node = 0;
 	c->offset = s->offset / sizeof(void *);
 	c->objsize = s->objsize;
+#ifdef CONFIG_SLUB_STATS
+	memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned));
+#endif
 }

 static void init_kmem_cache_node(struct kmem_cache_node *n)
 {
 	n->nr_partial = 0;
-	atomic_long_set(&n->nr_slabs, 0);
 	spin_lock_init(&n->list_lock);
 	INIT_LIST_HEAD(&n->partial);
 #ifdef CONFIG_SLUB_DEBUG
+	atomic_long_set(&n->nr_slabs, 0);
 	INIT_LIST_HEAD(&n->full);
 #endif
 }
@@ -2063,7 +2147,7 @@
 	init_tracking(kmalloc_caches, n);
 #endif
 	init_kmem_cache_node(n);
-	atomic_long_inc(&n->nr_slabs);
+	inc_slabs_node(kmalloc_caches, node, page->objects);

 	/*
 	 * lockdep requires consistent irq usage for each lock
@@ -2139,11 +2223,12 @@
  * calculate_sizes() determines the order and the distribution of data within
  * a slab object.
  */
-static int calculate_sizes(struct kmem_cache *s)
+static int calculate_sizes(struct kmem_cache *s, int forced_order)
 {
 	unsigned long flags = s->flags;
 	unsigned long size = s->objsize;
 	unsigned long align = s->align;
+	int order;

 	/*
 	 * Round up object size to the next word boundary. We can only
@@ -2227,26 +2312,16 @@
 	 */
 	size = ALIGN(size, align);
 	s->size = size;
+	if (forced_order >= 0)
+		order = forced_order;
+	else
+		order = calculate_order(size);

-	if ((flags & __KMALLOC_CACHE) &&
-			PAGE_SIZE / size < slub_min_objects) {
-		/*
-		 * Kmalloc cache that would not have enough objects in
-		 * an order 0 page. Kmalloc slabs can fallback to
-		 * page allocator order 0 allocs so take a reasonably large
-		 * order that will allows us a good number of objects.
-		 */
-		s->order = max(slub_max_order, PAGE_ALLOC_COSTLY_ORDER);
-		s->flags |= __PAGE_ALLOC_FALLBACK;
-		s->allocflags |= __GFP_NOWARN;
-	} else
-		s->order = calculate_order(size);
-
-	if (s->order < 0)
+	if (order < 0)
 		return 0;

 	s->allocflags = 0;
-	if (s->order)
+	if (order)
 		s->allocflags |= __GFP_COMP;

 	if (s->flags & SLAB_CACHE_DMA)
@@ -2258,9 +2333,12 @@
 	/*
 	 * Determine the number of objects per slab
 	 */
-	s->objects = (PAGE_SIZE << s->order) / size;
+	s->oo = oo_make(order, size);
+	s->min = oo_make(get_order(size), size);
+	if (oo_objects(s->oo) > oo_objects(s->max))
+		s->max = s->oo;

-	return !!s->objects;
+	return !!oo_objects(s->oo);

 }
@@ -2276,10 +2354,11 @@
 	s->align = align;
 	s->flags = kmem_cache_flags(size, flags, name, ctor);

-	if (!calculate_sizes(s))
+	if (!calculate_sizes(s, -1))
 		goto error;

 	s->refcount = 1;
+	s->defrag_ratio = 30;
 #ifdef CONFIG_NUMA
 	s->remote_node_defrag_ratio = 100;
 #endif
@@ -2293,7 +2372,7 @@
 	if (flags & SLAB_PANIC)
 		panic("Cannot create slab %s size=%lu realsize=%u "
 			"order=%u offset=%u flags=%lx\n",
-			s->name, (unsigned long)size, s->size, s->order,
+			s->name, (unsigned long)size, s->size, oo_order(s->oo),
 			s->offset, flags);
 	return 0;
 }
@@ -2376,7 +2455,7 @@
 		struct kmem_cache_node *n = get_node(s, node);

 		n->nr_partial -= free_list(s, n, &n->partial);
-		if (atomic_long_read(&n->nr_slabs))
+		if (slabs_node(s, node))
 			return 1;
 	}
 	free_kmem_cache_nodes(s);
@@ -2409,10 +2488,6 @@
 struct kmem_cache kmalloc_caches[PAGE_SHIFT + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);

-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];
-#endif
-
 static int __init setup_slub_min_order(char *str)
 {
 	get_option(&str, &slub_min_order);
@@ -2458,10 +2533,10 @@

 	down_write(&slub_lock);
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
-			flags | __KMALLOC_CACHE, NULL))
+			flags, NULL))
 		goto panic;

-	list_add(&s->list, &slab_caches);
+	list_add_tail(&s->list, &slab_caches);
 	up_write(&slub_lock);
 	if (sysfs_slab_add(s))
 		goto panic;
@@ -2472,6 +2547,7 @@
 }

 #ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_caches_dma[PAGE_SHIFT + 1];

 static void sysfs_add_func(struct work_struct *w)
 {
@@ -2688,91 +2764,269 @@
 }
 EXPORT_SYMBOL(kfree);

-#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SLABINFO)
-static unsigned long count_partial(struct kmem_cache_node *n)
+static inline void *alloc_scratch(void)
+{
+	return kmalloc(max_defrag_slab_objects * sizeof(void *) +
+		BITS_TO_LONGS(max_defrag_slab_objects) * sizeof(unsigned long),
+		GFP_KERNEL);
+}
+
+void kmem_cache_setup_defrag(struct kmem_cache *s,
+	void *(*get)(struct kmem_cache *, int nr, void **),
+	void (*kick)(struct kmem_cache *, int nr, void **, void *private))
+{
+	int max_objects = oo_objects(s->max);
+
+	/*
+	 * Defragmentable slabs must have a ctor otherwise objects may be
+	 * in an undetermined state after they are allocated.
+	 */
+	BUG_ON(!s->ctor);
+	s->get = get;
+	s->kick = kick;
+	down_write(&slub_lock);
+	list_move(&s->list, &slab_caches);
+	if (max_objects > max_defrag_slab_objects)
+		max_defrag_slab_objects = max_objects;
+	up_write(&slub_lock);
+}
+EXPORT_SYMBOL(kmem_cache_setup_defrag);
+
+/*
+ * Vacate all objects in the given slab.
+ *
+ * The scratch area passed to the list function is sufficient to hold
+ * struct list_head times objects per slab. We use it to hold void ** times
+ * objects per slab plus a bitmap for each object.
+ */
+static int kmem_cache_vacate(struct page *page, void *scratch)
 {
+	void **vector = scratch;
+	void *p;
+	void *addr = page_address(page);
+	struct kmem_cache *s;
+	unsigned long *map;
+	int leftover;
+	int count;
+	void *private;
 	unsigned long flags;
-	unsigned long x = 0;
-	struct page *page;
+	unsigned long objects;
+	struct kmem_cache_cpu *c;

-	spin_lock_irqsave(&n->list_lock, flags);
-	list_for_each_entry(page, &n->partial, lru)
-		x += page->inuse;
-	spin_unlock_irqrestore(&n->list_lock, flags);
-	return x;
+	BUG_ON(!PageSlab(page));
+	local_irq_save(flags);
+	slab_lock(page);
+	BUG_ON(!SlabFrozen(page));
+
+	s = page->slab;
+	objects = page->objects;
+	c = get_cpu_slab(s, smp_processor_id());
+	map = scratch + max_defrag_slab_objects * sizeof(void **);
+	if (!page->inuse || !s->kick || !SlabKickable(page)) {
+		stat(c, SHRINK_SLAB_SKIPPED);
+		goto out;
+	}
+
+	/* Determine used objects */
+	bitmap_fill(map, objects);
+	for_each_free_object(p, s, page->freelist)
+		__clear_bit(slab_index(p, s, addr), map);
+
+	count = 0;
+	memset(vector, 0, objects * sizeof(void **));
+	for_each_object(p, s, addr, objects)
+		if (test_bit(slab_index(p, s, addr), map))
+			vector[count++] = p;
+
+	private = s->get(s, count, vector);
+
+	/*
+	 * Got references. Now we can drop the slab lock. The slab
+	 * is frozen so it cannot vanish from under us nor will
+	 * allocations be performed on the slab. However, unlocking the
+	 * slab will allow concurrent slab_frees to proceed.
+	 */
+	slab_unlock(page);
+	local_irq_restore(flags);
+
+	/*
+	 * Perform the KICK callbacks to remove the objects.
+	 */
+	s->kick(s, count, vector, private);
+
+	local_irq_save(flags);
+	slab_lock(page);
+out:
+	/*
+	 * Check the result and unfreeze the slab
+	 */
+	leftover = page->inuse;
+	if (leftover) {
+		stat(c, SHRINK_OBJECT_RECLAIM_FAILED);
+		ClearSlabKickable(page);
+	} else
+		stat(c, SHRINK_SLAB_RECLAIMED);
+
+	unfreeze_slab(s, page, leftover > 0);
+	local_irq_restore(flags);
+	return leftover;
 }
-#endif

 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
- *
- * The slabs with the least items are placed last. This results in them
- * being allocated from last increasing the chance that the last objects
- * are freed in them.
+ * Remove objects from a list of slab pages that have been gathered.
+ * Must be called with slabs that have been isolated before.
  */
-int kmem_cache_shrink(struct kmem_cache *s)
+int kmem_cache_reclaim(struct list_head *zaplist)
 {
-	int node;
-	int i;
-	struct kmem_cache_node *n;
+	int freed = 0;
+	void **scratch;
 	struct page *page;
-	struct page *t;
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
+	struct page *page2;
+
+	if (list_empty(zaplist))
+		return 0;
+
+	scratch = alloc_scratch();
+	if (!scratch)
+		return 0;
+
+	list_for_each_entry_safe(page, page2, zaplist, lru) {
+		list_del(&page->lru);
+		if (kmem_cache_vacate(page, scratch) == 0)
+			freed++;
+	}
+	kfree(scratch);
+	return freed;
+}
+
+/*
+ * Shrink the slab cache on a particular node of the cache
+ * by releasing slabs with zero objects and trying to reclaim
+ * slabs with less than a quarter of objects allocated.
+ */
+static unsigned long __kmem_cache_shrink(struct kmem_cache *s, int node,
+						unsigned long limit)
+{
 	unsigned long flags;
+	struct page *page, *page2;
+	LIST_HEAD(zaplist);
+	int freed = 0;
+	struct kmem_cache_node *n = get_node(s, node);
+	struct kmem_cache_cpu *c;

-	if (!slabs_by_inuse)
-		return -ENOMEM;
+	if (n->nr_partial <= limit)
+		return 0;

-	flush_all(s);
-	for_each_node_state(node, N_NORMAL_MEMORY) {
-		n = get_node(s, node);
+	spin_lock_irqsave(&n->list_lock, flags);
+	c = get_cpu_slab(s, smp_processor_id());
+	stat(c, SHRINK_CALLS);
+	list_for_each_entry_safe(page, page2, &n->partial, lru) {
+		if (page->inuse) {

-		if (!n->nr_partial)
-			continue;
+			if (!SlabKickable(page))
+				continue;

-		for (i = 0; i < s->objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
+			if (page->inuse * 100 >=
+				s->defrag_ratio * page->objects)
+				continue;

-		spin_lock_irqsave(&n->list_lock, flags);
+			if (!slab_trylock(page))
+				continue;

-		/*
-		 * Build lists indexed by the items in use in each slab.
-		 *
-		 * Note that concurrent frees may occur while we hold the
-		 * list_lock. page->inuse here is the upper limit.
-		 */
-		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			if (!page->inuse && slab_trylock(page)) {
-				/*
-				 * Must hold slab lock here because slab_free
-				 * may have freed the last object and be
-				 * waiting to release the slab.
-				 */
-				list_del(&page->lru);
+			list_move(&page->lru, &zaplist);
+			if (s->kick) {
+				stat(c, SHRINK_ATTEMPT_DEFRAG);
 				n->nr_partial--;
-				slab_unlock(page);
-				discard_slab(s, page);
-			} else {
-				list_move(&page->lru,
-				slabs_by_inuse + page->inuse);
+				SetSlabFrozen(page);
 			}
+			slab_unlock(page);
+
+		} else {
+			stat(c, SHRINK_EMPTY_SLAB);
+			list_del(&page->lru);
+			n->nr_partial--;
+			slab_unlock(page);
+			discard_slab(s, page);
+			freed++;
 		}
+	}
+
+	if (!s->kick)
+		/* Simply put the zaplist at the end */
+		list_splice(&zaplist, n->partial.prev);
+
+	spin_unlock_irqrestore(&n->list_lock, flags);
+
+	if (s->kick)
+		freed += kmem_cache_reclaim(&zaplist);
+
+	return freed;
+}
+
+/*
+ * Defrag slabs conditional on the amount of fragmentation in a page.
+ */
+int kmem_cache_defrag(int node)
+{
+	struct kmem_cache *s;
+	unsigned long slabs = 0;
+	unsigned long reclaimed;
+
+	/*
+	 * kmem_cache_defrag may be called from the reclaim path which may be
+	 * called for any page allocator alloc. So there is the danger that we
+	 * get called in a situation where slub already acquired the slub_lock
+	 * for other purposes.
+	 */
+	if (!down_read_trylock(&slub_lock))
+		return 0;
+
+	list_for_each_entry(s, &slab_caches, list) {
+
+		if (time_before(jiffies, s->next_defrag))
+			continue;

 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * Defragmentable caches come first. If the slab cache is not
+		 * defragmentable then we can stop traversing the list.
 		 */
-		for (i = s->objects - 1; i >= 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		if (!s->kick)
+			break;

-		spin_unlock_irqrestore(&n->list_lock, flags);
+		if (node == -1) {
+			int nid;
+
+			for_each_node_state(nid, N_NORMAL_MEMORY)
+				reclaimed = __kmem_cache_shrink(s, nid,
+							MAX_PARTIAL);
+		} else
+			reclaimed = __kmem_cache_shrink(s, node, MAX_PARTIAL);
+
+		if (reclaimed)
+			s->next_defrag = jiffies + HZ / 10;
+		else
+			s->next_defrag = jiffies + HZ;
+
+		slabs += reclaimed;
 	}
+	up_read(&slub_lock);
+	return slabs;
+}
+EXPORT_SYMBOL(kmem_cache_defrag);
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists.
+ * If the slab cache supports defragmentation then objects are
+ * reclaimed.
+ */
+int kmem_cache_shrink(struct kmem_cache *s)
+{
+	int node;
+
+	flush_all(s);
+	for_each_node_state(node, N_NORMAL_MEMORY)
+		__kmem_cache_shrink(s, node, 0);

-	kfree(slabs_by_inuse);
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2816,7 +3070,7 @@
 	 * and offline_pages() function shoudn't call this
 	 * callback. So, we must fail.
 	 */
-	BUG_ON(atomic_long_read(&n->nr_slabs));
+	BUG_ON(slabs_node(s, offline_node));

 	s->node[offline_node] = NULL;
 	kmem_cache_free(kmalloc_caches, n);
@@ -2841,7 +3095,7 @@
 			return 0;

 		/*
-		 * We are bringing a node online. No memory is availabe yet. We must
+		 * We are bringing a node online. No memory is available yet. We must
 		 * allocate a kmem_cache_node structure in order to bring the node
 		 * online.
 		 */
@@ -2987,10 +3241,7 @@
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;

-	if ((s->flags & __PAGE_ALLOC_FALLBACK))
-		return 1;
-
-	if (s->ctor)
+	if (s->ctor || s->kick || s->get)
 		return 1;

 	/*
@@ -3080,7 +3331,7 @@
 	if (s) {
 		if (kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor)) {
-			list_add(&s->list, &slab_caches);
+			list_add_tail(&s->list, &slab_caches);
 			up_write(&slub_lock);
 			if (sysfs_slab_add(s))
 				goto err;
@@ -3181,6 +3432,37 @@
 	return slab_alloc(s, gfpflags, node, caller);
 }

+#if (defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)) || defined(CONFIG_SLABINFO)
+static unsigned long count_partial(struct kmem_cache_node *n,
+					int (*get_count)(struct page *))
+{
+	unsigned long flags;
+	unsigned long x = 0;
+	struct page *page;
+
+	spin_lock_irqsave(&n->list_lock, flags);
+	list_for_each_entry(page, &n->partial, lru)
+		x += get_count(page);
+	spin_unlock_irqrestore(&n->list_lock, flags);
+	return x;
+}
+
+static int count_inuse(struct page *page)
+{
+	return page->inuse;
+}
+
+static int count_total(struct page *page)
+{
+	return page->objects;
+}
+
+static int count_free(struct page *page)
+{
+	return page->objects - page->inuse;
+}
+#endif
+
 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
 static int validate_slab(struct kmem_cache *s, struct page *page,
 						unsigned long *map)
@@ -3193,7 +3475,7 @@
 		return 0;

 	/* Now we know that a valid freelist exists */
-	bitmap_zero(map, s->objects);
+	bitmap_zero(map, page->objects);

 	for_each_free_object(p, s, page->freelist) {
 		set_bit(slab_index(p, s, addr), map);
@@ -3201,7 +3483,7 @@
 			return 0;
 	}

-	for_each_object(p, s, addr)
+	for_each_object(p, s, addr, page->objects)
 		if (!test_bit(slab_index(p, s, addr), map))
 			if (!check_object(s, page, p, 1))
 				return 0;
@@ -3267,7 +3549,7 @@
 {
 	int node;
 	unsigned long count = 0;
-	unsigned long *map = kmalloc(BITS_TO_LONGS(s->objects) *
+	unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) *
 				sizeof(unsigned long), GFP_KERNEL);

 	if (!map)
@@ -3470,14 +3752,14 @@
 		struct page *page, enum track_item alloc)
 {
 	void *addr = page_address(page);
-	DECLARE_BITMAP(map, s->objects);
+	DECLARE_BITMAP(map, page->objects);
 	void *p;

-	bitmap_zero(map, s->objects);
+	bitmap_zero(map, page->objects);
 	for_each_free_object(p, s, page->freelist)
 		set_bit(slab_index(p, s, addr), map);

-	for_each_object(p, s, addr)
+	for_each_object(p, s, addr, page->objects)
 		if (!test_bit(slab_index(p, s, addr), map))
 			add_location(t, s, get_track(s, p, alloc));
 }
@@ -3567,22 +3849,23 @@
 }

 enum slab_stat_type {
-	SL_FULL,
-	SL_PARTIAL,
-	SL_CPU,
-	SL_OBJECTS
+	SL_ALL, /* All slabs */
+	SL_PARTIAL, /* Only partially allocated slabs */
+	SL_CPU, /* Only slabs used for cpu caches */
+	SL_OBJECTS, /* Determine allocated objects not slabs */
+	SL_TOTAL /* Determine object capacity not slabs */
 };

-#define SO_FULL (1 << SL_FULL)
+#define SO_ALL (1 << SL_ALL)
 #define SO_PARTIAL (1 << SL_PARTIAL)
 #define SO_CPU (1 << SL_CPU)
 #define SO_OBJECTS (1 << SL_OBJECTS)
+#define SO_TOTAL (1 << SL_TOTAL)

 static ssize_t show_slab_objects(struct kmem_cache *s,
 			char *buf, unsigned long flags)
 {
 	unsigned long total = 0;
-	int cpu;
 	int node;
 	int x;
 	unsigned long *nodes;
@@ -3593,56 +3876,60 @@
 		return -ENOMEM;
 	per_cpu = nodes + nr_node_ids;

-	for_each_possible_cpu(cpu) {
-		struct page *page;
-		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+	if (flags & SO_CPU) {
+		int cpu;

-		if (!c)
-			continue;
+		for_each_possible_cpu(cpu) {
+			struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);

-		page = c->page;
-		node = c->node;
-		if (node < 0)
-			continue;
-		if (page) {
-			if (flags & SO_CPU) {
-				if (flags & SO_OBJECTS)
-					x = page->inuse;
+			if (!c || c->node < 0)
+				continue;
+
+			if (c->page) {
+				if (flags & SO_TOTAL)
+					x = c->page->objects;
+				else if (flags & SO_OBJECTS)
+					x = c->page->inuse;
 				else
 					x = 1;
+
 				total += x;
-				nodes[node] += x;
+				nodes[c->node] += x;
 			}
-			per_cpu[node]++;
+			per_cpu[c->node]++;
 		}
 	}

-	for_each_node_state(node, N_NORMAL_MEMORY) {
-		struct kmem_cache_node *n = get_node(s, node);
+	if (flags & SO_ALL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = atomic_long_read(&n->total_objects);
+			else if (flags & SO_OBJECTS)
+				x = atomic_long_read(&n->total_objects) -
+					count_partial(n, count_free);

-		if (flags & SO_PARTIAL) {
-			if (flags & SO_OBJECTS)
-				x = count_partial(n);
 			else
-				x = n->nr_partial;
+				x = atomic_long_read(&n->nr_slabs);
 			total += x;
 			nodes[node] += x;
 		}

-		if (flags & SO_FULL) {
-			int full_slabs = atomic_long_read(&n->nr_slabs)
-					- per_cpu[node]
-					- n->nr_partial;
-
-			if (flags & SO_OBJECTS)
-				x = full_slabs * s->objects;
+	} else if (flags & SO_PARTIAL) {
+		for_each_node_state(node, N_NORMAL_MEMORY) {
+			struct kmem_cache_node *n = get_node(s, node);
+
+			if (flags & SO_TOTAL)
+				x = count_partial(n, count_total);
+			else if (flags & SO_OBJECTS)
+				x = count_partial(n, count_inuse);
 			else
-				x = full_slabs;
+				x = n->nr_partial;
 			total += x;
 			nodes[node] += x;
 		}
 	}
-
 	x = sprintf(buf, "%lu", total);
 #ifdef CONFIG_NUMA
 	for_each_node_state(node, N_NORMAL_MEMORY)
@@ -3657,14 +3944,6 @@
 static int any_slab_objects(struct kmem_cache *s)
 {
 	int node;
-	int cpu;
-
-	for_each_possible_cpu(cpu) {
-		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
-
-		if (c && c->page)
-			return 1;
-	}

 	for_each_online_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
@@ -3672,7 +3951,7 @@
 		if (!n)
 			continue;

-		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
+		if (atomic_long_read(&n->total_objects))
 			return 1;
 	}
 	return 0;
@@ -3714,26 +3993,54 @@
 static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
 {
-	return sprintf(buf, "%d\n", s->objects);
+	return sprintf(buf, "%d\n", oo_objects(s->oo));
 }
 SLAB_ATTR_RO(objs_per_slab);

+static ssize_t order_store(struct kmem_cache *s,
+				const char *buf, size_t length)
+{
+	int order = simple_strtoul(buf, NULL, 10);
+
+	if (order > slub_max_order || order < slub_min_order)
+		return -EINVAL;
+
+	calculate_sizes(s, order);
+	return length;
+}
+
 static ssize_t order_show(struct kmem_cache *s, char *buf)
 {
-	return sprintf(buf, "%d\n", s->order);
+	return sprintf(buf, "%d\n", oo_order(s->oo));
 }
-SLAB_ATTR_RO(order);
+SLAB_ATTR(order);

-static ssize_t ctor_show(struct kmem_cache *s, char *buf)
+static ssize_t ops_show(struct kmem_cache *s, char *buf)
 {
-	if (s->ctor) {
-		int n = sprint_symbol(buf, (unsigned long)s->ctor);
+	int x = 0;

-		return n + sprintf(buf + n, "\n");
+	if (s->ctor) {
+		x += sprintf(buf + x, "ctor : ");
+		x += sprint_symbol(buf + x, (unsigned long)s->ctor);
+		x += sprintf(buf + x, "\n");
+	}
+
+	if (s->get) {
+		x += sprintf(buf + x, "get : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->get);
+		x += sprintf(buf + x, "\n");
+	}
+
+	if (s->kick) {
+		x += sprintf(buf + x, "kick : ");
+		x += sprint_symbol(buf + x,
+				(unsigned long)s->kick);
+		x += sprintf(buf + x, "\n");
 	}
-	return 0;
+	return x;
 }
-SLAB_ATTR_RO(ctor);
+SLAB_ATTR_RO(ops);

 static ssize_t aliases_show(struct kmem_cache *s, char *buf)
 {
@@ -3743,7 +4050,7 @@

 static ssize_t slabs_show(struct kmem_cache *s, char *buf)
 {
-	return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU);
+	return show_slab_objects(s, buf, SO_ALL);
 }
 SLAB_ATTR_RO(slabs);
@@ -3761,10 +4068,22 @@

 static ssize_t objects_show(struct kmem_cache *s, char *buf)
 {
-	return show_slab_objects(s, buf, SO_FULL|SO_PARTIAL|SO_CPU|SO_OBJECTS);
+	return show_slab_objects(s, buf, SO_ALL|SO_OBJECTS);
 }
 SLAB_ATTR_RO(objects);

+static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
+{
+	return show_slab_objects(s, buf, SO_PARTIAL|SO_OBJECTS);
+}
+SLAB_ATTR_RO(objects_partial);
+
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
+{
+	return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
+}
+SLAB_ATTR_RO(total_objects);
+
 static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
 {
 	return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
@@ -3844,7 +4163,7 @@
 	s->flags &= ~SLAB_RED_ZONE;
 	if (buf[0] == '1')
 		s->flags |= SLAB_RED_ZONE;
-	calculate_sizes(s);
+	calculate_sizes(s, -1);
 	return length;
 }
 SLAB_ATTR(red_zone);
@@ -3863,7 +4182,7 @@
 	s->flags &= ~SLAB_POISON;
 	if (buf[0] == '1')
 		s->flags |= SLAB_POISON;
-	calculate_sizes(s);
+	calculate_sizes(s, -1);
 	return length;
 }
 SLAB_ATTR(poison);
@@ -3882,7 +4201,7 @@
 	s->flags &= ~SLAB_STORE_USER;
 	if (buf[0] == '1')
 		s->flags |= SLAB_STORE_USER;
-	calculate_sizes(s);
+	calculate_sizes(s, -1);
 	return length;
 }
 SLAB_ATTR(store_user);
@@ -3941,6 +4260,22 @@
 }
 SLAB_ATTR_RO(free_calls);

+static ssize_t defrag_ratio_show(struct kmem_cache *s, char *buf)
+{
+	return sprintf(buf, "%d\n", s->defrag_ratio);
+}
+
+static ssize_t defrag_ratio_store(struct kmem_cache *s,
+				const char *buf, size_t length)
+{
+	int n = simple_strtoul(buf, NULL, 10);
+
+	if (n < 100)
+		s->defrag_ratio = n;
+	return length;
+}
+SLAB_ATTR(defrag_ratio);
+
 #ifdef CONFIG_NUMA
 static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
 {
@@ -3979,10 +4314,12 @@

 	len = sprintf(buf, "%lu", sum);

+#ifdef CONFIG_SMP
 	for_each_online_cpu(cpu) {
 		if (data[cpu] && len < PAGE_SIZE - 20)
-			len += sprintf(buf + len, " c%d=%u", cpu, data[cpu]);
+			len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
 	}
+#endif
 	kfree(data);
 	return len + sprintf(buf + len, "\n");
 }
@@ -4011,7 +4348,13 @@
 STAT_ATTR(DEACTIVATE_TO_HEAD, deactivate_to_head);
 STAT_ATTR(DEACTIVATE_TO_TAIL, deactivate_to_tail);
 STAT_ATTR(DEACTIVATE_REMOTE_FREES, deactivate_remote_frees);
-
+STAT_ATTR(ORDER_FALLBACK, order_fallback);
+STAT_ATTR(SHRINK_CALLS, shrink_calls);
+STAT_ATTR(SHRINK_ATTEMPT_DEFRAG, shrink_attempt_defrag);
+STAT_ATTR(SHRINK_EMPTY_SLAB, shrink_empty_slab);
+STAT_ATTR(SHRINK_SLAB_SKIPPED, shrink_slab_skipped);
+STAT_ATTR(SHRINK_SLAB_RECLAIMED, shrink_slab_reclaimed);
+STAT_ATTR(SHRINK_OBJECT_RECLAIM_FAILED, shrink_object_reclaim_failed);
 #endif

 static struct attribute *slab_attrs[] = {
@@ -4020,10 +4363,12 @@
 	&objs_per_slab_attr.attr,
 	&order_attr.attr,
 	&objects_attr.attr,
+	&objects_partial_attr.attr,
+	&total_objects_attr.attr,
 	&slabs_attr.attr,
 	&partial_attr.attr,
 	&cpu_slabs_attr.attr,
-	&ctor_attr.attr,
+	&ops_attr.attr,
 	&aliases_attr.attr,
 	&align_attr.attr,
 	&sanity_checks_attr.attr,
@@ -4038,6 +4383,7 @@
 	&shrink_attr.attr,
 	&alloc_calls_attr.attr,
 	&free_calls_attr.attr,
+	&defrag_ratio_attr.attr,
 #ifdef CONFIG_ZONE_DMA
 	&cache_dma_attr.attr,
 #endif
@@ -4062,6 +4408,13 @@
 	&deactivate_to_head_attr.attr,
 	&deactivate_to_tail_attr.attr,
 	&deactivate_remote_frees_attr.attr,
+	&order_fallback_attr.attr,
+	&shrink_calls_attr.attr,
+	&shrink_attempt_defrag_attr.attr,
+	&shrink_empty_slab_attr.attr,
+	&shrink_slab_skipped_attr.attr,
+	&shrink_slab_reclaimed_attr.attr,
+	&shrink_object_reclaim_failed_attr.attr,
 #endif
 	NULL
 };
@@ -4348,7 +4701,8 @@
 	unsigned long nr_partials = 0;
 	unsigned long nr_slabs = 0;
 	unsigned long nr_inuse = 0;
-	unsigned long nr_objs;
+	unsigned long nr_objs = 0;
+	unsigned long nr_free = 0;
 	struct kmem_cache *s;
 	int node;
@@ -4362,14 +4716,15 @@
 		nr_partials += n->nr_partial;
 		nr_slabs += atomic_long_read(&n->nr_slabs);
-		nr_inuse += count_partial(n);
+		nr_objs += atomic_long_read(&n->total_objects);
+		nr_free += count_partial(n, count_free);
 	}

-	nr_objs = nr_slabs * s->objects;
-	nr_inuse += (nr_slabs - nr_partials) * s->objects;
+	nr_inuse = nr_objs - nr_free;

 	seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", s->name, nr_inuse,
-		   nr_objs, s->size, s->objects, (1 << s->order));
+		   nr_objs, s->size, oo_objects(s->oo),
+		   (1 << oo_order(s->oo)));
 	seq_printf(m, " : tunables %4u %4u %4u", 0, 0, 0);
 	seq_printf(m, " : slabdata %6lu %6lu %6lu", nr_slabs, nr_slabs, 0UL);
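
The usage sketch below is not part of the patch. It shows how a slab cache would opt in to the defragmentation API introduced above: kmem_cache_setup_defrag(), the get()/kick() callback signatures and the mandatory ctor come from the patch itself, while struct foo, foo_cachep and the simple reference counting are hypothetical, and the ctor signature follows the 2.6.25-era kmem_cache_create(). A real user would evict each object from its own data structures in kick(); objects that cannot be freed simply remain and the slab stays allocated.

/*
 * Illustrative only.  Assumes the kmem_cache_setup_defrag() prototype is
 * made visible through <linux/slab.h> elsewhere in the series.
 */
#include <linux/module.h>
#include <linux/slab.h>

struct foo {
	atomic_t users;		/* references held on the object */
	/* ... payload ... */
};

static struct kmem_cache *foo_cachep;

/*
 * A ctor is required for defragmentable caches (the patch BUG_ON()s
 * without one) so that every object is in a determined state.
 */
static void foo_ctor(struct kmem_cache *s, void *obj)
{
	struct foo *f = obj;

	atomic_set(&f->users, 0);
}

/*
 * get() is called with the slab locked and interrupts off: pin each
 * object so it cannot disappear once the slab lock is dropped.  The
 * return value is passed through to kick() as "private".
 */
static void *foo_get(struct kmem_cache *s, int nr, void **v)
{
	int i;

	for (i = 0; i < nr; i++)
		atomic_inc(&((struct foo *)v[i])->users);

	return NULL;		/* no private state needed here */
}

/*
 * kick() is called without the slab lock: try to make the objects
 * freeable.  This toy version only drops the reference taken in
 * foo_get() and frees objects that are otherwise unused.
 */
static void foo_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
	int i;

	for (i = 0; i < nr; i++) {
		struct foo *f = v[i];

		if (atomic_dec_and_test(&f->users))
			kmem_cache_free(s, f);
	}
}

static int __init foo_init(void)
{
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo), 0,
					SLAB_RECLAIM_ACCOUNT, foo_ctor);
	if (!foo_cachep)
		return -ENOMEM;

	kmem_cache_setup_defrag(foo_cachep, foo_get, foo_kick);
	return 0;
}

Once registered this way, kmem_cache_defrag() will consider the cache during reclaim: partial slabs whose usage falls below defrag_ratio are frozen and moved to a zaplist, get() pins their remaining objects, and kick() is asked to dispose of them so the slab page can be returned to the page allocator.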