Index: linux-2.6.21-rc7-mm1/mm/slub.c
===================================================================
--- linux-2.6.21-rc7-mm1.orig/mm/slub.c	2007-04-25 09:23:16.000000000 -0700
+++ linux-2.6.21-rc7-mm1/mm/slub.c	2007-04-25 09:27:23.000000000 -0700
@@ -93,6 +93,13 @@
  * slab handling out of the fast path.
  */
 
+/* Minimum number of partial slabs */
+#define MIN_PARTIAL 2
+
+/* Maximum number of partial slabs */
+#define MAX_PARTIAL 10
+
 /*
  * Issues still to be resolved:
  *
@@ -627,7 +634,7 @@ static int on_freelist(struct kmem_cache
 /*
  * Tracking of fully allocated slabs for debugging
  */
-static void add_full(struct kmem_cache *s, struct page *page)
+static void add_full(struct kmem_cache_node *n, struct page *page)
 {
-	struct kmem_cache_node *n;
@@ -638,7 +645,6 @@ static void add_full(struct kmem_cache *
-	if (!(s->flags & SLAB_STORE_USER))
-		return;
-
-	n = get_node(s, page_to_nid(page));
 	spin_lock(&n->list_lock);
 	list_add(&page->lru, &n->full);
 	spin_unlock(&n->list_lock);
@@ -925,10 +931,16 @@ static __always_inline int slab_trylock(
 /*
  * Management of partially allocated slabs
  */
-static void add_partial(struct kmem_cache *s, struct page *page)
+static void add_partial_tail(struct kmem_cache_node *n, struct page *page)
 {
-	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+	spin_lock(&n->list_lock);
+	n->nr_partial++;
+	list_add_tail(&page->lru, &n->partial);
+	spin_unlock(&n->list_lock);
+}
 
+static void add_partial(struct kmem_cache_node *n, struct page *page)
+{
 	spin_lock(&n->list_lock);
 	n->nr_partial++;
 	list_add(&page->lru, &n->partial);
@@ -1028,7 +1040,7 @@ static struct page *get_any_partial(stru
 		n = get_node(s, zone_to_nid(*z));
 
 		if (n && cpuset_zone_allowed_hardwall(*z, flags) &&
-				n->nr_partial > 2) {
+				n->nr_partial > MIN_PARTIAL) {
 			page = get_partial_node(n);
 			if (page)
 				return page;
@@ -1062,15 +1074,30 @@ static struct page *get_partial(struct k
  */
 static void putback_slab(struct kmem_cache *s, struct page *page)
 {
+	struct kmem_cache_node *n = get_node(s, page_to_nid(page));
+
 	if (page->inuse) {
 		if (page->freelist)
-			add_partial(s, page);
+			add_partial(n, page);
-		else if (PageError(page))
-			add_full(s, page);
+		else if (PageError(page) && (s->flags & SLAB_STORE_USER))
+			add_full(n, page);
 		slab_unlock(page);
 	} else {
-		slab_unlock(page);
-		discard_slab(s, page);
+		if (n->nr_partial < MIN_PARTIAL) {
+			/*
+			 * Add the empty page to the tail of the partial list
+			 * in order to avoid page allocator overhead. It must
+			 * come after all the slabs that are not fully empty
+			 * so that we get maximum defragmentation. We could
+			 * free this slab if we wanted to.
+			 */
+			add_partial_tail(n, page);
+			slab_unlock(page);
+		} else {
+			slab_unlock(page);
+			discard_slab(s, page);
+		}
 	}
 }
@@ -1327,7 +1354,7 @@ checks_ok:
 	 * then add it.
 	 */
 	if (unlikely(!prior))
-		add_partial(s, page);
+		add_partial(get_node(s, page_to_nid(page)), page);
 
 out_unlock:
 	slab_unlock(page);
@@ -1543,7 +1570,7 @@ static struct kmem_cache_node * __init e
 	kmalloc_caches->node[node] = n;
 	init_kmem_cache_node(n);
 	atomic_long_inc(&n->nr_slabs);
-	add_partial(kmalloc_caches, page);
+	add_partial(n, page);
 	return n;
 }
@@ -2393,13 +2420,154 @@ static struct notifier_block __cpuinitda
 #endif
 
-/***************************************************************
- * Compatiblility definitions
- **************************************************************/
+/*
+ * Attempt to empty a slab by using the free callback.
+ * Must hold the list_lock on entry.
+ * May temporarily drop the list_lock.
+ */
+static int empty_slab(struct kmem_cache *s,
+			struct kmem_cache_node *n,
+			struct page *page)
+{
+	void *p;
+	void *addr = page_address(page);
+	int empty;
+	unsigned long map[BITS_TO_LONGS(s->objects)];
+
+	if (!(s->flags & SLAB_FREE_CALLBACK))
+		return 0;
+
+	if (!slab_trylock(page))
+		/* Slab is busy */
+		return 0;
+
+	/*
+	 * Take the slab off the partial list and disable list management
+	 * for it. Then drop the list_lock (held by the caller) so that
+	 * other slab operations can continue while we run the callbacks.
+	 * Interrupts stay disabled.
+	 */
+	SetPageActive(page);
+	list_del(&page->lru);
+	n->nr_partial--;
+	spin_unlock(&n->list_lock);
+
+	/*
+	 * Now we have the slab locked so that no operations
+	 * can occur on it while the list_lock is not held.
+	 */
+	bitmap_zero(map, s->objects);
+	for (p = page->freelist; p; p = get_freepointer(s, p))
+		set_bit((p - addr) / s->size, map);
+
+	/* Use the map to scan through the objects still in use */
+	for (p = addr; p < addr + s->objects * s->size; p += s->size)
+		if (!test_bit((p - addr) / s->size, map))
+			/*
+			 * The callback needs to return 1 if the object
+			 * is to be freed. The callback must not free
+			 * objects of this cache itself. That may deadlock
+			 * because we hold the lock on this slab!
+			 *
+			 * The callback may allocate new objects. This
+			 * allows the callback to simply reallocate
+			 * the object elsewhere.
+			 */
+			if (s->callback(s, p, SLAB_CTOR_FREE)) {
+				set_freepointer(s, p, page->freelist);
+				page->freelist = p;
+				page->inuse--;
+			}
+
+	ClearPageActive(page);
+	ClearPageReferenced(page);
+	empty = page->inuse == 0;
+
+	/*
+	 * putback_slab() takes the list_lock itself, drops the slab lock
+	 * and may hand the page back to the page allocator.
+	 */
+	putback_slab(s, page);
+
+	/* Retake the list_lock for the caller */
+	spin_lock(&n->list_lock);
+	return empty;
+}
+
+/*
+ * kmem_cache_shrink removes empty slabs from the partial lists
+ * and then sorts the partially allocated slabs by the number
+ * of items in use. The slabs with the most items in use
+ * come first. New allocations will remove these from the
+ * partial list because they are full. The slabs with the
+ * least items are placed last. If it happens that the remaining
+ * objects are freed then the page can be returned to the page
+ * allocator.
+ *
+ * If the slab provides a callback for freeing objects then
+ * kmem_cache_shrink will perform callbacks for objects in slabs
+ * with few elements in order to free them up.
+ */
 int kmem_cache_shrink(struct kmem_cache *s)
 {
+	int node;
+	int i;
+	struct page *page;
+	struct page *t;
+	unsigned long flags;
+	struct list_head *slabs_by_inuse =
+		kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL);
+
+	if (!slabs_by_inuse)
+		return -ENOMEM;
+
 	flush_all(s);
+	for_each_node(node) {
+		struct kmem_cache_node *n = get_node(s, node);
+
+		/*
+		 * If there is just a minimum number of partial slabs
+		 * then do not bother.
+		 */
+		if (n->nr_partial <= MIN_PARTIAL)
+			continue;
+
+		for (i = 0; i < s->objects; i++)
+			INIT_LIST_HEAD(slabs_by_inuse + i);
+
+		spin_lock_irqsave(&n->list_lock, flags);
+
+		/*
+		 * Build lists indexed by the items in use in
+		 * each slab and free the slabs that are empty.
+		 *
+		 * Note that concurrent frees may occur while
+		 * we hold the list_lock. page->inuse here is
+		 * only an upper limit.
+		 */
+		list_for_each_entry_safe(page, t, &n->partial, lru) {
+			if (!page->inuse) {
+				list_del(&page->lru);
+				n->nr_partial--;
+				discard_slab(s, page);
+			} else
+				list_move(&page->lru,
+					slabs_by_inuse + page->inuse);
+		}
+
+		/*
+		 * Rebuild the partial list with the slabs filled up
+		 * most first and the least used slabs at the end.
+		 */
+		for (i = s->objects - 1; i > 0; i--)
+			list_splice(slabs_by_inuse + i, n->partial.prev);
+
+		/*
+		 * Walk back through the partial list attempting to free
+		 * objects until we encounter a page that has more than
+		 * 1/3rd of its objects in use.
+		 */
+		page = list_entry(n->partial.prev, struct page, lru);
+		while (n->nr_partial > MAX_PARTIAL &&
+				(s->flags & SLAB_FREE_CALLBACK) &&
+				&page->lru != &n->partial &&
+				page->inuse < s->objects / 3) {
+			/*
+			 * Remember the neighbour towards the head of the
+			 * list. empty_slab() takes the page off the partial
+			 * list and may even free it.
+			 */
+			struct page *prev =
+				list_entry(page->lru.prev, struct page, lru);
+
+			empty_slab(s, n, page);
+
+			page = prev;
+		}
+
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	kfree(slabs_by_inuse);
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
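
For reference, here is a rough sketch of the callback side of this interface. SLAB_FREE_CALLBACK, SLAB_CTOR_FREE and the way s->callback gets registered are assumed to come from an earlier patch in the series and are not part of the diff above; the structure, field and function names below are made up purely for illustration. The contract the shrink path expects is: return 1 if the object was torn down (or relocated) so that SLUB may put its slot back on the page freelist, return 0 to leave the object alone, and never kmem_cache_free() objects of the same cache from inside the callback since the slab is locked while the callback runs.

/* Hypothetical object with a pin count that prevents eviction */
struct my_object {
	atomic_t pincount;
	/* ... payload ... */
};

/*
 * Called by kmem_cache_shrink() for every object still allocated in a
 * sparsely used slab. Tear the object down (or move its contents to a
 * freshly allocated object) and report whether the slot may be reused.
 */
static int my_object_free(struct kmem_cache *s, void *object,
				unsigned long flags)
{
	struct my_object *obj = object;

	if (!(flags & SLAB_CTOR_FREE))
		return 0;

	if (atomic_read(&obj->pincount))
		return 0;	/* still in use, keep the object */

	/* ... drop the references that obj holds ... */

	return 1;	/* SLUB may reclaim this slot */
}

With such a callback registered and SLAB_FREE_CALLBACK set on the cache, kmem_cache_shrink() would walk the tail of each node's partial list and try to empty every slab that is less than a third full once more than MAX_PARTIAL partial slabs have accumulated.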