SLUB: Implement targeted reclaim and partial list defragmentation Targeted reclaim allows to target a single slab for reclaim. This is done by calling kmem_cache_vacate(page); It will return 1 on success, 0 if the operation failed. The vacate functionality is also used for slab shrinking. During the shrink operation SLUB will generate a list sorted by the number of objects in use. We extract pages off that list that are only filled less than a quarter. These objects are then processed using kmem_cache_vacate. In order for a slabcache to support this functionality two functions must be defined via slab_operations. get_reference(struct kmem_cache *s, void *) Must obtain a reference to the object if it has not been freed yet. It is up to the user to resolve the race. SLUB guarantees that the objects is still allocated. However, another thread may be blocked in slab_free attempting to free the same object. It may succeed as soon as get_reference() returns to the slab allocator. get_reference() processing must recognize this situation (i.e. check refcount for zero) and fail in such a sitation (no problem since the object will be freed as soon we drop the slab lock before doing kick calls). No slab operations may be performed in get_reference(). Interrupts are disabled. What can be done is very limited. The slab lock for the page with the object is taken. Any attempt to perform a slab operation may lead to a deadlock. 2. kick_object(struct kmem_cache *, void *) After SLUB has established references to the remaining objects in a slab it will drop all locks and then use kick_object on each of the objects. The existence of the objects is guaranteed by virtue of the earlier obtained reference. The callback may perform any slab operation since no locks are held at the time of call. The callback should remove the object from the slab in some way. This may be accomplished by reclaiming the object and then running kmem_cache_free() or reallocating it and then running kmem_cache_free(). Reallocation is advantageous because the partial slabs were just sorted to have the partial slabs with the most objects first. Allocation is likely to result in filling up a slab so that it can be removed from the partial list. NOTE: This patch is for conceptual review. I'd appreciate any feedback especially on the locking approach taken here. It will be critical to insure that locking works reliably in order for this approach to become feasable. Signed-off-by: Christoph Lameter --- include/linux/slab.h | 15 +++ mm/slub.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 223 insertions(+), 11 deletions(-) Index: slub/include/linux/slab.h =================================================================== --- slub.orig/include/linux/slab.h 2007-05-09 16:25:00.000000000 -0700 +++ slub/include/linux/slab.h 2007-05-09 16:25:36.000000000 -0700 @@ -42,6 +42,20 @@ struct kmem_cache_ops { /* FIXME: Remove flags parameter */ void (*ctor)(void *, struct kmem_cache *, unsigned long); void (*dtor)(void *, struct kmem_cache *, unsigned long); + /* + * Called with slab lock held and interrupts disabled. + * No slab operations may be performed in get_reference + * + * Must return 1 if a reference was obtained. + * 0 if we failed to obtain the reference (f.e. + * the object is concurrently freed). + */ + int (*get_reference)(struct kmem_cache *, void *); + /* + * Called with no locks held and interrupts enabled. + * Any operation may be performed in kick_object. + */ + void (*kick_object)(struct kmem_cache *, void *); }; struct kmem_cache *__kmem_cache_create(const char *, size_t, size_t, @@ -54,6 +68,7 @@ void kmem_cache_free(struct kmem_cache * unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); +int kmem_cache_vacate(struct page *); /* * Please use this macro to create slab caches. Simply specify the Index: slub/mm/slub.c =================================================================== --- slub.orig/mm/slub.c 2007-05-09 16:25:00.000000000 -0700 +++ slub/mm/slub.c 2007-05-09 16:45:24.000000000 -0700 @@ -293,6 +293,8 @@ static inline int check_valid_pointer(st struct kmem_cache_ops slub_default_ops = { NULL, + NULL, + NULL, NULL }; @@ -997,7 +999,7 @@ static struct page *allocate_slab(struct static void setup_object(struct kmem_cache *s, struct page *page, void *object) { - if (SlabDebug(page)) { + if (s->flags & DEBUG_DEFAULT_FLAGS) { init_object(s, object, 0); init_tracking(s, object); } @@ -1027,12 +1029,11 @@ static struct page *new_slab(struct kmem n = get_node(s, page_to_nid(page)); if (n) atomic_long_inc(&n->nr_slabs); + + page->inuse = 0; + page->lockless_freelist = NULL; page->offset = s->offset / sizeof(void *); page->slab = s; - page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); start = page_address(page); end = start + s->objects * s->size; @@ -1050,11 +1051,20 @@ static struct page *new_slab(struct kmem set_freepointer(s, last, NULL); page->freelist = start; - page->lockless_freelist = NULL; - page->inuse = 0; -out: - if (flags & __GFP_WAIT) - local_irq_disable(); + + /* + * page->inuse must be visible when PageSlab(page) becomes + * true for targeted reclaim + */ + smp_wmb(); + __SetPageSlab(page); + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + SetSlabDebug(page); + + out: + if (flags & __GFP_WAIT) + local_irq_disable(); return page; } @@ -2323,6 +2333,143 @@ void kfree(const void *x) EXPORT_SYMBOL(kfree); /* + * Vacate all objects in the given slab. Slab must be locked. + * + * Will drop and regain and drop the slab lock. + * Slab must be frozen to avoid concurrent slab_free from + * removing the slab from the lists. At the end the slab will either + * be freed or have been returned to the partial lists. + * + * Return number of remaining objects + */ +static int __kmem_cache_vacate(struct kmem_cache *s, + struct page *page, unsigned long flags) +{ + void *p; + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); + int leftover; + + if (!page->inuse) + return 0; + + /* Determine free objects */ + bitmap_fill(map, s->objects); + for_each_free_object(p, s, page->freelist) + __clear_bit(slab_index(p, s, addr), map); + + /* + * Get a refcount for all used objects. If that fails then + * no KICK callback can be performed. + */ + for_each_object(p, s, addr) { + int i = slab_index(p, s, addr); + + if (test_bit(i, map)) + if (!s->ops->get_reference(s, p)) + __clear_bit(i, map); + } + + /* + * Got references. Now we can drop the slab lock. The slab + * is frozen so it cannot vanish from under us nor will + * allocations be performed on the slab. Unlocking the + * slab will allow blocked slab_frees that caused failure of + * get_reference() to proceed. + */ + slab_unlock(page); + local_irq_restore(flags); + + /* + * Perform the KICK callbacks to remove the objects. This is + * expected to remove objects in the slab. There is no + * return code. We can check the slab to see if all + * were freed or not. + */ + for_each_object(p, s, addr) + if (test_bit(slab_index(p, s, addr), map)) + s->ops->kick_object(s, p); + + /* + * Check the result and unfreeze the slab + */ + local_irq_save(flags); + slab_lock(page); + leftover = page->inuse; + unfreeze_slab(s, page); + local_irq_restore(flags); + return leftover; +} + +/* + * Get a page off a list and freeze it. Must be holding slab lock. + */ +static void freeze_from_list(struct kmem_cache *s, struct page *page) +{ + if (page->inuse < s->objects) + remove_partial(s, page); + else if (s->flags & SLAB_STORE_USER) + remove_full(s, page); + SetSlabFrozen(page); +} + +/* + * Attempt to free objects in a page. Return 1 if succesful. + */ +int kmem_cache_vacate(struct page *page) +{ + unsigned long flags; + struct kmem_cache *s; + int vacated = 0; + + /* + * Get a reference to the page. Return if its freed or being freed. + * This is necessary to make sure that the page does not vanish + * from under us even if it should be freed because it was emptied + */ + if (!get_page_unless_zero(page)) + return 0; + + /* Check that this is truly a slab page */ + if (!PageSlab(page)) + goto out; + + local_irq_save(flags); + slab_lock(page); + + /* + * We may now have locked a page that may be in various stages of + * being freed. If the PageSlab bit is off then we have already + * reached the page allocator. If page->inuse is zero then we are + * in SLUB but freeing or allocating the page. + * page->inuse is never modified without the slab lock held. + * + * Also abort if the page happens to be already frozen. If its + * frozen then list operations cannot be performed on the slab. + */ + if (!PageSlab(page) || SlabFrozen(page) || !page->inuse) { + slab_unlock(page); + local_irq_restore(flags); + goto out; + } + + /* + * We are holding a lock on a slab page and block on allocations + * and frees. + */ + s = page->slab; + if (!s->ops->get_reference || !s->ops->kick_object) { + slab_unlock(page); + goto out; + } + freeze_from_list(s, page); + vacated = __kmem_cache_vacate(s, page, flags) == 0; +out: + put_page(page); + return vacated; +} + +/* * kmem_cache_shrink removes empty slabs from the partial lists and sorts * the remaining slabs by the number of items in use. The slabs with the * most items in use come first. New allocations will then fill those up @@ -2337,11 +2484,12 @@ int kmem_cache_shrink(struct kmem_cache int node; int i; struct kmem_cache_node *n; - struct page *page; + struct page *page, *page2; struct page *t; struct list_head *slabs_by_inuse = kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); unsigned long flags; + LIST_HEAD(zaplist); if (!slabs_by_inuse) return -ENOMEM; @@ -2392,8 +2540,43 @@ int kmem_cache_shrink(struct kmem_cache for (i = s->objects - 1; i >= 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); + if (!s->ops->get_reference || !s->ops->kick_object) + goto out; + + /* Take objects with just a few objects off the tail */ + while (n->nr_partial > MAX_PARTIAL) { + page = container_of(n->partial.prev, struct page, lru); + + /* + * We are holding the list_lock so we can only + * trylock the slab + */ + if (page->inuse > s->objects / 4) + break; + + if (!slab_trylock(page)) + break; + + /* + * Manual freeze here in order to be able to + * use list_move + */ + list_move(&page->lru, &zaplist); + n->nr_partial--; + SetSlabFrozen(page); + slab_unlock(page); + } out: spin_unlock_irqrestore(&n->list_lock, flags); + + /* Now we can free objects in the slabs on the zaplist */ + list_for_each_entry_safe(page, page2, &zaplist, lru) { + unsigned long flags; + + local_irq_save(flags); + slab_lock(page); + __kmem_cache_vacate(s, page, flags); + } } kfree(slabs_by_inuse); @@ -3226,6 +3409,20 @@ static ssize_t ops_show(struct kmem_cach x += sprint_symbol(buf + x, (unsigned long)s->ops->dtor); x += sprintf(buf + x, "\n"); } + + if (s->ops->get_reference) { + x += sprintf(buf + x, "get_reference : "); + x += sprint_symbol(buf + x, + (unsigned long)s->ops->get_reference); + x += sprintf(buf + x, "\n"); + } + + if (s->ops->kick_object) { + x += sprintf(buf + x, "kick_object : "); + x += sprint_symbol(buf + x, + (unsigned long)s->ops->kick_object); + x += sprintf(buf + x, "\n"); + } return x; } SLAB_ATTR_RO(ops);