Index: linux-2.6.19-rc1-mm1/mm/slabifier.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/slabifier.c	2006-10-16 18:38:12.387873148 -0700
+++ linux-2.6.19-rc1-mm1/mm/slabifier.c	2006-10-16 20:05:26.549541001 -0700
@@ -27,14 +27,14 @@ struct slab {
 	struct work_struct flush;
 #endif
 	atomic_t refcount;	/* Refcount for destroy */
-	atomic_long_t nr_slabs;	/* Total slabs used */

 	/* Performance critical items follow */
 	int size;		/* Total size of an object */
 	int offset;		/* Free pointer offset. */
 	int objects;		/* Number of objects in slab */
-	spinlock_t list_lock;
-	struct list_head partial;
-	unsigned long nr_partial;
+	int fallback;		/* Last fallback node */
+	atomic_long_t nr_slabs;	/* Total slabs used */
+	atomic_long_t nr_partial;
+	struct page *partial[MAX_NUMNODES];
 	struct page *active[NR_CPUS];
 };
@@ -42,10 +42,6 @@ struct slab {
  * The page struct is used to keep necessary information about a slab.
  * For a compound page the first page keeps the slab state.
  *
- * Lock order:
- *  1. slab_lock(page)
- *  2. slab->list_lock
- *
  * The slabifier assigns one slab for allocation to each processor.
  * Allocations only occur from these active slabs.
  *
@@ -81,99 +77,80 @@ static __always_inline void slab_unlock(
  */
 static void __always_inline add_partial(struct slab *s, struct page *page)
 {
-	spin_lock(&s->list_lock);
-	s->nr_partial++;
-	list_add_tail(&page->lru, &s->partial);
-	spin_unlock(&s->list_lock);
-}
+	int node = page_to_nid(page);
+	struct page *oldpage;

-static void __always_inline remove_partial(struct slab *s,
-						struct page *page)
-{
-	spin_lock(&s->list_lock);
-	list_del(&page->lru);
-	s->nr_partial--;
-	spin_unlock(&s->list_lock);
+	do {
+		oldpage = s->partial[node];
+		page->next = oldpage;
+	} while (cmpxchg(&s->partial[node], oldpage, page) != oldpage);
+	atomic_long_inc(&s->nr_partial);
 }

-/*
- * Lock page and remove it from the partial list
- *
- * Must hold list_lock
- */
-static __always_inline int lock_and_del_slab(struct slab *s,
-						struct page *page)
+static void discard_slab(struct slab *s, struct page *page)
 {
-	if (bit_spin_trylock(PG_locked, &page->flags)) {
-		list_del(&page->lru);
-		s->nr_partial--;
-		return 1;
-	}
-	return 0;
+	atomic_long_dec(&s->nr_slabs);
+
+	page->mapping = NULL;
+	reset_page_mapcount(page);
+	__ClearPageSlab(page);
+	__ClearPageSlabsingle(page);
+
+	s->sc.page_alloc->free(s->sc.page_alloc, page, s->sc.order);
 }

 /*
- * Get a partial page, lock it and return it.
+ * Get a partially allocated or fully free page.
  */
-#ifdef CONFIG_NUMA
 static struct page *get_partial(struct slab *s, int node)
 {
 	struct page *page;
 	int searchnode = (node == -1) ? numa_node_id() : node;
+	int fallback = -1;

-	if (!s->nr_partial)
+redo:
+	if (!atomic_long_read(&s->nr_partial))
 		return NULL;

-	spin_lock(&s->list_lock);
-	/*
-	 * Search for slab on the right node
-	 */
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(page_to_nid(page) == searchnode) &&
-			lock_and_del_slab(s, page))
-				goto out;
+	page = s->partial[searchnode];

-	if (likely(node == -1)) {
+	if (page) {
+		if (cmpxchg(&s->partial[searchnode], page, page->next) != page)
+			goto redo;
+		atomic_long_dec(&s->nr_partial);
 		/*
-		 * We can fall back to any other node in order to
-		 * reduce the size of the partial list.
+		 * If we have gotten a completely empty slab and still have
+		 * pages left on this node then free the slab and take the next.
+		 * This helps to defragment slab pages and avoids the otherwise
+		 * expensive removal of fully freed slabs.
 		 */
-		list_for_each_entry(page, &s->partial, lru)
-			if (likely(lock_and_del_slab(s, page)))
-				goto out;
+		if (!page->inuse && s->partial[searchnode]) {
+			discard_slab(s, page);
+			goto redo;
+		}
+		return page;
 	}

-	/* Nothing found */
-	page = NULL;
-out:
-	spin_unlock(&s->list_lock);
-	return page;
-}
-#else
-static struct page *get_partial(struct slab *s, int node)
-{
-	struct page *page;
-
-	/* Racy check. If we mistakenly see no partial slabs then we
-	 * just allocate an empty slab. If we mistakenly try to get a
-	 * partial slab then get_partials() will return NULL.
-	 */
-	if (!s->nr_partial)
+#ifdef CONFIG_NUMA
+	/* Request for a specific node that we were unable to fulfill */
+	if (node != -1)
 		return NULL;

-	spin_lock(&s->list_lock);
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(lock_and_del_slab(s, page)))
-			goto out;
+	/* OK, we may fall back to other nodes. */
+	fallback = s->fallback;
+	do {
+		if (s->partial[fallback]) {
+			searchnode = fallback;
+			goto redo;
+		}
+		fallback++;
+		if (fallback == MAX_NUMNODES)
+			fallback = 0;

-	/* No slab or all slabs busy */
-	page = NULL;
-out:
-	spin_unlock(&s->list_lock);
-	return page;
-}
+	} while (fallback != s->fallback);
 #endif
-
+	return NULL;
+}

 /*
  * Debugging checks
@@ -224,7 +201,7 @@ static int on_freelist(struct slab *s, s
 {
 	int nr = 0;
 	void **object = page->freelist;
-	void *origin = &page->lru;
+	void *origin = &page->next;

 	if (PageSlabsingle(page))
 		return 0;
@@ -265,18 +242,6 @@ void check_free_chain(struct slab *s, st
 /*
  * Operations on slabs
  */
-static void discard_slab(struct slab *s, struct page *page)
-{
-	atomic_long_dec(&s->nr_slabs);
-
-	page->mapping = NULL;
-	reset_page_mapcount(page);
-	__ClearPageSlab(page);
-	__ClearPageSlabsingle(page);
-
-	s->sc.page_alloc->free(s->sc.page_alloc, page, s->sc.order);
-}
-
 /*
  * Allocate a new slab and prepare an empty freelist and the basic struct
  * page settings.
@@ -488,17 +453,14 @@ new_slab:
 		return page_address(page);
 	}

-	slab_lock(page);
-
 gotpage:
 	if (s->active[cpu]) {
-		slab_unlock(page);
-		discard_slab(s, page);
+		putback_slab(s, page);
 		page = s->active[cpu];
-		slab_lock(page);
 	} else
 		s->active[cpu] = page;

+	slab_lock(page);
 	__SetPageActive(page);
 	check_free_chain(s, page);

@@ -566,27 +528,17 @@ static void slab_free(struct slab_cache
 	page->freelist = object;
 	page->inuse--;

-	if (likely(PageActive(page) || (page->inuse && prior))) {
-out_unlock:
-		slab_unlock(page);
-		local_irq_restore(flags);
-		return;
-	}
-
-	if (!prior) {
+	if (!PageActive(page) && !prior)
 		/*
 		 * Page was fully used before. It will have one free
 		 * object now. So move to the partial list.
 		 */
 		add_partial(s, page);
-		goto out_unlock;
-	}

-	/*
-	 * All object have been freed.
-	 */
-	remove_partial(s, page);
 	slab_unlock(page);
+	local_irq_restore(flags);
+	return;
+
 single_object_slab:
 	discard_slab(s, page);
 	local_irq_restore(flags);
@@ -684,7 +636,7 @@ static struct slab_cache *slab_create(st
 	s->objects = (PAGE_SIZE << sc->order) / s->size;
 	BUG_ON(s->objects > 65535);
 	atomic_long_set(&s->nr_slabs, 0);
-	s->nr_partial = 0;
+	atomic_long_set(&s->nr_partial, 0);
 #ifdef CONFIG_SMP
 	atomic_set(&s->active_cpus, 0);
 	INIT_WORK(&s->flush, &flusher, s);
@@ -692,10 +644,8 @@ static struct slab_cache *slab_create(st
 	if (!s->objects)
 		return NULL;

-	INIT_LIST_HEAD(&s->partial);
-
+	memset(s->partial, 0, sizeof(s->partial));
 	atomic_set(&s->refcount, 1);
-	spin_lock_init(&s->list_lock);
 	mutex_init(&s->flushing);
 	for_each_possible_cpu(cpu)
 		s->active[cpu] = NULL;
@@ -784,6 +734,64 @@ static int move_slab_objects(struct slab
 	return unfreeable;
 }

+struct shrink_info {
+	struct slab *s;
+	int (*move_object)(struct slab_cache *, void *);
+};
+
+/*
+ * Shrink a slab on a node.
+ *
+ * We get all pages off the per node lists and free all that are
+ * empty. The remaining pages are compacted if move_object
+ * is set, otherwise they are simply put back onto the per node
+ * list.
+ */
+static void slab_shrink_node(void *d)
+{
+	struct page *list = NULL;
+	struct page *page;
+	struct shrink_info *si = d;
+
+	while ((page = get_partial(si->s, numa_node_id()))) {
+
+		if (!page->inuse)
+			discard_slab(si->s, page);
+		else
+			if (page->inuse < si->s->objects) {
+				page->next = list;
+				list = page;
+			}
+	}
+
+	if (!list)
+		return;
+
+	if (!si->s->partial[numa_node_id()]) {
+		page = list;
+		list = page->next;
+		add_partial(si->s, page);
+	}
+
+	/*
+	 * Cycle through the remainder of the list and compact or put back slabs
+	 */
+	while (list) {
+		page = list;
+		list = page->next;
+
+		if (!page->inuse)
+			discard_slab(si->s, page);
+		else {
+			if (si->move_object)
+				move_slab_objects(si->s, page, si->move_object);
+
+			if (page->inuse)
+				putback_slab(si->s, page);
+		}
+	}
+}
+
 /*
  * Shrinking drops the active per cpu slabs and also reaps all empty
 * slabs off the partial list. Returns the number of slabs freed.
@@ -798,47 +806,17 @@ static int move_slab_objects(struct slab
  *
  * Returns the number of slabs freed.
  */
-static int slab_shrink(struct slab_cache *sc,
+static void slab_shrink(struct slab_cache *sc,
 	int (*move_object)(struct slab_cache *, void *))
 {
 	struct slab *s = (void *)sc;
-	unsigned long flags;
-	int slabs_freed = 0;
-	int i;
+	struct shrink_info si = { s, move_object };

 	drain_all(s);
-
-	local_irq_save(flags);
-	for(i = 0; s->nr_partial > 1 && i < s->nr_partial - 1; i++ ) {
-		struct page * page;
-
-		page = get_partial(s, -1);
-		if (!page)
-			break;
-
-		/*
-		 * Pin page so that slab_free will not free even if we
-		 * drop the slab lock.
-		 */
-		__SetPageActive(page);
-
-		if (page->inuse < s->objects && move_object)
-			if (move_slab_objects(s,
-				page, move_object) == 0)
-					slabs_freed++;
-
-		/*
-		 * This will put the slab on the front of the partial
-		 * list, the used list or free it.
-		 */
-		__ClearPageActive(page);
-		putback_slab(s, page);
-	}
-	local_irq_restore(flags);
-	return slabs_freed;
-
+	schedule_on_each_node(slab_shrink_node, &si);
 }
+
 static struct slab_cache *slab_dup(struct slab_cache *sc)
 {
 	struct slab *s = (void *)sc;
@@ -847,23 +825,6 @@ static struct slab_cache *slab_dup(struc
 	return &s->sc;
 }

-static int free_list(struct slab *s, struct list_head *list)
-{
-	int slabs_inuse = 0;
-	unsigned long flags;
-	struct page *page, *h;
-
-	spin_lock_irqsave(&s->list_lock, flags);
-	list_for_each_entry_safe(page, h, list, lru)
-		if (!page->inuse) {
-			list_del(&s->partial);
-			discard_slab(s, page);
-		} else
-			slabs_inuse++;
-	spin_unlock_irqrestore(&s->list_lock, flags);
-	return slabs_inuse;
-}
-
 static int slab_destroy(struct slab_cache *sc)
 {
 	struct slab *s = (void *)sc;
@@ -871,8 +832,7 @@ static int slab_destroy(struct slab_cach
 	if (!atomic_dec_and_test(&s->refcount))
 		return 0;

-	drain_all(s);
-	free_list(s, &s->partial);
+	slab_shrink(sc, NULL);

 	if (atomic_long_read(&s->nr_slabs))
 		return 1;
@@ -882,16 +842,22 @@ static int slab_destroy(struct slab_cach
 	return 0;
 }

-static unsigned long count_objects(struct slab *s, struct list_head *list)
+/*
+ * This is racy and may produce weird results. We check the page pointers
+ * carefully to see if they are still valid.
+ */
+static unsigned long count_objects(struct slab *s)
 {
 	int count = 0;
 	struct page *page;
-	unsigned long flags;
+	int node;

-	spin_lock_irqsave(&s->list_lock, flags);
-	list_for_each_entry(page, list, lru)
-		count += page->inuse;
-	spin_unlock_irqrestore(&s->list_lock, flags);
+	for_each_node(node)
+		for (page = s->partial[node];
+				page && pfn_valid(page_to_pfn(page)) &&
+					page->inuse < s->objects;
+				page = page->next)
+			count += page->inuse;
 	return count;
 }

@@ -900,7 +866,7 @@ static unsigned long slab_objects(struct
 		unsigned long *p_partial)
 {
 	struct slab *s = (void *)sc;
-	int partial = count_objects(s, &s->partial);
+	int partial = count_objects(s);
 	int nr_slabs = atomic_read(&s->nr_slabs);
 	int active = 0;			/* Active slabs */
 	int nr_active = 0;		/* Objects in active slabs */
@@ -916,7 +882,7 @@ static unsigned long slab_objects(struct
 	}

 	if (p_partial)
-		*p_partial = s->nr_partial;
+		*p_partial = atomic_long_read(&s->nr_partial);

 	if (p_active)
 		*p_active = nr_active;
@@ -925,7 +891,7 @@ static unsigned long slab_objects(struct
 		*p_total = nr_slabs;

 	return partial + active +
-		(nr_slabs - s->nr_partial - nr_active) * s->objects;
+		(nr_slabs - atomic_long_read(&s->nr_partial) - nr_active) * s->objects;
 }

 const struct slab_allocator slabifier_allocator = {
Index: linux-2.6.19-rc1-mm1/include/linux/allocator.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/allocator.h	2006-10-16 18:38:10.341124236 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/allocator.h	2006-10-16 20:03:36.215526138 -0700
@@ -133,6 +133,7 @@ struct slab_control {
 	struct slab_cache sc;		/* Common information */
 	void *data[50];			/* Some data */
 	void *percpu[NR_CPUS];		/* Some per cpu information. */
+	void *pernode[MAX_NUMNODES];	/* Some per node data */
 };

 struct slab_allocator {
@@ -177,7 +178,7 @@ struct slab_allocator {
	 * return 1 for success. If it return 0 then the object is pinned.
	 * the slab that the object resides on will not be freed.
	 */
-	int (*shrink)(struct slab_cache *,
+	void (*shrink)(struct slab_cache *,
 		int (*move_object)(struct slab_cache *, void *));

 	/*
Index: linux-2.6.19-rc1-mm1/include/linux/mm_types.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/mm_types.h	2006-10-16 18:38:12.389826153 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/mm_types.h	2006-10-16 19:10:47.044777115 -0700
@@ -58,9 +58,12 @@ struct page {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* Slabifier: free object */
 		};
-	struct list_head lru;		/* Pageout list, eg. active_list
+	union {
+		struct list_head lru;	/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
 					 */
+		struct page *next;	/* slabifier: Next free page */
+	};
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
Index: linux-2.6.19-rc1-mm1/include/linux/workqueue.h
===================================================================
--- linux-2.6.19-rc1-mm1.orig/include/linux/workqueue.h	2006-10-04 19:57:05.000000000 -0700
+++ linux-2.6.19-rc1-mm1/include/linux/workqueue.h	2006-10-16 20:04:03.215813870 -0700
@@ -72,6 +72,7 @@ extern int FASTCALL(schedule_delayed_wor
 extern int schedule_delayed_work_on(int cpu, struct work_struct *work,
 					unsigned long delay);
 extern int schedule_on_each_cpu(void (*func)(void *info), void *info);
+extern int schedule_on_each_node(void (*func)(void *info), void *info);
 extern void flush_scheduled_work(void);
 extern int current_is_keventd(void);
 extern int keventd_up(void);
Index: linux-2.6.19-rc1-mm1/kernel/workqueue.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/kernel/workqueue.c	2006-10-16 18:38:04.540700317 -0700
+++ linux-2.6.19-rc1-mm1/kernel/workqueue.c	2006-10-16 20:06:40.397525117 -0700
@@ -531,6 +531,29 @@ int schedule_on_each_cpu(void (*func)(vo
 	return 0;
 }

+int schedule_on_each_node(void (*func)(void *info), void *info)
+{
+	int node;
+	struct work_struct *works;
+
+	works = alloc_percpu(struct work_struct);
+	if (!works)
+		return -ENOMEM;
+
+	mutex_lock(&workqueue_mutex);
+	for_each_online_node(node) {
+		int cpu = node_to_first_cpu(node);
+
+		INIT_WORK(per_cpu_ptr(works, cpu), func, info);
+		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
+				per_cpu_ptr(works, cpu));
+	}
+	mutex_unlock(&workqueue_mutex);
+	flush_workqueue(keventd_wq);
+	free_percpu(works);
+	return 0;
+}
+
 void flush_scheduled_work(void)
 {
 	flush_workqueue(keventd_wq);
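
Usage note (not part of the patch): schedule_on_each_node() queues the callback on one CPU of every online node and flushes keventd before returning, which is what lets slab_shrink_node() use numa_node_id() to pick the per node partial list to work on. Below is a minimal sketch of a caller written against the interface added above; the cache type and helper names (my_cache, my_shrink_node, my_shrink_all_nodes) are invented for illustration only.

	#include <linux/workqueue.h>
	#include <linux/topology.h>
	#include <linux/numa.h>
	#include <linux/mm.h>
	#include <linux/kernel.h>

	/* Hypothetical cache with one partial list per node, like struct slab above. */
	struct my_cache {
		struct page *partial[MAX_NUMNODES];
	};

	/* Per-invocation context, mirroring struct shrink_info in the patch. */
	struct my_shrink_info {
		struct my_cache *cache;
	};

	/*
	 * Runs via keventd on one CPU of each online node, so numa_node_id()
	 * identifies the node whose partial list should be processed.
	 */
	static void my_shrink_node(void *info)
	{
		struct my_shrink_info *si = info;

		printk(KERN_DEBUG "shrinking cache %p on node %d\n",
				si->cache, numa_node_id());
		/* ... walk si->cache->partial[numa_node_id()] here ... */
	}

	static void my_shrink_all_nodes(struct my_cache *cache)
	{
		struct my_shrink_info si = { cache };

		/* Returns only after the work item has run on every online node. */
		schedule_on_each_node(my_shrink_node, &si);
	}

Because the work is queued on node_to_first_cpu(node) for each node and keventd_wq is flushed before schedule_on_each_node() returns, the on-stack shrink_info passed by slab_shrink() stays valid for the whole operation.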