Index: linux-2.6.18-rc4/mm/slabifier.c
===================================================================
--- linux-2.6.18-rc4.orig/mm/slabifier.c	2006-08-22 21:57:46.982809406 -0700
+++ linux-2.6.18-rc4/mm/slabifier.c	2006-08-23 11:56:44.603652940 -0700
@@ -23,18 +23,17 @@
 struct slab {
 	struct slab_cache sc;
 
-	spinlock_t list_lock;
-	struct list_head partial;	/* List of partially allocated slabs */
-	struct list_head full;		/* Fully allocated slabs */
-	unsigned long nr_partial;	/* Partial slabs */
-	unsigned long nr_slabs;		/* Total slabs used */
+	struct work_struct flush;
+	ZONE_PADDING(slab_pad);		/* Align to cacheline boundary */
 	int size;			/* Slab size */
 	int offset;			/* Free pointer offset. */
 	int objects;			/* Number of objects in slab */
 	atomic_t refcount;		/* Refcount for destroy */
-	/* Flusher related data */
+	atomic_long_t nr_slabs;		/* Total slabs used */
+	spinlock_t list_lock;
+	struct list_head partial;	/* List of partially allocated slabs */
+	unsigned long nr_partial;	/* Partial slabs */
 	int flusher_active;
-	struct work_struct flush;
 	struct page *active[NR_CPUS];	/* Per CPU slabs list protected by
 					 * page lock
 					 */
@@ -120,28 +119,95 @@ static int get_object_counter(struct pag
 }
 
 /*
- * For a given active page get the corresponding cpu */
-static int get_active_cpu(struct page *page)
+ * Locking for each individual slab using the pagelock
+ */
+static __always_inline void slab_lock(struct page *page)
 {
-	return (unsigned long)(page->lru.prev);
+	bit_spin_lock(PG_locked, &page->flags);
 }
 
-static void set_active_cpu(struct page *page, unsigned long cpu)
+static __always_inline void slab_unlock(struct page *page)
 {
-	page->lru.prev = (void *)cpu;
+	bit_spin_unlock(PG_locked, &page->flags);
+}
+
+static void add_partial(struct slab *s, struct page *page)
+{
+	spin_lock(&s->list_lock);
+	s->nr_partial++;
+	list_add_tail(&page->lru, &s->partial);
+	spin_unlock(&s->list_lock);
+}
+
+static void remove_partial(struct slab *s, struct page *page)
+{
+	spin_lock(&s->list_lock);
+	list_del(&page->lru);
+	s->nr_partial--;
+	spin_unlock(&s->list_lock);
 }
 
 /*
- * Locking for each individual slab using the pagelock
+ * Get a page and remove it from the partial list
+ * Must hold list_lock
  */
-static void slab_lock(struct page *page)
+static int lock_and_del_slab(struct slab *s, struct page *page)
 {
-	bit_spin_lock(PG_locked, &page->flags);
+	if (bit_spin_trylock(PG_locked, &page->flags)) {
+		list_del(&page->lru);
+		s->nr_partial--;
+		return 1;
+	}
+	return 0;
 }
 
-static void slab_unlock(struct page *page)
+/*
+ * Get a partial page, lock it and return it.
+ */
+static struct page *get_partial(struct slab *s, int node)
 {
-	bit_spin_unlock(PG_locked, &page->flags);
+	struct page *page;
+	struct list_head *h;
+	int wanted_node;
+
+	spin_lock(&s->list_lock);
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Search for slab on the right node
+	 *
+	 * This search is a scalability concern. Searching big
+	 * lists under lock can cause latencies.
+	 *
+	 * On the other hand picking the right slab that
+	 * is from the node where we are and maybe even
+	 * from the same cpu as before is very good
+	 * for latency.
+	 */
+	wanted_node = node < 0 ? numa_node_id() : node;
+	list_for_each(h, &s->partial) {
+		page = container_of(h, struct page, lru);
+
+		if (likely(page_to_nid(page) == wanted_node) &&
+				lock_and_del_slab(s, page))
+			goto out;
+	}
+
+	if (node >= 0)
+		goto fail;
+
+#endif
+	list_for_each(h, &s->partial) {
+		page = container_of(h, struct page, lru);
+
+		if (lock_and_del_slab(s, page))
+			goto out;
+	}
+fail:
+	page = NULL;
+out:
+	spin_unlock(&s->list_lock);
+	return page;
 }
 
 static void check_slab(struct page *page)
@@ -170,18 +236,12 @@ static void check_active_slab(struct pag
 
 /*
  * Discard an unused slab page
- * Must hold list_lock.
- * Cannot hold the slab lock since the page is going away.
  */
 static void discard_slab(struct slab *s, struct page *page)
 {
-	TPRINTK(KERN_CRIT "slab %s free %p page_alloc=%p free=%p\n", s->sc.name, page,
-		s->sc.page_alloc, s->sc.page_alloc->free);
-
 	DBUG_ON(PageActive(page));
 	DBUG_ON(PageLocked(page));
-	list_del(&page->lru);
-	s->nr_slabs--;
+	atomic_long_dec(&s->nr_slabs);
 
 	/* Restore page state */
 	page->mapping = NULL;	/* was used for slab pointer */
@@ -194,50 +254,39 @@ static void discard_slab(struct slab *s,
 }
 
 /*
- * Move a page back to the lists. This can be an active page or a page
- * that was taken off the list for another purpose.
+ * Move a page back to the lists.
  *
  * Must be called with the slab lock held.
  * On exit the slab lock will have been dropped.
  */
-static void deactivate_slab(struct slab *s, struct page *page)
+static void putback_slab(struct slab *s, struct page *page)
 {
 	int inuse;
-#ifdef SLABIFIER_DEBUG
-	void *objp;
-	int cpu = get_active_cpu(page);
-#endif
-	spin_lock(&s->list_lock);
-	s->active[get_active_cpu(page)] = NULL;
-	ClearPageActive(page);
-	ClearPageReferenced(page);
 	inuse = get_object_counter(page);
-#ifdef SLABIFIER_DEBUG
-	/*
-	 * Must get this before dropping slab lock otherwise others
-	 * may already be freeing objects in the page again.
-	 */
-	objp = get_object_pointer(page);
-#endif
-	slab_unlock(page);
+
+	TPRINTK(KERN_CRIT "putback_slab %s: %p %d/%d\n",s->sc.name, page, inuse, s->objects);
 	if (inuse) {
-		if (inuse < s->objects) {
-			DBUG_ON(!objp);
-			TPRINTK(KERN_CRIT "slab %s: %p partial %d/%d %d cpu=%d\n",s->sc.name, page, inuse, s->objects, contended, cpu);
-			s->nr_partial++;
-			list_add(&page->lru, &s->partial);
-		} else {
-			DBUG_ON(objp);
-			TPRINTK(KERN_CRIT "slab %s: %p full %d cpu=%d\n",s->sc.name, page, contended, cpu);
-			list_add_tail(&page->lru, &s->full);
-		}
+		if (inuse < s->objects)
+			add_partial(s, page);
+		slab_unlock(page);
 	} else {
-		/* For discard_slab we must have the slab on some list */
-		list_add_tail(&page->lru, &s->full);
+		slab_unlock(page);
 		discard_slab(s, page);
 	}
-	spin_unlock(&s->list_lock);
+}
+
+/*
+ * Make the current active page inactive
+ */
+static void deactivate_slab(struct slab *s, struct page *page, int cpu)
+{
+	s->active[cpu] = NULL;
+	smp_wmb();
+	ClearPageActive(page);
+	ClearPageReferenced(page);
+
+	putback_slab(s, page);
 }
 
 static int check_valid_pointer(struct slab *s, struct page *page, void *object, void *origin)
 {
@@ -245,10 +294,8 @@ static int check_valid_pointer(struct sl
 #ifdef SLABIFIER_DEBUG
 	void *base = page_address(page);
 
-	check_slab(page);
-
 	if (object < base || object >= base + s->objects * s->size) {
-		printk(KERN_CRIT "slab %s size %d: pointer %p->%p\nnot in "
+		printk(KERN_CRIT "slab %s size %d: pointer %p->%p\nnot in"
 			" range (%p-%p) in page %p\n", s->sc.name, s->size,
 			origin, object, base, base + s->objects * s->size,
 			page);
@@ -279,7 +326,7 @@ static int on_freelist(struct slab *s, s
 
 	check_slab(page);
 
-	while (object) {
+	while (object && nr <= s->objects) {
 		if (object == search)
 			return 1;
 		if (!check_valid_pointer(s, page, object, origin))
@@ -292,7 +339,7 @@ static int on_freelist(struct slab *s, s
 	if (get_object_counter(page) != s->objects - nr) {
 		printk(KERN_CRIT "slab %s: page %p wrong object count."
 			" counter is %d but counted were %d\n",
-			s->sc.name, page, get_object_counter(page), nr);
+			s->sc.name, page, get_object_counter(page), s->objects - nr);
 try_recover:
 		printk(KERN_CRIT "****** Trying to continue by marking "
 			"all objects used (memory leak!)\n");
@@ -312,17 +359,16 @@ void check_free_chain(struct slab *s, st
 /*
  * Allocate a new slab and prepare an empty freelist
  * and the basic struct page settings.
+ * Return with the slab locked.
  */
-static struct page *new_slab(struct slab *s, gfp_t flags)
+static struct page *new_slab(struct slab *s, gfp_t flags, int node)
 {
 	void *p, *start, *end;
 	void **last;
 	struct page *page;
 
-	TPRINTK(KERN_CRIT "add slab %s flags=%x\n", s->sc.name, flags);
-
 	page = s->sc.page_alloc->allocate(s->sc.page_alloc, s->sc.order,
-		flags, s->sc.node);
+		flags, node < 0 ? s->sc.node : node);
 	if (!page)
 		return NULL;
 
@@ -341,6 +387,8 @@ static struct page *new_slab(struct slab
 	__SetPageSlab(page);
 	check_free_chain(s, page);
 	add_zone_page_state(page_zone(page), NR_SLAB, 1 << s->sc.order);
+	atomic_long_inc(&s->nr_slabs);
+	slab_lock(page);
 	return page;
 }
 
@@ -348,8 +396,7 @@ static struct page *new_slab(struct slab
  * Acquire the slab lock from the active array. If there is no active
  * slab for this processor then return NULL;
  */
-static struct page *get_and_lock_active(struct slab *s, int cpu)
-{
+static __always_inline struct page *get_and_lock_active(struct slab *s, int cpu) {
 	struct page *page;
 
 redo:
@@ -362,6 +409,7 @@ redo:
 		goto redo;
 	}
 	check_active_slab(page);
+	check_free_chain(s, page);
 	return page;
 }
 
@@ -377,7 +425,7 @@ static void flush_active(struct slab *s,
 	unsigned long flags;
 	local_irq_save(flags);
 	page = get_and_lock_active(s, cpu);
 	if (likely(page))
-		deactivate_slab(s, page);
+		deactivate_slab(s, page, cpu);
 	local_irq_restore(flags);
 }
@@ -450,6 +498,8 @@ static struct slab_cache *slab_create(st
 	struct slab *s = (void *)x;
 	int cpu;
 
+	BUG_ON(sizeof(struct slab_control) < sizeof(struct slab));
+
 	memcpy(&x->sc, sc, sizeof(struct slab_cache));
 
 	s->size = ALIGN(sc->size, sizeof(void *));
@@ -459,7 +509,7 @@ static struct slab_cache *slab_create(st
 
 	s->offset = sc->offset / sizeof(void *);
 	s->objects = (PAGE_SIZE << sc->order) / s->size;
-	s->nr_slabs = 0;
+	atomic_long_set(&s->nr_slabs, 0);
 	s->nr_partial = 0;
 	s->flusher_active = 0;
 
@@ -467,7 +517,6 @@ static struct slab_cache *slab_create(st
 		return NULL;
 
 	INIT_LIST_HEAD(&s->partial);
-	INIT_LIST_HEAD(&s->full);
 	atomic_set(&s->refcount, 1);
 	spin_lock_init(&s->list_lock);
 
@@ -486,117 +535,117 @@ static struct slab_cache *slab_create(st
 *
 * Return NULL if we cannot reload.
 */
-static struct page *reload(struct slab *s, unsigned long cpu, gfp_t flags)
+static struct page *reload(struct slab *s, unsigned long cpu, gfp_t flags,
+	int node)
 {
	struct page *page;
 
 redo:
-	if (unlikely(list_empty(&s->partial))) {
-		/* Add more slabs to the partial list */
-		if ((flags & __GFP_WAIT)) {
-			local_irq_enable();
-			page = new_slab(s, flags);
-			local_irq_disable();
-		} else
-			page = new_slab(s, flags);
-
-		if (!page)
-			return NULL;
+	if (s->nr_partial) {	/* Racy check. If we do a useless allocation then
+				   we just build up the partial list */
+		page = get_partial(s, node);
+		if (page)
+			goto gotpage;
+	}
 
-		spin_lock(&s->list_lock);
-		s->nr_slabs++;
+	if ((flags & __GFP_WAIT)) {
+		local_irq_enable();
+		page = new_slab(s, flags, node);
+		local_irq_disable();
 	} else
-		spin_lock(&s->list_lock);
-	page = NULL;	/* Help compiler to not get confused */
-	/* Recheck */
-	if (unlikely(list_empty(&s->partial))) {
-		/* Another processor drained the list */
-		spin_unlock(&s->list_lock);
-		goto redo;
-	page = lru_to_first_page(&s->partial);
-	list_del(&page->lru);
-	/* Search list for page from the correct node */
-	s->nr_partial--;
-	}
+		page = new_slab(s, flags, node);
+
+	if (!page)
+		return NULL;
+gotpage:
 	/*
-	 * Now we have a page that is isolated from the lists
-	 * and we hold the list lock. So no one can modify
-	 * active slab pointers.
+	 * Now we have a page that is isolated from the lists and
+	 * locked.
 	 */
+	SetPageActive(page);
+	ClearPageReferenced(page);
+
+	/*
+	 * Barrier is needed so that a racing process never
+	 * sees a page that has active not set.
+	 */
+	smp_wmb();
+
+	if (cmpxchg(&s->active[cpu], NULL, page) != NULL) {
+
+		TPRINTK(KERN_CRIT "active already provided %s\n", s->sc.name);
+
+		ClearPageActive(page);
+		add_partial(s, page);
+		slab_unlock(page);
 
-	if (unlikely(s->active[cpu])) {
-		/* Someone else created a new slab here */
-		list_add(&page->lru,&s->partial);
-		s->nr_partial++;
-		spin_unlock(&s->list_lock);
 		page = get_and_lock_active(s, cpu);
 		if (page)
			return page;
 		goto redo;
 	}
-	/*
-	 * Lock inversion. This works because s->active[cpu] is null. No one else
-	 * can acquire the lock. However, we must insure that the lock bit becomes
-	 * visible before the update to s->active[cpu]. Thus the write barrier here.
-	 * get_and_lock active uses a lock there which gives us the implicit
-	 * corresponding smb_rmb() barrier.
-	 */
-	slab_lock(page);
-	SetPageActive(page);
-	ClearPageReferenced(page);
-	set_active_cpu(page, cpu);
-	smp_wmb();
-	s->active[cpu] = page;
 	check_free_chain(s, page);
-	spin_unlock(&s->list_lock);
 
 	if (keventd_up() && !s->flusher_active &&
 			s->size != (PAGE_SIZE << s->sc.order))
 		schedule_delayed_work(&s->flush, 10 * HZ);

 	return page;
 }
+
 /*
 * If the gfp mask has __GFP_WAIT set then slab_alloc() may enable interrupts
 * if it needs to acquire more pages for new slabs.
 */
-static void *slab_alloc(struct slab_cache *sc, gfp_t gfpflags)
+static __always_inline void *__slab_alloc(struct slab_cache *sc, gfp_t gfpflags,
+	int node)
 {
	struct slab *s = (void *)sc;
-	int cpu = smp_processor_id();
	struct page *page;
-	void **object = NULL;
+	void **object;
	void *next_object;
	unsigned long flags;
+	int cpu = smp_processor_id();
 
	local_irq_save(flags);
+	page = get_and_lock_active(s, cpu);
+	if (unlikely(!page))
+		goto load;
 
-	do {
-		page = get_and_lock_active(s, cpu);
-
-		if (unlikely(!page)) {
-			page = reload(s, cpu, gfpflags);
+	while (unlikely(!get_object_pointer(page) ||
+		(node > 0 && page_to_nid(page) != node))) {
 
-			if (!page)
-				goto out;
+		/* Current slab is unfit for allocation */
+		deactivate_slab(s, page, cpu);
+load:
+		/* Get a new slab */
+		page = reload(s, cpu, gfpflags, node);
+		if (!page) {
+			local_irq_restore(flags);
+			return NULL;
		}
-
-		object = get_object_pointer(page);
-	} while (!object);
+	}
 
	inc_object_counter(page);
+	object = get_object_pointer(page);
	next_object = object[s->offset];
	set_object_pointer(page, next_object);
-	if (likely(!next_object))
-		/* Sorry, fully allocated slab! */
-		deactivate_slab(s, page);
-	else
-		SetPageReferenced(page);
	check_free_chain(s, page);
+	SetPageReferenced(page);
	slab_unlock(page);
-out:
	local_irq_restore(flags);
	return object;
+
+}
+
+static void *slab_alloc(struct slab_cache *sc, gfp_t gfpflags)
+{
+	return __slab_alloc(sc, gfpflags, -1);
+}
+
+static void *slab_alloc_node(struct slab_cache *sc, gfp_t gfpflags, int node)
+{
+	return __slab_alloc(sc, gfpflags, node);
 }
 
 /* Figure out on which slab object the object resides */
@@ -708,11 +757,11 @@ dumpret:
 		 * We deallocated all objects in a slab and the slab
 		 * is not under allocation. So we can free it.
 		 */
-		spin_lock(&s->list_lock);
+		if (s->objects > 1)
+			remove_partial(s, page);
 		check_free_chain(s, page);
 		slab_unlock(page);
 		discard_slab(s, page);
-		spin_unlock(&s->list_lock);
 		goto out;
 	}
 	if (unlikely(!prior)) {
@@ -722,10 +771,7 @@ dumpret:
 		 * This will increase the chances of the first object
 		 * to be reused soon. Its likely cache hot.
 		 */
-		spin_lock(&s->list_lock);
-		list_move(&page->lru, &s->partial);
-		s->nr_partial++;
-		spin_unlock(&s->list_lock);
+		add_partial(s, page);
 	}
 out_unlock:
 	slab_unlock(page);
@@ -803,13 +849,16 @@ static int move_slab_objects(struct slab
 				 * Drop the lock here to allow the
 				 * move_object function to do things
 				 * with the slab_cache and maybe this
-				 * page
-				 */
+				 * page.
+				 *
+				 */
 				slab_unlock(page);
+				local_irq_enable();
 				if (move_objects((struct slab_cache *)s, p))
 					slab_free(&s->sc, p);
 				else
 					unfreeable++;
+				local_irq_disable();
 				slab_lock(page);
 			}
 		}
@@ -846,22 +895,12 @@ static int slab_shrink(struct slab_cache
 
 	drain_all(s);
 
+	local_irq_save(flags);
 	for(i = 0; s->nr_partial > 1 && i < s->nr_partial - 1; i++ ) {
 		struct page * page;
 
-		/* Take one page off the list */
-		spin_lock_irqsave(&s->list_lock, flags);
-
-		if (s->nr_partial == 0) {
-			spin_unlock_irqrestore(&s->list_lock, flags);
-			break;
-		}
-
-		page = lru_to_last_page(&s->partial);
-		s->nr_partial--;
-		list_del(&page->lru);
-		SetPageActive(page);	/* Pin page so that slab_free will not free */
-		spin_unlock_irqrestore(&s->list_lock, flags);
+		page = get_partial(s, -1);
+		SetPageActive(page);	/* Pin page so that slab_free will not free */
 
 		/*
 		 * Ok. The page cannot become active anymore.
@@ -878,8 +917,9 @@ static int slab_shrink(struct slab_cache
 		 * This will put the slab on the front of the partial
 		 * list, the used list or free it.
 		 */
-		deactivate_slab(s, page);
+		putback_slab(s, page);
 	}
+	local_irq_restore(flags);
 
 	return slabs_freed;
 
@@ -917,12 +957,13 @@ static int slab_destroy(struct slab_cach
 		return 0;
 
 	TPRINTK("Slab destroy %s\n",sc->name);
+
 	drain_all(s);
 
-	free_list(s, &s->full);
+	/* There may be empty slabs on the partial list */
 	free_list(s, &s->partial);
 
-	if (s->nr_slabs)
+	if (atomic_long_read(&s->nr_slabs))
 		return 1;
 
 	/* Just to make sure that no one uses this again */
@@ -967,51 +1008,8 @@ static unsigned long slab_objects(struct
 	if (p_active)
 		*p_active = active;
 
-	return active + partial + count_objects(s, &s->full);
-}
-
-static void *slab_alloc_node(struct slab_cache *sc, gfp_t flags, int node)
-{
-	struct slab *s = (void *)sc;
-	int cpu = node_to_first_cpu(node);
-	struct page *page;
-	void **object = NULL;
-	void *next_object;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	do {
-		page = get_and_lock_active(s, cpu);
-
-		if (unlikely(!page)) {
-			page = reload(s, cpu, gfpflags);
-
-			if (!page)
-				goto out;
-		}
-		if (page_to_nid(page) != node) {
-			deactivate_slab(page);
-			continue;
-		}
-
-		object = get_object_pointer(page);
-	} while (!object);
-
-	inc_object_counter(page);
-	next_object = object[s->offset];
-	set_object_pointer(page, next_object);
-	if (likely(!next_object))
-		/* Sorry, fully allocated slab! */
-		deactivate_slab(s, page);
-	else
-		SetPageReferenced(page);
-	check_free_chain(s, page);
-	slab_unlock(page);
-out:
-	local_irq_restore(flags);
-	return object;
-	return slab_alloc(sc, flags);
+	return active + partial +
+		(atomic_long_read(&s->nr_slabs) - s->nr_partial) * s->objects;
 }
 
 const struct slab_allocator slabifier_allocator = {
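
Illustrative note (not part of the patch): the subtle step in reload() above is how a freshly allocated, locked slab becomes the per-cpu active slab. The page is initialized and marked active first, smp_wmb() orders those stores, and cmpxchg() installs the page in s->active[cpu] only if no other context installed one in the meantime. The stand-alone, user-space sketch below mimics that hand-off using GCC __atomic builtins; struct fake_page, publish_active() and the fixed object count are made-up stand-ins for illustration, not kernel API.

#include <stdio.h>
#include <stddef.h>

#define NR_CPUS 4

struct fake_page {
	int active;		/* stands in for PageActive()        */
	int objects_free;	/* stands in for the slab's freelist */
};

static struct fake_page *active[NR_CPUS];

/* Try to make "page" the active slab of "cpu"; returns 1 on success. */
static int publish_active(struct fake_page *page, int cpu)
{
	struct fake_page *expected = NULL;

	/* Initialize the page before it becomes visible to other CPUs. */
	page->active = 1;
	page->objects_free = 8;

	/*
	 * Release ordering on the successful exchange plays the role of
	 * smp_wmb(): a CPU that sees the published pointer also sees the
	 * stores above. The CAS mirrors cmpxchg(&s->active[cpu], NULL, page).
	 */
	return __atomic_compare_exchange_n(&active[cpu], &expected, page,
					   0, __ATOMIC_RELEASE,
					   __ATOMIC_RELAXED);
}

int main(void)
{
	struct fake_page page = { 0, 0 };

	if (publish_active(&page, 0))
		printf("cpu 0 active slab has %d free objects\n",
		       active[0]->objects_free);
	else
		printf("lost the race, another active slab is installed\n");
	return 0;
}

The losing path in the sketch corresponds to the cmpxchg() failure branch in reload(), where the patch clears PageActive again, puts the page back on the partial list and retries with whatever slab the other context installed.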