Index: linux-2.6.18-rc4-mm3/mm/slabifier.c
===================================================================
--- linux-2.6.18-rc4-mm3.orig/mm/slabifier.c	2006-08-29 11:12:04.432335257 -0700
+++ linux-2.6.18-rc4-mm3/mm/slabifier.c	2006-08-30 00:16:51.794382871 -0700
@@ -12,16 +12,20 @@
 #include
 #include
 
+#define SLABIFIER_DEBUG
+
 #ifdef SLABIFIER_DEBUG
 #define DBUG_ON(_x) BUG_ON(_x)
 #else
 #define DBUG_ON(_x)
 #endif
 
+
 struct slab {
 	struct slab_cache sc;
 #ifdef CONFIG_SMP
-	int flusher_active;
+	struct mutex flushing;		/* Lock for flusher */
+	atomic_t active_cpus;		/* if >0 then flusher is active */
 	struct work_struct flush;
 #endif
 	atomic_t refcount;		/* Refcount for destroy */
@@ -42,7 +46,9 @@ struct slab {
  *
  * Overloaded fields in struct page:
  *
- * lru -> used to a slab on the lists
+ * lru -> used for a slab on the lists
+ *	In case of an active slab then use lru to store free pointer
+ *	and counter of used objects
  * mapping -> pointer to struct slab
  * index -> pointer to next free object
  * _mapcount -> count number of elements in use
@@ -84,6 +90,17 @@ static __always_inline void set_object_p
 	page->index = (unsigned long)object;
 }
+static __always_inline void *get_active_pointer(struct page *page)
+{
+	return (void *)page->lru.prev;
+}
+
+static __always_inline void set_active_pointer(struct page *page,
+						void *object)
+{
+	page->lru.prev = object;
+}
+
 static __always_inline struct slab *get_slab(struct page *page)
 {
 	return (struct slab *)page->mapping;
 }
@@ -120,6 +137,32 @@ static __always_inline int get_object_co
 	return *object_counter(page);
 }
 
+static __always_inline int *active_counter(struct page *page)
+{
+	return (int *)&page->lru.next;
+}
+
+static __always_inline void inc_active_counter(struct page *page)
+{
+	(*active_counter(page))++;
+}
+
+static __always_inline void dec_active_counter(struct page *page)
+{
+	(*active_counter(page))--;
+}
+
+static __always_inline void set_active_counter(struct page *page,
+						int counter)
+{
+	*active_counter(page) = counter;
+}
+
+static __always_inline int get_active_counter(struct page *page)
+{
+	return *active_counter(page);
+}
+
 /*
  * Locking for each individual slab using the pagelock
  */
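An active slab is taken off every list while a cpu owns it, so the two words of page->lru are free to carry per-cpu bookkeeping; that is what the accessors above exploit. As a minimal userspace sketch of that aliasing (struct fakepage, struct list_head and main() below are invented for illustration and only mirror the helpers added in this hunk):

#include <stdio.h>

struct list_head {
	void *next;
	void *prev;
};

struct fakepage {
	struct list_head lru;	/* list linkage OR (in-use counter, free pointer) */
};

static void *get_active_pointer(struct fakepage *page)
{
	return page->lru.prev;		/* second word holds the free pointer */
}

static void set_active_pointer(struct fakepage *page, void *object)
{
	page->lru.prev = object;
}

static int *active_counter(struct fakepage *page)
{
	return (int *)&page->lru.next;	/* first word holds the in-use count */
}

int main(void)
{
	struct fakepage page = { { NULL, NULL } };
	int object;

	set_active_pointer(&page, &object);
	(*active_counter(&page))++;
	printf("free pointer %p, objects in use %d\n",
	       get_active_pointer(&page), *active_counter(&page));
	return 0;
}
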
@@ -136,14 +179,28 @@ static __always_inline void slab_unlock(
 
 /*
  * Management of partially allocated slabs
  */
+
+/*
+ * Add partial slab to the lists clearing the active flags.
+ *
+ * Slab must be locked on entry. It is no longer locked on exit.
+ */
 static void __always_inline add_partial(struct slab *s, struct page *page)
 {
 	spin_lock(&s->list_lock);
+	ClearPageActive(page);
 	s->nr_partial++;
 	list_add_tail(&page->lru, &s->partial);
 	spin_unlock(&s->list_lock);
+	slab_unlock(page);
 }
 
+/*
+ * Remove a partial slab from the lists.
+ *
+ * Slab must be locked on entry. It is no longer locked
+ * on exit.
+ */
 static void __always_inline remove_partial(struct slab *s, struct page *page)
 {
@@ -151,10 +208,12 @@ static void __always_inline remove_parti
 	list_del(&page->lru);
 	s->nr_partial--;
 	spin_unlock(&s->list_lock);
+	slab_unlock(page);
 }
 
 /*
- * Get a page and remove it from the partial list
+ * Get a page, lock it and remove it from the partial list
+ *
  * Must hold list_lock
  */
 static __always_inline int lock_and_del_slab(struct slab *s,
@@ -163,6 +222,7 @@ static __always_inline int lock_and_del_
 	if (bit_spin_trylock(PG_locked, &page->flags)) {
 		list_del(&page->lru);
 		s->nr_partial--;
+		SetPageActive(page);
 		return 1;
 	}
 	return 0;
@@ -188,7 +248,7 @@ static struct page *get_partial(struct s
 
 	if (likely(node == -1)) {
 		/*
-		 * We can fall back to any other node in order to
+	/	 * We can fall back to any other node in order to
 		 * reduce the size of the partial list.
 		 */
 		list_for_each_entry(page, &s->partial, lru)
@@ -299,7 +359,7 @@ static int on_freelist(struct slab *s, s
 		nr++;
 	}
 
-	if (get_object_counter(page) != s->objects - nr) {
+	if (!PageActive(page) && get_object_counter(page) != s->objects - nr) {
 		printk(KERN_CRIT "slab %s: page %p wrong object count."
 			" counter is %d but counted were %d\n",
 			s->sc.name, page, get_object_counter(page),
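The check above relies on the freelist being threaded through the free objects themselves: each free object stores the pointer to the next free object at word s->offset, so walking the chain yields the number of free objects, which must equal s->objects minus the object counter. An active page now deliberately advertises a full counter, hence the added !PageActive() guard. A self-contained model of such a walk, assuming nothing beyond the in-object linking; OBJECTS, OFFSET, space[] and count_free() are invented names:

#include <stdio.h>

#define OBJECTS	4	/* objects per "slab" in this model */
#define OFFSET	0	/* word index of the free link inside an object */

static void *space[OBJECTS][2];	/* four two-word objects */

/* Walk the chain of free objects, the way on_freelist() does */
static int count_free(void *head)
{
	int nr = 0;

	while (head) {
		void **object = head;

		head = object[OFFSET];	/* link to the next free object */
		nr++;
	}
	return nr;
}

int main(void)
{
	void *head = NULL;
	int i, nr;

	/* Build a freelist: each free object stores the old head at OFFSET */
	for (i = 0; i < OBJECTS; i++) {
		space[i][OFFSET] = head;
		head = space[i];
	}
	nr = count_free(head);
	printf("%d free, so the object counter should read %d\n",
	       nr, OBJECTS - nr);
	return 0;
}
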
@@ -368,108 +428,127 @@ static void __always_inline putback_slab
 
 	inuse = get_object_counter(page);
 
-	if (inuse) {
-		if (inuse < s->objects)
-			add_partial(s, page);
-		slab_unlock(page);
-	} else {
+	if (unlikely(!inuse)) {
+		ClearPageActive(page);
 		slab_unlock(page);
 		discard_slab(s, page);
+	} else {
+		if (unlikely(inuse < s->objects))
+			add_partial(s, page);
+		else {
+			ClearPageActive(page);
+			slab_unlock(page);
+		}
 	}
 }
 
-static void deactivate_slab(struct slab *s, struct page *page, int cpu)
+static void deactivate_slab(struct slab *s, struct page *page)
 {
-	s->active[cpu] = NULL;
-	smp_wmb();
-	ClearPageActive(page);
-	ClearPageReferenced(page);
+	void *freelist;
+	check_active_slab(page);
+	slab_lock(page);
+	freelist = get_active_pointer(page);
+	if (unlikely(freelist)) {
+		/* Merge freelists */
+		if (get_object_pointer(page)) {
+			while (freelist) {
+				void **x = freelist;
+
+				/* Remove object from active freelist */
+				freelist = x[s->offset];
+				/* Push onto object freelist */
+				x[s->offset] = get_object_pointer(page);
+				set_object_pointer(page, x);
+				dec_object_counter(page);
+			}
+		} else {
+			set_object_pointer(page, freelist);
+			set_object_counter(page, get_active_counter(page));
+		}
+
+	}
+	ClearPageReferenced(page);
 	putback_slab(s, page);
 }
 
 /*
- * Acquire the slab lock from the active array. If there is no active
- * slab for this processor then return NULL;
+ * Flush active slab.
+ * Called from IPI handler with interrupts disabled
  */
-static __always_inline struct page *get_and_lock_active(struct slab *s,
-							int cpu)
+static void flush_active(void *d)
 {
-	struct page *page;
+	struct slab *s = d;
+	int cpu = smp_processor_id();
+	struct page *page = s->active[cpu];
 
-redo:
-	page = s->active[cpu];
-	if (unlikely(!page))
-		return NULL;
-	slab_lock(page);
-	if (unlikely(s->active[cpu] != page)) {
-		slab_unlock(page);
-		goto redo;
+	if (page) {
+		s->active[cpu] = NULL;
+		ClearPageReferenced(page);
+		deactivate_slab(s, page);
 	}
-	check_active_slab(page);
-	check_free_chain(s, page);
-	return page;
 }
 
+#ifdef CONFIG_SMP
 /*
- * Flush an active slab back to the lists.
+ * Count and deal with flushing active slabs back to the list if
+ * they have not been used. Must be called from
+ * active cpu.
+ *
+ * Called from IPI handler with interrupts disabled.
  */
-static void flush_active(struct slab *s, int cpu)
+static void check_flush_active(void *d)
 {
-	struct page *page;
-	unsigned long flags;
+	struct slab *s = d;
+	int cpu = smp_processor_id();
+	struct page *page = s->active[cpu];
 
-	local_irq_save(flags);
-	page = get_and_lock_active(s, cpu);
-	if (likely(page))
-		deactivate_slab(s, page, cpu);
-	local_irq_restore(flags);
+	if (!page)
+		return;
+
+	if (PageReferenced(page)) {
+		ClearPageReferenced(page);
+		atomic_inc(&s->active_cpus);
+	} else {
+		deactivate_slab(s, page);
+		s->active[cpu] = NULL;
+	}
 }
 
-#ifdef CONFIG_SMP
 /*
  * Flush per cpu slabs if they are not in use.
  */
 void flusher(void *d)
 {
 	struct slab *s = d;
-	int cpu = smp_processor_id();
-	struct page *page;
-	int nr_active = 0;
-
-	for_each_online_cpu(cpu) {
-		page = s->active[cpu];
-		if (!page)
-			continue;
-
-		if (PageReferenced(page)) {
-			ClearPageReferenced(page);
-			nr_active++;
-		} else
-			flush_active(s, cpu);
-	}
-	if (nr_active)
+	if (!mutex_trylock(&s->flushing))
+		return;
+	atomic_set(&s->active_cpus, num_online_cpus());
+	on_each_cpu(check_flush_active, s, 1, 1);
+	if (atomic_read(&s->active_cpus))
 		schedule_delayed_work(&s->flush, 10 * HZ);
-	else
-		s->flusher_active = 0;
+	mutex_unlock(&s->flushing);
 }
 
 static void drain_all(struct slab *s)
 {
-	int cpu;
-
-	if (s->flusher_active) {
+	if (atomic_read(&s->active_cpus)) {
+		mutex_lock(&s->flushing);
 		cancel_delayed_work(&s->flush);
-		for_each_possible_cpu(cpu)
-			flush_active(s, cpu);
-		s->flusher_active = 0;
+		atomic_set(&s->active_cpus, 0);
+		on_each_cpu(flush_active, s, 1, 1);
+		mutex_unlock(&s->flushing);
 	}
 }
 #else
 static void drain_all(struct slab *s)
 {
-	flush_active(s, 0);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	flush_active(s);
+	local_irq_restore(flags);
 }
 #endif
@@ -515,7 +594,7 @@ static struct slab_cache *slab_create(st
 	atomic_long_set(&s->nr_slabs, 0);
 	s->nr_partial = 0;
 #ifdef CONFIG_SMP
-	s->flusher_active = 0;
+	atomic_set(&s->active_cpus, 0);
 	INIT_WORK(&s->flush, &flusher, s);
 #endif
 	if (!s->objects)
@@ -525,7 +604,7 @@ static struct slab_cache *slab_create(st
 
 	atomic_set(&s->refcount, 1);
 	spin_lock_init(&s->list_lock);
-
+	mutex_init(&s->flushing);
 	for_each_possible_cpu(cpu)
 		s->active[cpu] = NULL;
 	return &s->sc;
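A single-threaded model of the flushing policy set up above, under the assumption that its essence is a two-chance scheme: a per-cpu slab that was referenced since the last pass gets another interval, an idle one is pushed back to the lists. In the patch the per-cpu visits happen through on_each_cpu() and the re-arm decision uses the atomic active_cpus count; the arrays and flusher_pass() below are invented for illustration:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static bool active[NR_CPUS];		/* cpu has an active slab */
static bool referenced[NR_CPUS];	/* slab used since the last check */

/* One pass of the flusher: returns true if the work must be re-armed */
static bool flusher_pass(void)
{
	bool keep_polling = false;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!active[cpu])
			continue;
		if (referenced[cpu]) {
			/* Recently used: give it another interval */
			referenced[cpu] = false;
			keep_polling = true;
		} else {
			/* Idle since the last pass: push it back to the lists */
			active[cpu] = false;
		}
	}
	return keep_polling;
}

int main(void)
{
	active[0] = referenced[0] = true;	/* cpu 0 busy */
	active[1] = true;			/* cpu 1 idle */

	printf("re-arm after pass 1: %d\n", flusher_pass());
	printf("re-arm after pass 2: %d\n", flusher_pass());
	return 0;
}

The flushing mutex in the patch additionally keeps the periodic flusher and drain_all() from issuing their IPIs concurrently.
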
@@ -546,17 +625,22 @@ static struct page *reload(struct slab *
 	void **last;
 	struct page *page;
 
-redo:
 	/* Racy check. If we mistakenly see no partial slabs then we just
 	 * expand the partial list. If we mistakenly try to get a partial
 	 * slab then get_partials will return NULL.
 	 */
-	if (s->nr_partial) {
+	if (likely(s->nr_partial)) {
 		page = get_partial(s, node);
-		if (page)
+		if (likely(page))
 			goto gotpage;
 	}
 
+	/*
+	 * We may enable interrupts which may result in another
+	 * slab operation on the cache. Setting active slab to NULL
+	 * will allow the other slab operation to allocate a page.
+	 */
+	s->active[cpu] = NULL;
 	if ((flags & __GFP_WAIT)) {
 		local_irq_enable();
 		page = new_slab(s, flags, node);
@@ -579,32 +663,38 @@ redo:
 	last[s->offset] = NULL;
 	set_object_counter(page, 0);
 	slab_lock(page);
+	SetPageActive(page);
 	check_free_chain(s, page);
 
+	/* We dropped the lock .... */
+	if (s->active[cpu]) {
+
+		add_partial(s, page);
+		return s->active[cpu];
+	}
 gotpage:
 	/*
-	 * Now we have a page that is isolated from the lists and locked,
+	 * Now we have a page that is isolated from the lists,
 	 */
-	SetPageActive(page);
 	ClearPageReferenced(page);
 
-	if (cmpxchg(&s->active[cpu], NULL, page) != NULL) {
-
-		ClearPageActive(page);
-		add_partial(s, page);
-		slab_unlock(page);
-
-		page = get_and_lock_active(s, cpu);
-		if (page)
-			return page;
-		goto redo;
-	}
+	/*
+	 * An active page will appear to slab_free like
+	 * a full page but will have a shadow freelist
+	 * and a shadow counter.
+	 */
+	set_active_pointer(page, get_object_pointer(page));
+	set_object_pointer(page, NULL);
+	set_active_counter(page, get_object_counter(page));
+	set_object_counter(page, s->objects);
+	s->active[cpu] = page;
+	slab_unlock(page);
 	check_free_chain(s, page);
 
 #ifdef CONFIG_SMP
-	if (keventd_up() && !s->flusher_active) {
-		s->flusher_active = 1;
+	if (keventd_up() && !atomic_read(&s->active_cpus)) {
+		atomic_inc(&s->active_cpus);
 		schedule_delayed_work(&s->flush, 10 * HZ);
 	}
 #endif
@@ -618,7 +708,7 @@ static __always_inline void *__slab_allo
 	struct slab *s = (void *)sc;
 	struct page *page;
 	void **object;
-	void *next_object;
+	void **freelist;
 	unsigned long flags;
 	int cpu;
 
@@ -633,29 +723,51 @@ static __always_inline void *__slab_allo
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 
-	page = get_and_lock_active(s, cpu);
-	if (unlikely(!page))
-		goto load;
-
-	while (unlikely(!get_object_pointer(page) ||
-			(node > 0 && page_to_nid(page) != node))) {
-
-		deactivate_slab(s, page, cpu);
+	page = s->active[cpu];
+	if (unlikely(!page)) {
load:
		page = reload(s, cpu, gfpflags, node);
+		s->active[cpu] = page;
 		if (unlikely(!page)) {
 			local_irq_restore(flags);
 			return NULL;
 		}
 	}
+	check_active_slab(page);
+	freelist = get_active_pointer(page);
+	if (unlikely(!freelist)) {
+		/*
+		 * Free list exhausted. Now we need to see if any additional
+		 * frees have occurred in the meantime on this slab. Then
+		 * we use the built up free list.
+		 */
 
-	inc_object_counter(page);
-	object = get_object_pointer(page);
-	next_object = object[s->offset];
-	set_object_pointer(page, next_object);
-	check_free_chain(s, page);
+		if (get_object_pointer(page)) {
+			slab_lock(page);
+			freelist = get_object_pointer(page);
+			set_object_pointer(page, NULL);
+			set_object_counter(page, s->objects);
+			set_active_counter(page, 0);
+			slab_unlock(page);
+		}
+
+		/*
+		 * If the above did not help us then we need a new slab
+		 */
+		if (unlikely(!freelist) ||
+			(node >= 0 && page_to_nid(page) != node)) {
+
+			/* Return a slab unfit for further allocation */
+			deactivate_slab(s, page);
+			goto load;
+		}
+	}
+
+
+	object = freelist;
+	set_active_pointer(page, freelist[s->offset]);
+	inc_active_counter(page);
 	SetPageReferenced(page);
-	slab_unlock(page);
 	local_irq_restore(flags);
 	return object;
 }
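A userspace model of the allocation side that reload() and __slab_alloc() implement above, assuming the essential moves are: hide the page's freelist in the active (shadow) pointer, publish an object counter of s->objects so that other cpus treat the page as full, then pop objects from the shadow list with only interrupts disabled and no page lock. struct fakepage, OFFSET, NOBJ and the helper names are invented for illustration:

#include <stdio.h>

#define OFFSET	0	/* word index of the free link inside an object */
#define NOBJ	3	/* objects per "slab" in this model */

struct fakepage {
	void *freelist;		/* public: what slab_free sees */
	int inuse;		/* public object counter */
	void *active;		/* shadow freelist of the owning cpu */
	int active_inuse;	/* shadow counter */
};

/* Mirror of the hand-over done when a slab becomes the cpu's active slab */
static void make_active(struct fakepage *page, int objects)
{
	page->active = page->freelist;
	page->freelist = NULL;
	page->active_inuse = page->inuse;
	page->inuse = objects;	/* looks full to everybody else */
}

/* Allocation fast path: pop the first object off the shadow freelist */
static void *alloc_fast(struct fakepage *page)
{
	void **object = page->active;

	if (!object)
		return NULL;	/* slow path: refill or deactivate */
	page->active = object[OFFSET];
	page->active_inuse++;
	return object;
}

int main(void)
{
	static void *objects[NOBJ][2];
	struct fakepage page = { NULL, 0, NULL, 0 };
	int i;

	for (i = 0; i < NOBJ; i++) {	/* build the public freelist */
		objects[i][OFFSET] = page.freelist;
		page.freelist = objects[i];
	}
	make_active(&page, NOBJ);
	while (alloc_fast(&page))
		;
	printf("public count %d, shadow count %d\n",
	       page.inuse, page.active_inuse);
	return 0;
}

When the shadow list runs dry the real code first folds any objects that were meanwhile freed to the public list back in under the page lock, and only deactivates and reloads if that yields nothing suitable.
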
@@ -735,6 +847,15 @@ dumpret:
 	}
 
 	local_irq_save(flags);
+	if (page == s->active[smp_processor_id()]) {
+		/* fast bypass to local active slab */
+		object[s->offset] = get_active_pointer(page);
+		set_active_pointer(page, object);
+		dec_active_counter(page);
+		local_irq_restore(flags);
+		return;
+	}
+
 	slab_lock(page);
 
 #ifdef SLABIFIER_DEBUG
@@ -758,7 +879,6 @@
 	if (unlikely(get_object_counter(page) == 0)) {
 		remove_partial(s, page);
 		check_free_chain(s, page);
-		slab_unlock(page);
 		discard_slab(s, page);
 		goto out;
 	}
@@ -769,9 +889,10 @@
 		 * object now. So move to the partial list.
 		 */
 		add_partial(s, page);
-
+	else
 out_unlock:
-	slab_unlock(page);
+		slab_unlock(page);
+
 out:
 	local_irq_restore(flags);
 }
@@ -890,12 +1011,6 @@ static int slab_shrink(struct slab_cache
 		if (!page)
 			break;
 
-		/*
-		 * Pin page so that slab_free will not free even if we
-		 * drop the slab lock.
-		 */
-		SetPageActive(page);
-
 		if (get_object_counter(page) < s->objects && move_object)
 			if (move_slab_objects(s,
 				page, move_object) == 0)
@@ -905,7 +1020,6 @@ static int slab_shrink(struct slab_cache
 			 * This will put the slab on the front of the partial
 			 * list, the used list or free it.
 			 */
-			ClearPageActive(page);
 			putback_slab(s, page);
 		}
 		local_irq_restore(flags);
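Finally, the matching free side in the same invented model: an object freed on the owning cpu is pushed onto the shadow freelist without the page lock, which is the fast bypass added to slab_free above, and deactivate_slab() later merges the shadow list back into the public freelist, dropping the public counter once per merged object. Everything below is an illustrative sketch, not code from the patch:

#include <stdio.h>

#define OFFSET	0	/* word index of the free link inside an object */

struct fakepage {
	void *freelist;		/* public freelist and counter */
	int inuse;
	void *active;		/* shadow freelist of the owning cpu */
	int active_inuse;
};

/* Free fast path: push onto the shadow list, no page lock taken */
static void free_fast(struct fakepage *page, void **object)
{
	object[OFFSET] = page->active;
	page->active = object;
	page->active_inuse--;
}

/* Deactivation: walk the shadow list and push each object onto the
 * public freelist, dropping the public counter for each one */
static void merge_freelists(struct fakepage *page)
{
	void *shadow = page->active;

	while (shadow) {
		void **object = shadow;

		shadow = object[OFFSET];
		object[OFFSET] = page->freelist;
		page->freelist = object;
		page->inuse--;
	}
	page->active = NULL;
}

int main(void)
{
	static void *objects[2][2];
	struct fakepage page = { NULL, 2, NULL, 2 };	/* two objects in use */

	free_fast(&page, objects[0]);
	free_fast(&page, objects[1]);
	merge_freelists(&page);
	printf("in use after merge: %d\n", page.inuse);
	return 0;
}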