Index: linux-2.6.21-rc1/include/linux/slub_def.h
===================================================================
--- linux-2.6.21-rc1.orig/include/linux/slub_def.h 2007-02-24 11:04:42.000000000 -0800
+++ linux-2.6.21-rc1/include/linux/slub_def.h 2007-02-24 11:05:21.000000000 -0800
@@ -10,22 +10,23 @@
 #include
 #include
 
+struct kmem_cache_node {
+        spinlock_t list_lock;   /* Protect partial list and nr_partial */
+        unsigned long nr_partial;
+        struct list_head partial;
+        atomic_long_t nr_slabs;
+};
+
 /*
  * Slab cache management.
  */
 struct kmem_cache {
-        spinlock_t list_lock;   /* Protecty partial list and nr_partial */
-        struct list_head partial;
-        unsigned long nr_partial;
         int offset;             /* Free pointer offset. */
-        struct page *cpu_slab[NR_CPUS];
-        atomic_long_t nr_slabs[MAX_NUMNODES];
         unsigned int order;
         unsigned long flags;
         int size;               /* Total size of an object */
         int objects;            /* Number of objects in slab */
         atomic_t refcount;      /* Refcount for destroy */
-        int align;
         void (*ctor)(void *, struct kmem_cache *, unsigned long);
         void (*dtor)(void *, struct kmem_cache *, unsigned long);
 
@@ -38,6 +39,8 @@ struct kmem_cache {
         atomic_t cpu_slabs;     /* if >0 then flusher is scheduled */
         struct delayed_work flush;
 #endif
+        struct kmem_cache_node *node[MAX_NUMNODES];
+        struct page *cpu_slab[NR_CPUS];
 };
 
 /*
@@ -119,6 +122,7 @@ static inline struct kmem_cache *kmalloc
 #ifdef CONFIG_ZONE_DMA
 #define SLUB_DMA __GFP_DMA
 #else
+/* Disable DMA slab functionality */
 #define SLUB_DMA 0
 #endif
 
Index: linux-2.6.21-rc1/mm/slub.c
===================================================================
--- linux-2.6.21-rc1.orig/mm/slub.c 2007-02-24 11:04:42.000000000 -0800
+++ linux-2.6.21-rc1/mm/slub.c 2007-02-24 11:06:34.000000000 -0800
@@ -11,11 +11,10 @@
  * Pending pieces:
  *
  * A. Slab defragmentation support
- * B. NUMA cache line optimizations and per node partial lists.
- * C. Lockless allocs via separate freelists for cpu slabs
- * D. Lockless partial list handling
+ * B. Lockless allocs via separate freelists for cpu slabs
+ * C. Lockless partial list handling
  *
- * Futher issues to solve:
+ * Further issues to solve:
  *
  * 1. Support the Slab debugging options
  * 2. Move logic for draining page allocator queues
@@ -30,6 +29,8 @@
 #include
 #include
 #include
+#include <linux/mempolicy.h>
+#include <linux/cpuset.h>
 
 /*
  * Overloading of page flags that are otherwise used for LRU management.
@@ -91,6 +92,12 @@ static void unregister_slab(struct kmem_
 static struct notifier_block slab_notifier;
 #endif
 
+static enum {
+        DOWN,           /* No slab functionality available */
+        PARTIAL,        /* kmem_cache_open() works but kmalloc does not */
+        UP              /* Everything works */
+} slab_state = DOWN;
+
 /********************************************************************
  *                      Core slab cache functions
  *******************************************************************/
@@ -232,19 +239,25 @@ static __always_inline int slab_trylock(
  */
 static void __always_inline add_partial(struct kmem_cache *s, struct page *page)
 {
-        spin_lock(&s->list_lock);
-        s->nr_partial++;
-        list_add_tail(&page->lru, &s->partial);
-        spin_unlock(&s->list_lock);
+        int node = page_to_nid(page);
+        struct kmem_cache_node *n = s->node[node];
+
+        spin_lock(&n->list_lock);
+        n->nr_partial++;
+        list_add_tail(&page->lru, &n->partial);
+        spin_unlock(&n->list_lock);
 }
 
 static void __always_inline remove_partial(struct kmem_cache *s,
                                                 struct page *page)
 {
-        spin_lock(&s->list_lock);
+        int node = page_to_nid(page);
+        struct kmem_cache_node *n = s->node[node];
+
+        spin_lock(&n->list_lock);
         list_del(&page->lru);
-        s->nr_partial--;
-        spin_unlock(&s->list_lock);
+        n->nr_partial--;
+        spin_unlock(&n->list_lock);
 }
 
 /*
@@ -252,78 +265,83 @@ static void __always_inline remove_parti
  *
  * Must hold list_lock
  */
-static __always_inline int lock_and_del_slab(struct kmem_cache *s,
+static __always_inline int lock_and_del_slab(struct kmem_cache_node *n,
                                                 struct page *page)
 {
         if (slab_trylock(page)) {
                 list_del(&page->lru);
-                s->nr_partial--;
+                n->nr_partial--;
                 return 1;
         }
         return 0;
 }
 
 /*
- * Get a partial page, lock it and return it.
+ * Try to get a partial slab from the indicated node
  */
-#ifdef CONFIG_NUMA
-static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *get_partial_node(struct kmem_cache_node *n)
 {
         struct page *page;
-        int searchnode = (node == -1) ? numa_node_id() : node;
 
-        if (!s->nr_partial)
-                return NULL;
-
-        spin_lock(&s->list_lock);
         /*
-         * Search for slab on the right node
+         * Racy check. If we mistakenly see no partial slabs then we
+         * just allocate an empty slab. If we mistakenly try to get a
+         * partial slab then get_partial_node() will return NULL.
          */
-        list_for_each_entry(page, &s->partial, lru)
-                if (likely(page_to_nid(page) == searchnode) &&
-                        lock_and_del_slab(s, page))
-                        goto out;
-
-        if (likely(!(flags & __GFP_THISNODE))) {
-                /*
-                 * We can fall back to any other node in order to
-                 * reduce the size of the partial list.
-                 */
-                list_for_each_entry(page, &s->partial, lru)
-                        if (likely(lock_and_del_slab(s, page)))
-                                goto out;
-        }
+        if (!n->nr_partial)
+                return NULL;
 
-        /* Nothing found */
+        spin_lock(&n->list_lock);
+        list_for_each_entry(page, &n->partial, lru)
+                if (lock_and_del_slab(n, page))
+                        goto out;
        page = NULL;
 out:
-        spin_unlock(&s->list_lock);
+        spin_unlock(&n->list_lock);
         return page;
 }
-#else
+
+static struct page *get_any_partial(struct kmem_cache *s, int node, gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+        struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
+                                        ->node_zonelists[gfp_zone(flags)];
+        struct zone **z;
+        struct page *page;
+        int nid;
+
+        /*
+         * Look through allowed nodes for objects available
+         * from existing per node queues.
+         */
+        for (z = zonelist->zones; *z; z++) {
+                nid = zone_to_nid(*z);
+
+                if (cpuset_zone_allowed_hardwall(*z, flags) &&
+                                s->node[nid]) {
+                        page = get_partial_node(s->node[nid]);
+                        if (page)
+                                return page;
+                }
+        }
+#endif
+        return NULL;
+}
+
+/*
+ * Get a partial page, lock it and return it.
+ */
 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
 {
         struct page *page;
+        int searchnode = (node == -1) ? numa_node_id() : node;
 
-        /*
-         * Racy check. If we mistakenly see no partial slabs then we
-         * just allocate an empty slab.
-         */
-        if (!s->nr_partial)
-                return NULL;
-
-        spin_lock(&s->list_lock);
-        list_for_each_entry(page, &s->partial, lru)
-                if (likely(lock_and_del_slab(s, page)))
-                        goto out;
+        page = get_partial_node(s->node[searchnode]);
+        if (page || (flags & __GFP_THISNODE))
+                return page;
 
-        /* No slab or all slabs busy */
-        page = NULL;
-out:
-        spin_unlock(&s->list_lock);
-        return page;
+        return get_any_partial(s, node, flags);
 }
-#endif
 
 /*
  * Debugging checks
@@ -414,7 +432,10 @@ void check_free_chain(struct kmem_cache
 
 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
-        atomic_long_dec(&s->nr_slabs[page_to_nid(page)]);
+        int node = page_to_nid(page);
+        struct kmem_cache_node *n = s->node[node];
+
+        atomic_long_dec(&n->nr_slabs);
 
         page->mapping = NULL;
         reset_page_mapcount(page);
@@ -427,6 +448,7 @@
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
         struct page *page;
+        struct kmem_cache_node *n;
 
         BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
         if (flags & __GFP_NO_GROW)
@@ -439,12 +461,12 @@ static struct page *new_slab(struct kmem
         if (!page)
                 goto out;
 
+        node = page_to_nid(page);
+        n = s->node[node];
         page->offset = s->offset;
-
-        atomic_long_inc(&s->nr_slabs[page_to_nid(page)]);
-
-        page->slab = (struct kmem_cache *)s;
+        page->slab = s;
         __SetPageSlab(page);
+        atomic_long_inc(&n->nr_slabs);
 
         if (s->objects > 1) {
                 void *start = page_address(page);
@@ -873,13 +895,20 @@ static unsigned long calculate_alignment
         return ALIGN(align, sizeof(void *));
 }
 
-int kmem_cache_open(struct kmem_cache *s,
+static void init_kmem_cache_node(struct kmem_cache_node *n)
+{
+        memset(n, 0, sizeof(struct kmem_cache_node));
+        atomic_long_set(&n->nr_slabs, 0);
+        spin_lock_init(&n->list_lock);
+        INIT_LIST_HEAD(&n->partial);
+}
+
+int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
                 const char *name, size_t size,
                 size_t align, unsigned long flags,
                 void (*ctor)(void *, struct kmem_cache *, unsigned long),
                 void (*dtor)(void *, struct kmem_cache *, unsigned long))
 {
-        int cpu;
         int node;
 
         BUG_ON(flags & SLUB_UNIMPLEMENTED);
@@ -926,23 +955,50 @@ int kmem_cache_open(struct kmem_cache *s
 
         atomic_set(&s->refcount, 1);
 
-        for_each_possible_cpu(cpu)
-                s->cpu_slab[cpu] = NULL;
-
-        for_each_node(node)
-                atomic_long_set(&s->nr_slabs[node], 0);
-        INIT_LIST_HEAD(&s->partial);
-        spin_lock_init(&s->list_lock);
-
 #ifdef CONFIG_SMP
         mutex_init(&s->flushing);
         atomic_set(&s->cpu_slabs, 0);
         INIT_DELAYED_WORK(&s->flush, flusher);
 #endif
+        for_each_online_node(node) {
+                struct kmem_cache_node *n;
+                struct kmem_cache_node boot_kmem_cache_node;
+
+
+                if (slab_state == DOWN) {
+                        /* Solve chicken-egg by providing temporary egg */
+                        n = s->node[node] = &boot_kmem_cache_node;
+                        init_kmem_cache_node(n);
+                }
+
+                n = kmalloc_node(sizeof(struct kmem_cache_node),
+                                gfpflags, node);
+
+                if (!n)
+                        goto undo_alloc_err;
+
+                s->node[node] = n;
+                if (slab_state == DOWN) {
+                        /* Transfer temporary egg contents to real egg */
+                        memcpy(n, &boot_kmem_cache_node, sizeof(*n));
+                        /*
+                         * Note that we are moving a list head here. The list
+                         * contains exactly one slab. Just move that one over
+                         * to the new list.
+                         */
+                        INIT_LIST_HEAD(&n->partial);
+                        list_move(boot_kmem_cache_node.partial.next, &n->partial);
+                } else
+                        init_kmem_cache_node(n);
+        }
 
         register_slab(s);
         return 1;
 
+undo_alloc_err:
+        for_each_online_node(node)
+                kfree(s->node[node]);
+
 error:
         if (flags & SLAB_PANIC)
                 panic("Cannot create slab %s size=%ld realsize=%d "
@@ -1001,20 +1057,21 @@ const char *kmem_cache_name(struct kmem_
 }
 EXPORT_SYMBOL(kmem_cache_name);
 
-static int free_list(struct kmem_cache *s, struct list_head *list)
+static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
+                        struct list_head *list)
 {
         int slabs_inuse = 0;
         unsigned long flags;
         struct page *page, *h;
 
-        spin_lock_irqsave(&s->list_lock, flags);
+        spin_lock_irqsave(&n->list_lock, flags);
         list_for_each_entry_safe(page, h, list, lru)
                 if (!page->inuse) {
                         list_del(&page->lru);
                         discard_slab(s, page);
                 } else
                         slabs_inuse++;
-        spin_unlock_irqrestore(&s->list_lock, flags);
+        spin_unlock_irqrestore(&n->list_lock, flags);
         return slabs_inuse;
 }
 
@@ -1027,12 +1084,21 @@ int kmem_cache_close(struct kmem_cache *
         int node;
 
         flush_all(s);
-        free_list(s, &s->partial);
-        for_each_online_node(node)
-                if (atomic_long_read(&s->nr_slabs[node]))
+        /* Attempt to free all objects */
+        for_each_online_node(node) {
+                struct kmem_cache_node *n = s->node[node];
+
+                free_list(s, n, &n->partial);
+                if (atomic_long_read(&n->nr_slabs))
                         return 1;
+        }
+        /* Free allocated metadata */
+        for_each_online_node(node) {
+                kfree(s->node[node]);
+                s->node[node] = NULL;
+        }
 
         unregister_slab(s);
         return 0;
 }
@@ -1054,18 +1120,23 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 
 static unsigned long count_objects(struct kmem_cache *s,
-                        struct list_head *list, unsigned long *nodes)
+                        unsigned long *nodes)
 {
         int count = 0;
         struct page *page;
         unsigned long flags;
+        int node;
+
+        for_each_online_node(node) {
+                struct kmem_cache_node *n = s->node[node];
 
-        spin_lock_irqsave(&s->list_lock, flags);
-        list_for_each_entry(page, list, lru) {
-                count += page->inuse;
-                nodes[page_to_nid(page)]++;
+                spin_lock_irqsave(&n->list_lock, flags);
+                list_for_each_entry(page, &n->partial, lru) {
+                        count += page->inuse;
+                        nodes[node]++;
+                }
+                spin_unlock_irqrestore(&n->list_lock, flags);
         }
-        spin_unlock_irqrestore(&s->list_lock, flags);
         return count;
 }
 
@@ -1073,16 +1144,20 @@ static unsigned long slab_objects(struct
         unsigned long *p_total, unsigned long *p_cpu_slabs,
         unsigned long *p_partial, unsigned long *nodes)
 {
-        int in_partial_slabs = count_objects(s, &s->partial, nodes);
+        int partial = 0;
+        int in_partial_slabs = count_objects(s, nodes);
         int nr_slabs = 0;
         int cpu_slabs = 0;
         int nr_in_cpu_slabs = 0;
         int cpu;
         int node;
 
-        for_each_online_node(node)
-                nr_slabs += nodes[node] = atomic_read(&s->nr_slabs[node]);
+        for_each_online_node(node) {
+                struct kmem_cache_node *n = s->node[node];
 
+                nr_slabs += nodes[node] = atomic_long_read(&n->nr_slabs);
+                partial += n->nr_partial;
+        }
         for_each_possible_cpu(cpu) {
                 struct page *page = s->cpu_slab[cpu];
 
@@ -1094,7 +1169,7 @@ static unsigned long slab_objects(struct
         }
 
         if (p_partial)
-                *p_partial = s->nr_partial;
+                *p_partial = partial;
 
         if (p_cpu_slabs)
                 *p_cpu_slabs = cpu_slabs;
@@ -1103,7 +1178,7 @@ static unsigned long slab_objects(struct
                 *p_total = nr_slabs;
 
         return in_partial_slabs + nr_in_cpu_slabs +
-                (nr_slabs - s->nr_partial - cpu_slabs) * s->objects;
+                (nr_slabs - partial - cpu_slabs) * s->objects;
 }
 /********************************************************************
@@ -1135,10 +1210,10 @@ static int __init setup_slub_nomerge(cha
 __setup("slub_nomerge", setup_slub_nomerge);
 
 static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
-                const char *name, int size)
+                const char *name, int size, gfp_t flags)
 {
-        if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
+        if (!kmem_cache_open(s, flags, name, size, ARCH_KMALLOC_MINALIGN,
                         0, NULL, NULL))
                 panic("Creation of kmalloc slab %s size=%d failed.\n",
                         name, size);
@@ -1176,7 +1251,7 @@ static struct kmem_cache *get_slab(size_
 #ifdef KMALLOC_EXTRA
                 if (index <= KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW)
 #endif
-                        realsize = 1 << index;
+                        realsize = 1 << (index + KMALLOC_SHIFT_LOW);
 #ifdef KMALLOC_EXTRA
                 else {
                         index -= KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW +1;
@@ -1188,7 +1263,7 @@ static struct kmem_cache *get_slab(size_
 #endif
 
                 text = kasprintf(flags, "kmalloc_dma-%ld", realsize);
-                s = create_kmalloc_cache(x, text, realsize);
+                s = create_kmalloc_cache(x, text, realsize, flags);
                 kfree(text);
                 kmalloc_caches_dma[index] = s;
                 return s;
@@ -1236,12 +1311,6 @@ EXPORT_SYMBOL(kfree);
 
 #define SLAB_MAX_ORDER 4
 
-/*
- * We can actually operate slabs any time after the page allocator is up.
- * slab_is_available() merely means that the kmalloc array is available.
- */
-static enum { DOWN, PARTIAL, UP } slab_state = DOWN;
-
 int slab_is_available(void) {
         return slab_state == UP;
 }
@@ -1249,22 +1318,43 @@ int slab_is_available(void) {
 void __init kmem_cache_init(void) {
         int i;
+        int kmem_cache_node_cache =
+                kmalloc_index(sizeof(struct kmem_cache_node));
+
+        BUG_ON(kmem_cache_node_cache < 0 ||
+                kmem_cache_node_cache > KMALLOC_SHIFT_HIGH);
+
+        /*
+         * Must first have the slab cache available for the allocations of the
+         * struct kmem_cache_node's. There is special bootstrap code in
+         * kmem_cache_open for the situation when slab_state == DOWN.
+         */
+        create_kmalloc_cache(&kmalloc_caches[kmem_cache_node_cache
+                                                - KMALLOC_SHIFT_LOW],
+                        "kmalloc",
+                        1 << kmem_cache_node_cache,
+                        GFP_KERNEL);
+
+        /* Now we are able to allocate the per node structures */
+        slab_state = PARTIAL;
 
         for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+                if (i == kmem_cache_node_cache)
+                        continue;
+
                 create_kmalloc_cache(
                         &kmalloc_caches[i - KMALLOC_SHIFT_LOW],
-                                "kmalloc", 1 << i);
+                                "kmalloc", 1 << i, GFP_KERNEL);
         }
 
-#ifdef KMALLOC_EXTRA
-        slab_state = PARTIAL;
+#ifdef KMALLOC_EXTRA
         /* Caches that are not of the two-to-the-power-of size */
         create_kmalloc_cache(&kmalloc_caches
                 [KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1],
-                "kmalloc-96", 96);
+                "kmalloc-96", 96, GFP_KERNEL);
         create_kmalloc_cache(&kmalloc_caches
                 [KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 2],
-                "kmalloc-192", 192);
+                "kmalloc-192", 192, GFP_KERNEL);
 #endif
 
         slab_state = UP;
@@ -1331,7 +1421,8 @@ struct kmem_cache *kmem_cache_create(con
         if (!s)
                 return NULL;
 
-        if (!kmem_cache_open(s, name, size, align, flags, ctor, dtor)) {
+        if (!kmem_cache_open(s, GFP_KERNEL, name, size, align,
+                                                flags, ctor, dtor)) {
                 kfree(s);
                 return NULL;
         }
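
For readers who want to see the data-structure change in isolation, here is a minimal, self-contained userspace sketch of the pattern the patch introduces: one lock, one partial list and one counter per node, a local-node fast path, and a fallback scan over the remaining nodes. Everything in it is invented for illustration only. struct cache, struct cache_node, struct slab and MAX_NODES are simplified stand-ins for struct kmem_cache, struct kmem_cache_node, struct page and MAX_NUMNODES, a pthread mutex stands in for the kernel spinlock, and a singly linked list replaces struct list_head. It is not the kernel code itself.

/* pernode_sketch.c: illustrative model only, not kernel code */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 4                     /* stand-in for MAX_NUMNODES */

struct slab {                           /* stand-in for struct page */
        struct slab *next;
        int inuse;
};

struct cache_node {                     /* mirrors struct kmem_cache_node */
        pthread_mutex_t list_lock;      /* protects partial and nr_partial */
        unsigned long nr_partial;
        struct slab *partial;
};

struct cache {                          /* mirrors the reworked kmem_cache */
        struct cache_node *node[MAX_NODES];
};

/* Like add_partial(): file the slab on the list of its own node. */
static void add_partial(struct cache *s, struct slab *slab, int nid)
{
        struct cache_node *n = s->node[nid];

        pthread_mutex_lock(&n->list_lock);
        slab->next = n->partial;
        n->partial = slab;
        n->nr_partial++;
        pthread_mutex_unlock(&n->list_lock);
}

/* Like get_partial_node(): racy emptiness check, then locked removal. */
static struct slab *get_partial_node(struct cache_node *n)
{
        struct slab *slab;

        if (!n->nr_partial)             /* racy but safe: caller falls back */
                return NULL;

        pthread_mutex_lock(&n->list_lock);
        slab = n->partial;
        if (slab) {
                n->partial = slab->next;
                n->nr_partial--;
        }
        pthread_mutex_unlock(&n->list_lock);
        return slab;
}

/* Like get_partial()/get_any_partial(): local node first, then the rest. */
static struct slab *get_partial(struct cache *s, int local_nid)
{
        struct slab *slab = get_partial_node(s->node[local_nid]);
        int nid;

        if (slab)
                return slab;
        for (nid = 0; nid < MAX_NODES; nid++) {
                if (nid == local_nid)
                        continue;
                slab = get_partial_node(s->node[nid]);
                if (slab)
                        return slab;
        }
        return NULL;                    /* caller would allocate a new slab */
}

int main(void)
{
        struct cache s;
        struct slab slab = { NULL, 0 };
        int nid;

        for (nid = 0; nid < MAX_NODES; nid++) {
                /* error handling omitted in this sketch */
                s.node[nid] = calloc(1, sizeof(struct cache_node));
                pthread_mutex_init(&s.node[nid]->list_lock, NULL);
        }

        add_partial(&s, &slab, 1);      /* the slab "belongs" to node 1 */
        printf("local miss on node 0, fallback found %p (expected %p)\n",
                (void *)get_partial(&s, 0), (void *)&slab);
        return 0;
}

The motivation is the same as in the patch proper: the single cache-wide list_lock goes away, contention is split across per node locks, and an allocation that cannot be served from the local node's partial list falls back to the other nodes before a new slab has to be allocated.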