Index: linux-2.6.20-mm2/include/linux/slub_def.h
===================================================================
--- linux-2.6.20-mm2.orig/include/linux/slub_def.h	2007-02-23 06:02:04.000000000 -0800
+++ linux-2.6.20-mm2/include/linux/slub_def.h	2007-02-23 06:13:35.000000000 -0800
@@ -10,16 +10,19 @@
 #include
 #include
 
+struct kmem_cache_node {
+	spinlock_t list_lock;	/* Protect partial list and nr_partial */
+	unsigned long nr_partial;
+	struct list_head partial;
+	atomic_long_t nr_slabs;
+};
+
 /*
  * Slab cache management.
  */
 struct kmem_cache {
-	spinlock_t list_lock;	/* Protecty partial list and nr_partial */
-	struct list_head partial;
-	unsigned long nr_partial;
 	int offset;		/* Free pointer offset. */
 	struct page *cpu_slab[NR_CPUS];
-	atomic_long_t nr_slabs[MAX_NUMNODES];
 	unsigned int order;
 	unsigned long flags;
 	int size;		/* Total size of an object */
@@ -38,6 +41,8 @@ struct kmem_cache {
 	atomic_t cpu_slabs;	/* if >0 then flusher is scheduled */
 	struct delayed_work flush;
 #endif
+	struct kmem_cache_node *node[MAX_NUMNODES];
+	struct kmem_cache_node local_node_info;
 };
 
 /*
@@ -119,6 +124,7 @@ static inline struct kmem_cache *kmalloc
 #ifdef CONFIG_ZONE_DMA
 #define SLUB_DMA __GFP_DMA
 #else
+/* Disable DMA functionality */
 #define SLUB_DMA 0
 #endif
 
Index: linux-2.6.20-mm2/mm/slub.c
===================================================================
--- linux-2.6.20-mm2.orig/mm/slub.c	2007-02-23 06:12:55.000000000 -0800
+++ linux-2.6.20-mm2/mm/slub.c	2007-02-23 06:13:35.000000000 -0800
@@ -30,6 +30,8 @@
 #include
 #include
 #include
+#include <linux/cpuset.h>
+#include <linux/mempolicy.h>
 
 /*
  * Overloading of page flags that are otherwise used for LRU management.
@@ -102,6 +104,20 @@ static void unregister_slab(struct kmem_
 static struct notifier_block slab_notifier;
 #endif
 
+#ifdef CONFIG_NUMA
+struct kmem_cache kmem_cache_node = {
+	.flags = SLAB_PANIC,
+	.size = sizeof(struct kmem_cache_node),
+	.objects = PAGE_SIZE / sizeof(struct kmem_cache_node),
+	.objsize = sizeof(struct kmem_cache_node),
+	.inuse = sizeof(struct kmem_cache_node),
+	.name = "kmem_cache_node",
+	.node = { &local_node_info, &local_node_info .... },
+	.local_node_info = { SPIN_LOCK_INIT(), 0, INIT_LIST_HEAD(), ATOMIC_INIT(0) }
+};
+#endif
+
+
 /********************************************************************
  *			Core slab cache functions
  *******************************************************************/
@@ -243,19 +259,25 @@ static __always_inline int slab_trylock(
  */
 static void __always_inline add_partial(struct kmem_cache *s, struct page *page)
 {
-	spin_lock(&s->list_lock);
-	s->nr_partial++;
-	list_add_tail(&page->lru, &s->partial);
-	spin_unlock(&s->list_lock);
+	int node = page_to_nid(page);
+	struct kmem_cache_node *n = s->node[node];
+
+	spin_lock(&n->list_lock);
+	n->nr_partial++;
+	list_add_tail(&page->lru, &n->partial);
+	spin_unlock(&n->list_lock);
 }
 
 static void __always_inline remove_partial(struct kmem_cache *s,
						struct page *page)
 {
-	spin_lock(&s->list_lock);
+	int node = page_to_nid(page);
+	struct kmem_cache_node *n = s->node[node];
+
+	spin_lock(&n->list_lock);
 	list_del(&page->lru);
-	s->nr_partial--;
-	spin_unlock(&s->list_lock);
+	n->nr_partial--;
+	spin_unlock(&n->list_lock);
 }
 
 /*
@@ -263,78 +285,83 @@ static void __always_inline remove_parti
  *
  * Must hold list_lock
  */
-static __always_inline int lock_and_del_slab(struct kmem_cache *s,
+static __always_inline int lock_and_del_slab(struct kmem_cache_node *n,
						struct page *page)
 {
 	if (slab_trylock(page)) {
 		list_del(&page->lru);
-		s->nr_partial--;
+		n->nr_partial--;
 		return 1;
 	}
 	return 0;
 }
 
 /*
- * Get a partial page, lock it and return it.
+ * Try to get a partial slab from the indicated node
 */
-#ifdef CONFIG_NUMA
-static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *get_partial_node(struct kmem_cache_node *n)
 {
 	struct page *page;
-	int searchnode = (node == -1) ? numa_node_id() : node;
-
-	if (!s->nr_partial)
-		return NULL;
-	spin_lock(&s->list_lock);
 	/*
-	 * Search for slab on the right node
+	 * Racy check. If we mistakenly see no partial slabs then we
+	 * just allocate an empty slab. If we mistakenly try to get a
+	 * partial slab then get_partials() will return NULL.
 	 */
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(page_to_nid(page) == searchnode) &&
-			lock_and_del_slab(s, page))
-			goto out;
-
-	if (likely(!(flags & __GFP_THISNODE))) {
-		/*
-		 * We can fall back to any other node in order to
-		 * reduce the size of the partial list.
-		 */
-		list_for_each_entry(page, &s->partial, lru)
-			if (likely(lock_and_del_slab(s, page)))
-				goto out;
-	}
+	if (!n->nr_partial)
+		return NULL;
 
-	/* Nothing found */
+	spin_lock(&n->list_lock);
+	list_for_each_entry(page, &n->partial, lru)
+		if (lock_and_del_slab(n, page))
+			goto out;
 	page = NULL;
 out:
-	spin_unlock(&s->list_lock);
+	spin_unlock(&n->list_lock);
 	return page;
 }
-#else
+
+struct page *get_any_partial(struct kmem_cache *s, int node, gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+	struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy))
+					->node_zonelists[gfp_zone(flags)];
+	struct zone **z;
+	struct page *page;
+	int nid;
+
+	/*
+	 * Look through allowed nodes for objects available
+	 * from existing per node queues.
+	 */
+	for (z = zonelist->zones; *z; z++) {
+		nid = zone_to_nid(*z);
+
+		if (cpuset_zone_allowed_hardwall(*z, flags) &&
+				s->node[nid]) {
+			page = get_partial_node(s->node[nid]);
+			if (page)
+				return page;
		}
+	}
+#endif
+	return NULL;
+}
+
+/*
+ * Get a partial page, lock it and return it.
+ */
 static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
+	int searchnode = (node == -1) ? numa_node_id() : node;
 
-	/*
-	 * Racy check. If we mistakenly see no partial slabs then we
-	 * just allocate an empty slab.
-	 */
-	if (!s->nr_partial)
-		return NULL;
-
-	spin_lock(&s->list_lock);
-	list_for_each_entry(page, &s->partial, lru)
-		if (likely(lock_and_del_slab(s, page)))
-			goto out;
+	page = get_partial_node(s->node[searchnode]);
+	if (page || (flags & __GFP_THISNODE))
+		return page;
 
-	/* No slab or all slabs busy */
-	page = NULL;
-out:
-	spin_unlock(&s->list_lock);
-	return page;
+	return get_any_partial(s, node, flags);
 }
-#endif
 
 /*
  * Debugging checks
@@ -425,7 +452,10 @@ void check_free_chain(struct kmem_cache
 
 static void discard_slab(struct kmem_cache *s, struct page *page)
 {
-	atomic_long_dec(&s->nr_slabs[page_to_nid(page)]);
+	int node = page_to_nid(page);
+	struct kmem_cache_node *n = s->node[node];
+
+	atomic_long_dec(&n->nr_slabs);
 
 	page->mapping = NULL;
 	reset_page_mapcount(page);
@@ -438,6 +468,7 @@
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 {
 	struct page *page;
+	struct kmem_cache_node *n;
 
 	BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK | __GFP_NO_GROW));
 	if (flags & __GFP_NO_GROW)
@@ -450,11 +481,14 @@ static struct page *new_slab(struct kmem
 	if (!page)
 		goto out;
 
+	node = page_to_nid(page);
+	n = s->node[node];
+
 	page->offset = s->offset;
-	atomic_long_inc(&s->nr_slabs[page_to_nid(page)]);
+	atomic_long_inc(&n->nr_slabs);
 
-	page->slab = (struct kmem_cache *)s;
+	page->slab = s;
 	__SetPageSlab(page);
 
 	if (s->objects > 1) {
@@ -886,16 +920,28 @@ int kmem_cache_open(struct kmem_cache *s
 {
 	int cpu;
 	int node;
+	int local_node = page_to_nid(virt_to_page(s));
 
 	BUG_ON(flags & SLUB_UNIMPLEMENTED);
 	memset(s, 0, sizeof(struct kmem_cache));
-	for_each_node(node)
-		atomic_long_set(&s->nr_slabs[node], 0);
+	s->node[local_node] = &s->local_node_info;
+
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = s->node[node];
+
+		if (node != local_node)
+			n = s->node[node] = kmem_cache_alloc_node(
+						&kmem_cache_node,
+						GFP_KERNEL,
+						node);
+		atomic_long_set(&n->nr_slabs, 0);
+		spin_lock_init(&n->list_lock);
+		n->nr_partial = 0;
+		INIT_LIST_HEAD(&n->partial);
+	}
 	atomic_set(&s->refcount, 1);
-	spin_lock_init(&s->list_lock);
 	for_each_possible_cpu(cpu)
 		s->cpu_slab[cpu] = NULL;
-	INIT_LIST_HEAD(&s->partial);
 #ifdef CONFIG_SMP
 	mutex_init(&s->flushing);
 	atomic_set(&s->cpu_slabs, 0);
@@ -1003,20 +1049,21 @@ const char *kmem_cache_name(struct kmem_
 }
 EXPORT_SYMBOL(kmem_cache_name);
 
-static int free_list(struct kmem_cache *s, struct list_head *list)
+static int free_list(struct kmem_cache *s, struct kmem_cache_node *n,
+			struct list_head *list)
 {
 	int slabs_inuse = 0;
 	unsigned long flags;
 	struct page *page, *h;
 
-	spin_lock_irqsave(&s->list_lock, flags);
+	spin_lock_irqsave(&n->list_lock, flags);
 	list_for_each_entry_safe(page, h, list, lru)
 		if (!page->inuse) {
 			list_del(&page->lru);
 			discard_slab(s, page);
 		} else
 			slabs_inuse++;
-	spin_unlock_irqrestore(&s->list_lock, flags);
+	spin_unlock_irqrestore(&n->list_lock, flags);
 	return slabs_inuse;
 }
 
@@ -1027,17 +1074,29 @@ static int free_list(struct kmem_cache *
 int kmem_cache_close(struct kmem_cache *s)
 {
 	int node;
+	int local_node = page_to_nid(virt_to_page(s));
 
 	if (!atomic_dec_and_test(&s->refcount))
 		return 0;
 
 	flush_all(s);
-	free_list(s, &s->partial);
-	for_each_online_node(node)
-		if (atomic_long_read(&s->nr_slabs[node]))
+	/* Attempt to free all objects */
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = s->node[node];
+
+		free_list(s, n, &n->partial);
+		if (atomic_long_read(&n->nr_slabs))
 			return 1;
+	}
+
+	/* Free allocated metadata */
+	for_each_online_node(node) {
+		if (node != local_node)
+			kfree(s->node[node]);
+		s->node[node] = NULL;
+	}
 
 	unregister_slab(s);
 	return 0;
 }
@@ -1056,18 +1115,23 @@
 EXPORT_SYMBOL(kmem_cache_destroy);
 
 static unsigned long count_objects(struct kmem_cache *s,
-			struct list_head *list, unsigned long *nodes)
+			unsigned long *nodes)
 {
 	int count = 0;
 	struct page *page;
 	unsigned long flags;
+	int node;
 
-	spin_lock_irqsave(&s->list_lock, flags);
-	list_for_each_entry(page, list, lru) {
-		count += page->inuse;
-		nodes[page_to_nid(page)]++;
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = s->node[node];
+
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each_entry(page, &n->partial, lru) {
+			count += page->inuse;
+			nodes[node]++;
+		}
+		spin_unlock_irqrestore(&n->list_lock, flags);
 	}
-	spin_unlock_irqrestore(&s->list_lock, flags);
 	return count;
 }
 
@@ -1075,16 +1139,20 @@ static unsigned long slab_objects(struct
 	unsigned long *p_total, unsigned long *p_cpu_slabs,
 	unsigned long *p_partial, unsigned long *nodes)
 {
-	int in_partial_slabs = count_objects(s, &s->partial, nodes);
+	int partial = 0;
+	int in_partial_slabs = count_objects(s, nodes);
 	int nr_slabs = 0;
 	int cpu_slabs = 0;
 	int nr_in_cpu_slabs = 0;
 	int cpu;
 	int node;
 
-	for_each_online_node(node)
-		nr_slabs += nodes[node] = atomic_read(&s->nr_slabs[node]);
+	for_each_online_node(node) {
+		struct kmem_cache_node *n = s->node[node];
+		nr_slabs += nodes[node] = atomic_long_read(&n->nr_slabs);
+		partial += n->nr_partial;
+	}
 
 	for_each_possible_cpu(cpu) {
 		struct page *page = s->cpu_slab[cpu];
 
@@ -1096,7 +1164,7 @@ static unsigned long slab_objects(struct
 	}
 
 	if (p_partial)
-		*p_partial = s->nr_partial;
+		*p_partial = partial;
 
 	if (p_cpu_slabs)
 		*p_cpu_slabs = cpu_slabs;
@@ -1105,7 +1173,7 @@ static unsigned long slab_objects(struct
 		*p_total = nr_slabs;
 
 	return in_partial_slabs + nr_in_cpu_slabs +
-		(nr_slabs - s->nr_partial - cpu_slabs) * s->objects;
+		(nr_slabs - partial - cpu_slabs) * s->objects;
 }
 
 /********************************************************************
@@ -1233,14 +1301,20 @@ void __init kmem_cache_init(void)
 {
 	int i;
 
+#ifdef CONFIG_NUMA
+	kmem_cache_open(&kmem_cache_node, "kmem_cache_node",
+		sizeof(struct kmem_cache_node),
+		ARCH_KMALLOC_MINALIGN, SLAB_PANIC, NULL, NULL);
+#endif
+
+	slab_state = PARTIAL;
 	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
 		create_kmalloc_cache(
 			&kmalloc_caches[i - KMALLOC_SHIFT_LOW],
			"kmalloc", 1 << i);
 	}
 
-#ifdef KMALLOC_EXTRA
-	slab_state = PARTIAL;
+#ifdef KMALLOC_EXTRA
 	/* Caches that are not of the two-to-the-power-of size */
 	create_kmalloc_cache(&kmalloc_caches
 		[KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1],