SLUB NUMA: Dynamically size kmem_cache depending on the number of nodes

Currently we allocate MAX_NUMNODES array elements for the per node array
in kmem_cache. We cannot size that array dynamically because the cpu array
follows it; only the per cpu array at the end of the structure can be
dynamically sized.

This patch moves the dynamic cpu array in front of the kmem_cache
structure. For small systems this means that the cpu array is in the same
cacheline as the main control information. We keep the pointer to the
kmem_cache structure itself: with the per cpu array in front of it, the
slab for the nth cpu can be found by subtracting 1+cpu pointer slots from
the kmem_cache pointer.

This gives us a dynamically sized per cpu array in front of the kmem_cache
structure and a dynamically sized per node array at the end of it. Since
the cpu array must now be accessed in a special way, introduce two
functions, get_cpu_slab() and set_cpu_slab(), to access the per cpu
information in front of the kmem_cache structure.

Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    8 ++--
 mm/slub.c                |   93 +++++++++++++++++++++++++++++------------------
 2 files changed, 63 insertions(+), 38 deletions(-)

Index: slub/include/linux/slub_def.h
===================================================================
--- slub.orig/include/linux/slub_def.h	2007-05-23 17:05:06.000000000 -0700
+++ slub/include/linux/slub_def.h	2007-05-23 17:06:47.000000000 -0700
@@ -54,7 +54,6 @@ struct kmem_cache {
 	int defrag_ratio;
 	struct kmem_cache_node *node[MAX_NUMNODES];
 #endif
-	struct page *cpu_slab[NR_CPUS];
 };
 
 /*
@@ -68,7 +67,10 @@ struct kmem_cache {
  * We keep the general caches in an array of slab caches that are used for
  * 2^x bytes of allocations.
  */
-extern struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
+extern struct kmalloc_cache {
+	struct page *cpu_slab[NR_CPUS];
+	struct kmem_cache cache;
+} kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
 
 /*
  * Determine the kmalloc array index given the object size.
@@ -120,7 +122,7 @@ static inline struct kmem_cache *kmalloc
 	 * If this triggers then the amount of memory requested was too large.
 	 */
 	BUG_ON(index < 0);
-	return &kmalloc_caches[index];
+	return &kmalloc_caches[index].cache;
 }
 
 #ifdef CONFIG_ZONE_DMA
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-05-23 17:05:06.000000000 -0700
+++ slub/mm/slub.c	2007-05-23 17:06:21.000000000 -0700
@@ -224,7 +224,7 @@ static inline void ClearSlabDebug(struct
 #define cache_line_size() L1_CACHE_BYTES
 #endif
 
-static int kmem_size = sizeof(struct kmem_cache);
+static int kmem_size = sizeof(struct kmalloc_cache);
 
 #ifdef CONFIG_SMP
 static struct notifier_block slab_notifier;
@@ -281,6 +281,16 @@ static inline struct kmem_cache_node *ge
 #endif
 }
 
+static inline struct page *get_cpu_slab(struct kmem_cache *s, int cpu)
+{
+	return ((struct page **)s)[-1-cpu];
+}
+
+static inline void set_cpu_slab(struct kmem_cache *s, int cpu, struct page *page)
+{
+	((struct page **)s)[-1-cpu] = page;
+}
+
 static inline int check_valid_pointer(struct kmem_cache *s,
 				struct page *page, const void *object)
 {
@@ -1349,7 +1359,7 @@ static void deactivate_slab(struct kmem_
 		page->freelist = object;
 		page->inuse--;
 	}
-	s->cpu_slab[cpu] = NULL;
+	set_cpu_slab(s, cpu, NULL);
 	unfreeze_slab(s, page);
 }
 
@@ -1365,7 +1375,7 @@ static void flush_slab(struct kmem_cache
  */
 static void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 {
-	struct page *page = s->cpu_slab[cpu];
+	struct page *page = get_cpu_slab(s, cpu);
 
 	if (likely(page))
 		flush_slab(s, page, cpu);
@@ -1441,14 +1451,15 @@ another_slab:
 new_slab:
 	page = get_partial(s, gfpflags, node);
 	if (page) {
-		s->cpu_slab[cpu] = page;
+		set_cpu_slab(s, cpu, page);
 		goto load_freelist;
 	}
 
 	page = new_slab(s, gfpflags, node);
 	if (page) {
-		cpu = smp_processor_id();
-		if (s->cpu_slab[cpu]) {
+		struct page *cpu_slab = get_cpu_slab(s, smp_processor_id());
+
+		if (cpu_slab) {
 			/*
 			 * Someone else populated the cpu_slab while we
 			 * enabled interrupts, or we have gotten scheduled
@@ -1457,22 +1468,22 @@ new_slab:
 			 * specified. So we need to recheck.
 			 */
 			if (node == -1 ||
-				page_to_nid(s->cpu_slab[cpu]) == node) {
+				page_to_nid(cpu_slab) == node) {
 				/*
 				 * Current cpuslab is acceptable and we
 				 * want the current one since its cache hot
 				 */
 				discard_slab(s, page);
-				page = s->cpu_slab[cpu];
+				page = cpu_slab;
 				slab_lock(page);
 				goto load_freelist;
 			}
 			/* New slab does not fit our expectations */
-			flush_slab(s, s->cpu_slab[cpu], cpu);
+			flush_slab(s, cpu_slab, cpu);
 		}
 		slab_lock(page);
 		SetSlabFrozen(page);
-		s->cpu_slab[cpu] = page;
+		set_cpu_slab(s, cpu, page);
 		goto load_freelist;
 	}
 	return NULL;
@@ -1505,7 +1516,7 @@ static void __always_inline *slab_alloc(
 	unsigned long flags;
 
 	local_irq_save(flags);
-	page = s->cpu_slab[smp_processor_id()];
+	page = get_cpu_slab(s, smp_processor_id());
 	if (unlikely(!page || !page->lockless_freelist ||
 			(node != -1 && page_to_nid(page) != node)))
 
@@ -1609,7 +1620,7 @@ static void __always_inline slab_free(st
 	unsigned long flags;
 
 	local_irq_save(flags);
-	if (likely(page == s->cpu_slab[smp_processor_id()] &&
+	if (likely(page == get_cpu_slab(s, smp_processor_id()) &&
 			!SlabDebug(page))) {
 		object[page->offset] = page->lockless_freelist;
 		page->lockless_freelist = object;
@@ -1821,20 +1832,21 @@ static struct kmem_cache_node * __init e
 {
 	struct page *page;
 	struct kmem_cache_node *n;
+	struct kmem_cache *s = &kmalloc_caches[0].cache;
 
-	BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+	BUG_ON(s->size < sizeof(struct kmem_cache_node));
 
-	page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+	page = new_slab(s, gfpflags | GFP_THISNODE, node);
 	/* new_slab() disables interupts */
 	local_irq_enable();
 
 	BUG_ON(!page);
 	n = page->freelist;
 	BUG_ON(!n);
-	page->freelist = get_freepointer(kmalloc_caches, n);
+	page->freelist = get_freepointer(s, n);
 	page->inuse++;
-	kmalloc_caches->node[node] = n;
-	setup_object_debug(kmalloc_caches, page, n);
+	s->node[node] = n;
+	setup_object_debug(s, page, n);
 	init_kmem_cache_node(n);
 	atomic_long_inc(&n->nr_slabs);
 	add_partial(n, page);
@@ -1848,7 +1860,7 @@ static void free_kmem_cache_nodes(struct
 	for_each_online_node(node) {
 		struct kmem_cache_node *n = s->node[node];
 		if (n && n != &s->local_node)
-			kmem_cache_free(kmalloc_caches, n);
+			kmem_cache_free(&kmalloc_caches[0].cache, n);
 		s->node[node] = NULL;
 	}
 }
@@ -1874,7 +1886,7 @@ static int init_kmem_cache_nodes(struct
 							node);
 			continue;
 		}
-		n = kmem_cache_alloc_node(kmalloc_caches,
+		n = kmem_cache_alloc_node(&kmalloc_caches[0].cache,
 						gfpflags, node);
 
 		if (!n) {
@@ -2139,11 +2151,15 @@ void kmem_cache_destroy(struct kmem_cach
 	down_write(&slub_lock);
 	s->refcount--;
 	if (!s->refcount) {
+		void *v = s;
+
+		v -= nr_cpu_ids * sizeof(struct kmem_cache_node *);
+
 		list_del(&s->list);
 		if (kmem_cache_close(s))
 			WARN_ON(1);
 		sysfs_slab_remove(s);
-		kfree(s);
+		kfree(v);
 	}
 	up_write(&slub_lock);
 }
@@ -2153,7 +2169,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 *		Kmalloc subsystem
 *******************************************************************/
 
-struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
+struct kmalloc_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
 
 #ifdef CONFIG_ZONE_DMA
@@ -2240,7 +2256,8 @@ static struct kmem_cache *get_slab(size_
 			return s;
 
 		/* Dynamically create dma cache */
-		x = kmalloc(kmem_size, flags & ~SLUB_DMA);
+		x = kmalloc(kmem_size, flags & ~SLUB_DMA) +
+			nr_cpu_ids * sizeof(struct kmem_cache *);
 		if (!x)
 			panic("Unable to allocate memory for dma cache\n");
 
@@ -2260,7 +2277,7 @@ static struct kmem_cache *get_slab(size_
 		return s;
 	}
 #endif
-	return &kmalloc_caches[index];
+	return &kmalloc_caches[index].cache;
 }
 
 void *__kmalloc(size_t size, gfp_t flags)
@@ -2470,7 +2487,7 @@ void __init kmem_cache_init(void)
 	 * struct kmem_cache_node's. There is special bootstrap code in
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
-	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
+	create_kmalloc_cache(&kmalloc_caches[0].cache, "kmem_cache_node",
 		sizeof(struct kmem_cache_node), GFP_KERNEL);
 #endif
 
@@ -2478,28 +2495,31 @@ void __init kmem_cache_init(void)
 	slab_state = PARTIAL;
 
 	/* Caches that are not of the two-to-the-power-of size */
-	create_kmalloc_cache(&kmalloc_caches[1],
+	create_kmalloc_cache(&kmalloc_caches[1].cache,
 				"kmalloc-96", 96, GFP_KERNEL);
-	create_kmalloc_cache(&kmalloc_caches[2],
+	create_kmalloc_cache(&kmalloc_caches[2].cache,
 				"kmalloc-192", 192, GFP_KERNEL);
 
 	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
-		create_kmalloc_cache(&kmalloc_caches[i],
+		create_kmalloc_cache(&kmalloc_caches[i].cache,
 			"kmalloc", 1 << i, GFP_KERNEL);
 
 	slab_state = UP;
 
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++)
-		kmalloc_caches[i]. name =
+		kmalloc_caches[i].cache.name =
 			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
 #endif
 
-	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
-			nr_cpu_ids * sizeof(struct page *);
+	kmem_size = offsetof(struct kmem_cache, node) +
+#ifdef CONFIG_NUMA
+			nr_node_ids * sizeof(struct kmem_cache_node *) +
+#endif
+			nr_cpu_ids * sizeof(struct page *);
 
 	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
 		" Processors=%d, Nodes=%d\n",
@@ -2596,17 +2616,20 @@ struct kmem_cache *kmem_cache_create(con
 		if (sysfs_slab_alias(s, name))
 			goto err;
 	} else {
-		s = kmalloc(kmem_size, GFP_KERNEL);
+		void *v = kmalloc(kmem_size, GFP_KERNEL);
+
+		s = v + nr_cpu_ids * sizeof(struct kmem_cache_node *);
+
 		if (s && kmem_cache_open(s, GFP_KERNEL, name,
 				size, align, flags, ctor, ops)) {
 			if (sysfs_slab_add(s)) {
-				kfree(s);
+				kfree(v);
 				goto err;
 			}
 			list_add(&s->list, &slab_caches);
 			raise_kswapd_order(s->order);
 		} else
-			kfree(s);
+			kfree(v);
 	}
 	up_write(&slub_lock);
 	return s;
@@ -3109,7 +3132,7 @@ static unsigned long slab_objects(struct
 	per_cpu = nodes + nr_node_ids;
 
 	for_each_possible_cpu(cpu) {
-		struct page *page = s->cpu_slab[cpu];
+		struct page *page = get_cpu_slab(s, cpu);
 		int node;
 
 		if (page) {
@@ -3171,7 +3194,7 @@ static int any_slab_objects(struct kmem_
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		if (s->cpu_slab[cpu])
+		if (get_cpu_slab(s, cpu))
 			return 1;
 
 	for_each_node(node) {
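
For readers less familiar with the layout trick described above, here is a
minimal, self-contained userspace sketch (illustration only, not part of the
patch; the names fake_page, demo_cache, demo_create and demo_destroy are made
up for the example) of how a dynamically sized per-cpu pointer array placed
directly in front of a control structure can be reached by indexing backwards
from the structure pointer, just as get_cpu_slab()/set_cpu_slab() do with
((struct page **)s)[-1-cpu]:

#include <stdio.h>
#include <stdlib.h>

struct fake_page { int id; };		/* stand-in for struct page */

struct demo_cache {			/* stand-in for struct kmem_cache */
	const char *name;
};

/* Memory layout: [cpu nr_cpus-1] ... [cpu 1] [cpu 0] [struct demo_cache] */
static struct demo_cache *demo_create(const char *name, int nr_cpus)
{
	void *v = calloc(1, nr_cpus * sizeof(struct fake_page *) +
				sizeof(struct demo_cache));
	struct demo_cache *s;

	if (!v)
		return NULL;
	/* The control structure sits right after the per-cpu pointer array. */
	s = (struct demo_cache *)((struct fake_page **)v + nr_cpus);
	s->name = name;
	return s;
}

/* cpu 0 lives at index -1, cpu 1 at index -2, and so on. */
static struct fake_page *get_cpu_slab(struct demo_cache *s, int cpu)
{
	return ((struct fake_page **)s)[-1 - cpu];
}

static void set_cpu_slab(struct demo_cache *s, int cpu, struct fake_page *page)
{
	((struct fake_page **)s)[-1 - cpu] = page;
}

static void demo_destroy(struct demo_cache *s, int nr_cpus)
{
	/* Step back over the per-cpu array to recover the original allocation. */
	free((struct fake_page **)s - nr_cpus);
}

int main(void)
{
	int nr_cpus = 4;
	struct demo_cache *s = demo_create("demo", nr_cpus);
	struct fake_page p = { 42 };

	if (!s)
		return 1;
	set_cpu_slab(s, 2, &p);
	printf("%s: slab on cpu 2 has id %d\n", s->name, get_cpu_slab(s, 2)->id);
	demo_destroy(s, nr_cpus);
	return 0;
}

The same arithmetic is why kmem_cache_create() and kmem_cache_destroy() in the
patch offset the pointer returned by kmalloc() by nr_cpu_ids pointer-sized
slots before using or freeing it.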