SLUB: Place kmem_cache_cpu structures in a NUMA aware way.

The kmem_cache_cpu structures introduced earlier are currently kept as an
array in the kmem_cache struct. This means that on a NUMA system most
kmem_cache_cpu structures end up on the wrong node.

In order to place the kmem_cache_cpu structures optimally we put an array
of pointers to kmem_cache_cpu structs in kmem_cache (similar to SLAB).

The kmem_cache_cpu structures then have to be allocated in an intelligent
way. kmalloc_node is easy to use for them once we have bootstrapped the
kmalloc array. However, during the buildup of the kmalloc cache array we
have no such capability. In order to bootstrap we define per cpu
kmem_cache_cpu structures for each kmalloc size that is supported. We
establish pointers to these kmem_cache_cpu structures (unlike SLAB we
never free the boot structures, since per cpu structures are already
optimally placed). That solves the bootstrap problem.

For the per cpu array we pack the kmem_cache_cpu structures as densely as
possible. There is no danger of cacheline bouncing since they all belong
to the same cpu, so we can fit lots of them into a single cacheline. On
32 bit a kmem_cache_cpu structure is 12 bytes, meaning that 5 fit into one
64 byte cacheline. On 64 bit it grows to 24 bytes, which still allows
almost 3 per cacheline. If we allocate a kmem_cache_cpu structure via
kmalloc, however, there is a danger of cacheline bouncing, so we pad it to
full cacheline size.

Pro:
- NUMA aware placement.
- All global structures become readonly.
- Potential avoidance of an exclusive cacheline fetch since multiple
  kmem_cache_cpu structures are in one cacheline.

Cons:
- Additional reference to one read only cacheline in both slab_alloc
  and slab_free.

Signed-off-by: Christoph Lameter

---
 include/linux/slub_def.h |    9 ++++++---
 mm/slub.c                |   44 ++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 5 deletions(-)
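To illustrate the bootstrap scheme described above, here is a minimal
stand-alone user-space sketch (not part of the patch; NR_CPUS,
KMALLOC_CACHES and the mock_* names are simplified stand-ins for the real
kernel definitions). Each kmalloc cache points its per cpu slot at a
statically reserved boot structure, indexed by the cache's offset in the
kmalloc array, so no allocator is needed while the kmalloc caches are
being set up:

/*
 * User-space model of the bootstrap only; NOT kernel code.
 */
#include <stdio.h>

#define NR_CPUS		4
#define KMALLOC_CACHES	13		/* stand-in for KMALLOC_SHIFT_HIGH + 1 */

struct mock_cpu_slab {			/* stand-in for struct kmem_cache_cpu */
	void *page;
	int objects;
	int node;
};

struct mock_kmem_cache {		/* stand-in for struct kmem_cache (NUMA case) */
	struct mock_cpu_slab *cpu_slab[NR_CPUS];
};

/* Boot structures: one per cpu and kmalloc size, densely packed per cpu. */
static struct mock_cpu_slab boot_cpu[NR_CPUS][KMALLOC_CACHES];

static struct mock_kmem_cache kmalloc_caches[KMALLOC_CACHES];

int main(void)
{
	int i, cpu;

	for (i = 0; i < KMALLOC_CACHES; i++) {
		struct mock_kmem_cache *s = &kmalloc_caches[i];

		/* Index the reserved slot by the cache's offset in the array. */
		for (cpu = 0; cpu < NR_CPUS; cpu++)
			s->cpu_slab[cpu] = &boot_cpu[cpu][s - kmalloc_caches];
	}

	printf("%zu byte structure, %zu per 64 byte cacheline\n",
	       sizeof(struct mock_cpu_slab),
	       (size_t)64 / sizeof(struct mock_cpu_slab));
	return 0;
}

Running it simply prints how many of the mock structures share one 64 byte
line, which is the density the boot-time per cpu array relies on.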
Index: linux-2.6.22-rc4-mm2/include/linux/slub_def.h
===================================================================
--- linux-2.6.22-rc4-mm2.orig/include/linux/slub_def.h	2007-06-15 18:47:30.000000000 -0700
+++ linux-2.6.22-rc4-mm2/include/linux/slub_def.h	2007-06-15 19:13:03.000000000 -0700
@@ -16,8 +16,7 @@ struct kmem_cache_cpu {
 	struct page *page;
 	int objects;		/* Saved page->inuse */
 	int node;
-	/* Lots of wasted space */
-} ____cacheline_aligned_in_smp;
+};
 
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
@@ -55,10 +54,14 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 
 #ifdef CONFIG_NUMA
+#define KMEM_CACHE_CPU_SIZE sizeof(struct kmem_cache_cpu *)
 	int defrag_ratio;
 	struct kmem_cache_node *node[MAX_NUMNODES];
-#endif
+	struct kmem_cache_cpu *cpu_slab[NR_CPUS];
+#else
+#define KMEM_CACHE_CPU_SIZE sizeof(struct kmem_cache_cpu)
 	struct kmem_cache_cpu cpu_slab[NR_CPUS];
+#endif
 };
 
 /*
Index: linux-2.6.22-rc4-mm2/mm/slub.c
===================================================================
--- linux-2.6.22-rc4-mm2.orig/mm/slub.c	2007-06-15 18:47:30.000000000 -0700
+++ linux-2.6.22-rc4-mm2/mm/slub.c	2007-06-15 19:11:37.000000000 -0700
@@ -284,8 +284,11 @@ static inline struct kmem_cache_node *ge
 
 static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
 {
-	/* For SMP its simple */
+#ifdef CONFIG_NUMA
+	return s->cpu_slab[cpu];
+#else
 	return &s->cpu_slab[cpu];
+#endif
 }
 
 static inline int check_valid_pointer(struct kmem_cache *s,
@@ -1909,6 +1912,8 @@ static void init_kmem_cache_node(struct
 }
 
 #ifdef CONFIG_NUMA
+static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_cpu)[KMALLOC_SHIFT_HIGH + 1];
+
 /*
  * No kmalloc_node yet so do it by hand. We know that this is the first
  * slab on the node for this slabcache. There are no concurrent accesses
@@ -1945,9 +1950,19 @@ static struct kmem_cache_node * __init e
 static void free_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;
+	int cpu;
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+		s->cpu_slab[cpu] = NULL;
+		if (c)
+			kfree(c);
+	}
 
 	for_each_online_node(node) {
 		struct kmem_cache_node *n = s->node[node];
+
 		if (n && n != &s->local_node)
 			kmem_cache_free(kmalloc_caches, n);
 		s->node[node] = NULL;
@@ -1957,6 +1972,7 @@ static void free_kmem_cache_nodes(struct
 static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
 {
 	int node;
+	int cpu;
 	int local_node;
 
 	if (slab_state >= UP)
@@ -1987,6 +2003,30 @@ static int init_kmem_cache_nodes(struct
 		s->node[node] = n;
 		init_kmem_cache_node(n);
 	}
+
+	for_each_online_cpu(cpu) {
+		struct kmem_cache_cpu *c;
+
+		if (slab_state < UP)
+			/*
+			 * Special arrangements for the kmalloc caches so
+			 * that per cpu structures of one processor are
+			 * likely to fall into the same cacheline.
+			 */
+			c = &per_cpu(kmalloc_cpu, cpu)[s - kmalloc_caches];
+		else {
+			/* Each kmem_cache_cpu gets its own cacheline. */
+			c = kmalloc_node(ALIGN(sizeof(struct kmem_cache_cpu),
+				cache_line_size()), gfpflags, cpu_to_node(cpu));
+
+			if (!c) {
+				free_kmem_cache_nodes(s);
+				return 0;
+			}
+		}
+		memset(c, 0, sizeof(struct kmem_cache_cpu));
+		s->cpu_slab[cpu] = c;
+	}
 	return 1;
 }
 #else
@@ -2619,7 +2659,7 @@ void __init kmem_cache_init(void)
 #endif
 
 	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
-				nr_cpu_ids * sizeof(struct kmem_cache_cpu);
+				nr_cpu_ids * KMEM_CACHE_CPU_SIZE;
 
 	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d,"
 		" MinObjects=%d, CPUs=%d, Nodes=%d\n",
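For the padded allocation path in init_kmem_cache_nodes, here is a small
user-space sketch of the arithmetic (again not kernel code; ALIGN mirrors
the kernel macro, 64 is an assumed cache_line_size(), and aligned_alloc
merely stands in for the node-local kmalloc): once kmalloc works, each
kmem_cache_cpu is rounded up to a whole cacheline so that structures of
different cpus never share a line, whereas the static boot structures
deliberately do.

/*
 * User-space illustration of the padding only; NOT kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

#define CACHELINE	64
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

struct mock_cpu_slab {		/* stand-in for struct kmem_cache_cpu */
	void *page;
	int objects;
	int node;
};

int main(void)
{
	/* Round the size up so two cpus never share a cacheline. */
	size_t padded = ALIGN(sizeof(struct mock_cpu_slab), CACHELINE);
	struct mock_cpu_slab *c = aligned_alloc(CACHELINE, padded);

	if (!c)
		return 1;
	printf("raw size %zu bytes, padded allocation %zu bytes\n",
	       sizeof(*c), padded);
	free(c);
	return 0;
}

The padding trades a little memory per cpu for the guarantee that a write
to one cpu's structure cannot invalidate the cacheline holding another's.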