---
 include/linux/cpu_alloc.h |   25 ----------------------
 mm/cpu_alloc.c            |   51 +++++++++++++++++++++-------------------------
 2 files changed, 24 insertions(+), 52 deletions(-)

Index: linux-2.6/include/linux/cpu_alloc.h
===================================================================
--- linux-2.6.orig/include/linux/cpu_alloc.h	2007-11-02 21:58:09.000000000 -0700
+++ linux-2.6/include/linux/cpu_alloc.h	2007-11-02 21:59:58.000000000 -0700
@@ -43,7 +43,6 @@
 			__alignof__(type))
 #define CPU_FREE(pointer)	cpu_free(pointer, sizeof(*(pointer)))
 
-#ifdef CONFIG_SMP
 /*
  * Raw calls
  */
@@ -82,10 +81,8 @@ extern u8 cpu_area[];
  * THIS_CPU_OFFSET constant that may work more efficiently than this one.
  * (Although this definition is typically very effective).
  */
-#ifndef THIS_CPU_OFFSET
 #define THIS_CPU_OFFSET ((unsigned long)smp_processor_id() \
 				<< CPU_AREA_SHIFT)
-#endif
 
 #define CPU_PTR(__p, __cpu)	((__typeof__(__p))((void *)(__p) + \
 			((unsigned long)(__cpu) << CPU_AREA_SHIFT)))
@@ -94,26 +91,4 @@ extern u8 cpu_area[];
 #define THIS_CPU_PTR(__p)	\
 	((__typeof__(__p))((void *)(__p) + THIS_CPU_OFFSET))
 
-#else /* !SMP */
-
-/*
- * Fallback for the single processor case.
- *
- * This is unsafe to use before slab allocator bootstrap!
- */
-static inline void *cpu_alloc(unsigned long size, gfp_t flags,
-				unsigned long align)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void cpu_free(void *cpu_pointer, unsigned long size)
-{
-	kfree(cpu_pointer);
-}
-
-#define CPU_PTR(__p, __cpu)	(__p)
-#define THIS_CPU_PTR(__p)	(__p)
-#endif
-
 #endif /* _LINUX_CPU_ALLOC_H_ */

Index: linux-2.6/mm/cpu_alloc.c
===================================================================
--- linux-2.6.orig/mm/cpu_alloc.c	2007-11-02 21:58:09.000000000 -0700
+++ linux-2.6/mm/cpu_alloc.c	2007-11-02 22:00:48.000000000 -0700
@@ -7,6 +7,10 @@
  *
  * The per cpu allocator allows dynamic allocation of memory on all
  * processor simultaneously. A bitmap is used to track used areas.
+ * The allocator implements tight packing to reduce cache footprint
+ * and increase speed, since cacheline contention is typically not a concern
+ * for memory mainly used by a single cpu. Small objects will fill up gaps
+ * left by larger allocations that required alignment.
  */
 #include
 #include
@@ -22,9 +26,10 @@
  * CPU_AREA_BLOCK shift is the units in which the cpu areas are extended.
  * Setting it to PAGE_SHIFT allows increasing the per cpu areas in
  * PAGE_SIZE steps (which may be too small for large systems).
+ * The default is 64k if not specified.
  */
 #ifndef CPU_AREA_BLOCK_SHIFT
-#define CPU_AREA_BLOCK_SHIFT PAGE_SHIFT
+#define CPU_AREA_BLOCK_SHIFT 16
 #endif
 
 #ifdef CPU_AREA_BASE
@@ -39,11 +44,11 @@
 /*
  * No base specified. Fall back to a static configuration of the cpu
  * allocator. The cpu areas are of a fixed size. Such configurations
- * are mainly usedful for SMP on machines that do not have MMU support.
- * But it is also satisfactory for basic page and slab allocator
- * needs if the arch code has not yet provided an address for the cpu area.
+ * are mainly useful for SMP on machines that do not have MMU support.
  */
-u8 cpu_area[NR_CPUS][PAGE_SIZE];
+
+u8 cpu_area[NR_CPUS][1 << CPU_AREA_BLOCK_SHIFT];
+
 #define CPU_AREA_STATIC
 #define CPU_AREA_BASE cpu_area
 #define MAX_BLOCKS 1
@@ -116,6 +121,7 @@ static inline int expand_cpu_area(gfp_t
 	return -ENOSYS;
 }
 #else
+
 /*
  * Allocate a block of memory to be used to provide cpu area memory
  * or to extend the bitmap for the cpu map.
@@ -123,18 +129,11 @@ static inline int expand_cpu_area(gfp_t
  */
 void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node)
 {
-	extern int after_bootmem;
-
-	/* If the main allocator is up use that, fallback to bootmem. */
-	if (after_bootmem) {
-		struct page *page = alloc_pages_node(node,
-			flags | __GFP_ZERO, get_order(size));
-		if (page)
-			return page_address(page);
-		return NULL;
-	}
-	return __alloc_bootmem_node(NODE_DATA(node), size, size,
-			__pa(MAX_DMA_ADDRESS));
+	struct page *page = alloc_pages_node(node,
+		flags | __GFP_ZERO, get_order(size));
+	if (page)
+		return page_address(page);
+	return NULL;
 }
 
 pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
@@ -231,27 +230,25 @@ int cpu_area_populate(void *start, unsig
 static int expand_cpu_area(gfp_t flags)
 {
 	unsigned long blocks = active_blocks;
+	unsigned long bits;
 	int cpu;
 	int err = -ENOMEM;
 	int map_order;
 	unsigned long *new_map = NULL;
+	void *start;
 
 	if (active_blocks == MAX_BLOCKS)
 		goto out;
 
+	spin_unlock(&cpu_alloc_map_lock);
 	/*
 	 * Determine the size of the bit map needed
 	 */
-	map_order = get_order((blocks + 1) * UNITS_PER_BLOCK) /
-			(BITS_PER_LONG * sizeof(unsigned long));
-//	map_order = get_order(BITS_TO_LONGS((active_blocks + 1)
-//			* UNITS_PER_BLOCK) * sizeof(unsigned long));
-	spin_unlock(&cpu_alloc_map_lock);
+	bits = (blocks + 1) * UNITS_PER_BLOCK;
+	map_order = get_order(DIV_ROUND_UP(bits, 8));
+	start = (void *)CPU_AREA_BASE + (blocks << CPU_AREA_BLOCK_SHIFT);
 
 	for_each_possible_cpu(cpu) {
-		void *start = (void *)CPU_AREA_BASE +
-			(blocks << CPU_AREA_BLOCK_SHIFT);
-
 		err = cpu_area_populate(CPU_PTR(start, cpu), BLOCK_SIZE,
 			flags, cpu_to_node(cpu));
@@ -267,8 +264,8 @@ static int expand_cpu_area(gfp_t flags)
 		if (!new_map)
 			goto out;
 	}
-	spin_lock(&cpu_alloc_map_lock);
 
+	spin_lock(&cpu_alloc_map_lock);
 	/*
 	 * We dropped the lock. Another processor may have already extended
 	 * the cpu area size as needed.
@@ -276,7 +273,7 @@ static int expand_cpu_area(gfp_t flags)
 	if (blocks != active_blocks) {
 		if (new_map)
 			free_pages((unsigned long)new_map,
-				cpu_alloc_map_order);
+				map_order);
 		err = 0;
 		goto out;
 	}
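
As an aside for reviewers unfamiliar with the scheme described in the
cpu_alloc.c header comment: the sketch below is a minimal userspace model,
not the kernel code, of first-fit allocation from a bitmap with tight
packing. Aligned requests skip ahead to the next aligned unit, and later
small requests can land in the gaps that skipping leaves behind. The map
representation, UNITS and alloc_units() are illustrative assumptions only.

#include <stdio.h>
#include <string.h>

#define UNITS	64			/* illustrative map size, one flag per unit */

static unsigned char map[UNITS];	/* 0 = free, 1 = used (the kernel uses a real bitmap) */

/* Find 'units' consecutive free units starting at a multiple of 'align_units'. */
static int alloc_units(int units, int align_units)
{
	for (int start = 0; start + units <= UNITS; start += align_units) {
		int i;

		for (i = 0; i < units; i++)
			if (map[start + i])
				break;
		if (i == units) {
			memset(map + start, 1, units);
			return start;	/* unit offset within the cpu area */
		}
	}
	return -1;			/* the kernel would call expand_cpu_area() here */
}

int main(void)
{
	/* Large aligned allocations followed by a small one filling a hole. */
	printf("16 units, align 16 -> unit %d\n", alloc_units(16, 16));
	printf(" 3 units, align  4 -> unit %d\n", alloc_units(3, 4));
	printf(" 8 units, align  8 -> unit %d\n", alloc_units(8, 8));
	printf(" 1 unit,  align  1 -> unit %d\n", alloc_units(1, 1));
	return 0;
}

Running it shows the final 1-unit request landing at unit 19, in the gap
between the 3-unit allocation and the 8-unit aligned allocation at unit 24,
which is the packing behaviour the new comment describes.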
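
The bitmap sizing fix in expand_cpu_area() can be checked in isolation: the
map needs one bit per allocation unit over all blocks including the one
about to be added, rounded up to bytes and then to a page allocation order,
whereas the replaced expression divided a page order by a bit count. The
sketch below recomputes map_order that way in userspace; PAGE_SIZE,
UNITS_PER_BLOCK and the local get_order()/DIV_ROUND_UP() helpers are
stand-ins assumed for illustration, not the kernel definitions.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define UNITS_PER_BLOCK	((1UL << 16) / 8)	/* assumed: 64k block, 8-byte units */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Smallest order such that (PAGE_SIZE << order) >= size, like the kernel helper. */
static int get_order(unsigned long size)
{
	unsigned long pages = DIV_ROUND_UP(size, PAGE_SIZE);
	int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	for (unsigned long blocks = 0; blocks < 4; blocks++) {
		/* One bit per allocation unit, covering the block to be added. */
		unsigned long bits = (blocks + 1) * UNITS_PER_BLOCK;
		/* Bits -> bytes -> page allocation order, as in the patch. */
		int map_order = get_order(DIV_ROUND_UP(bits, 8));

		printf("blocks=%lu bits=%lu map_order=%d\n",
		       blocks, bits, map_order);
	}
	return 0;
}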