From 54ad4bb212829915338d4839754c202e3f64ff89 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 6 Nov 2007 11:33:49 -0800 Subject: [PATCH] cpu alloc: Simple version of the allocator (static allocations) The core portion of the cpu allocator. The per cpu allocator allows dynamic allocation of memory on all processors simultaneously. A bitmap is used to track used areas. The allocator implements tight packing to reduce the cache footprint and increase speed since cacheline contention is typically not a concern for memory mainly used by a single cpu. Small objects will fill up gaps left by larger allocations that required alignments. This is a limited version of the cpu allocator that only performs a static allocation of a single page for each processor. This is enough for the use of the cpu allocator in the slab and page allocator for most of the common configurations. The configuration will be useful for embedded systems to reduce memory requirements. However, there is a hard limit on the size of the per cpu structures and so the default configuration of an order 0 allocation can only support up to 150 slab caches (most systems that I have use 70) and probably not more than 16 or so NUMA nodes. The size of the statically configured area can be changed via make menuconfig etc. The cpu allocator virtualization patch is needed in order to support dynamically extendable per cpu areas. V1->V2: - Split off the dynamically extendable cpu area feature to make it clear that it exists. - Remove useless variables. - Add boot_cpu_alloc for boot time cpu area reservations (allows the folding in of per cpu areas and other arch specific per cpu stuff during boot). 
Signed-off-by: Christoph Lameter --- include/linux/percpu.h | 55 +++++++++++ include/linux/vmstat.h | 2 init/main.c | 2 mm/Makefile | 2 mm/cpu_alloc.c | 237 +++++++++++++++++++++++++++++++++++++++++++++++++ mm/vmstat.c | 1 6 files changed, 297 insertions(+), 2 deletions(-) create mode 100644 include/linux/cpu_alloc.h create mode 100644 mm/cpu_alloc.c Index: linux-2.6/include/linux/vmstat.h =================================================================== --- linux-2.6.orig/include/linux/vmstat.h 2008-01-28 18:56:39.000000000 -0800 +++ linux-2.6/include/linux/vmstat.h 2008-01-28 19:05:09.000000000 -0800 @@ -36,7 +36,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, - PAGEOUTRUN, ALLOCSTALL, PGROTATED, + PAGEOUTRUN, ALLOCSTALL, PGROTATED, CPU_BYTES, NR_VM_EVENT_ITEMS }; Index: linux-2.6/mm/Makefile =================================================================== --- linux-2.6.orig/mm/Makefile 2008-01-28 18:56:39.000000000 -0800 +++ linux-2.6/mm/Makefile 2008-01-28 19:05:09.000000000 -0800 @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o cpu_alloc.o $(mmu-y) obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o Index: linux-2.6/mm/cpu_alloc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/mm/cpu_alloc.c 2008-01-28 20:12:25.000000000 -0800 @@ -0,0 +1,237 @@ +/* + * Cpu allocator - Manage objects allocated for each processor + * + * (C) 2007-2008 SGI, Christoph Lameter + * Basic implementation with allocation and free from a dedicated per + * cpu area. 
+ *
+ * The per cpu allocator allows dynamic allocation of memory on all
+ * processors simultaneously. A bitmap is used to track used areas.
+ * The allocator implements tight packing to reduce the cache footprint
+ * and increase speed since cacheline contention is typically not a concern
+ * for memory mainly used by a single cpu. Small objects will fill up gaps
+ * left by larger allocations that required alignments.
+ *
+ * Memory layout of the per cpu area (my_cpu_offset may be a register
+ * or be obtained via __per_cpu_offset[smp_processor_id()]):
+ *
+ * Runtime address			Load address
+ *-----------------------------------------------------------------------
+ * my_cpu_offset + 0			__per_cpu_start (__per_cpu_load)
+ *
+ *	Static per cpu definitions (DEFINE_PER_CPU)
+ *	This is a loader section that is copied
+ *	during startup to the per cpu area of each processor.
+ *
+ * my_cpu_offset + __per_cpu_size	__per_cpu_end
+ *
+ *	Boot time allocations (boot_cpu_alloc())
+ *
+ * my_cpu_offset + reserve		(computed in cpu_alloc_init())
+ *
+ *	Runtime allocations (cpu_alloc)
+ *
+ * my_cpu_offset + cpu_alloc_end
+ *	( >= my_cpu_offset + PERCPU_ENOUGH_ROOM)
+ *
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define __per_cpu_size (__per_cpu_end - __per_cpu_start)
+
+/*
+ * Basic allocation unit. A bitmap tracks the use of each UNIT_SIZE
+ * element in the cpu area. This also defines the basic alignment of
+ * boot time allocations.
+ */
+#define UNIT_SIZE sizeof(int)
+
+/* Start of the allocatable area. boot_cpu_alloc() moves it forward. */
+static unsigned long cpu_alloc_start = (unsigned long)__per_cpu_end;
+
+/* cpu_alloc_end may be set externally to specify the size of the area */
+unsigned long cpu_alloc_end;
+
+/* How many units are needed for an object of a given size */
+static int size_to_units(unsigned long size)
+{
+	return DIV_ROUND_UP(size, UNIT_SIZE);
+}
+
+/* Lock to protect the bitmap and the meta data for the cpu allocator. */
+static DEFINE_SPINLOCK(cpu_alloc_map_lock);
+static unsigned long nr_units;		/* Total number of units in the map */
+static unsigned long *cpu_alloc_map;	/* One bit per unit, set when in use */
+static unsigned long first_free;	/* First known free unit */
+
+/*
+ * Reserve per cpu space during boot, before cpu_alloc_init() runs.
+ * Returns the aligned address; the padding needed to reach that
+ * alignment is consumed along with the object.
+ */
+void * __init boot_cpu_alloc(unsigned long size, unsigned long align)
+{
+	unsigned long balign = max_t(unsigned long, UNIT_SIZE, align);
+	unsigned long x = ALIGN(cpu_alloc_start, balign);
+	unsigned long tsize = ALIGN(size, UNIT_SIZE);
+
+	/* Continue after the object, x may lie beyond the old start */
+	cpu_alloc_start = x + tsize;
+
+	/* Check for overflow */
+	BUG_ON(cpu_alloc_start >= (unsigned long)__per_cpu_start +
+						PERCPU_ENOUGH_ROOM);
+	return (void *)x;
+}
+
+/*
+ * Mark an object as used in the cpu_alloc_map
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void set_map(int start, int length)
+{
+	while (length-- > 0)
+		__set_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Mark an area as freed.
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void clear_map(int start, int length)
+{
+	while (length-- > 0)
+		__clear_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Allocate an object of a certain size
+ *
+ * Returns a special pointer that can be used with CPU_PTR to find the
+ * address of the object for a certain cpu, or NULL if the cpu area has
+ * no free range that fits.
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfpflags, unsigned long align)
+{
+	unsigned long start;
+	int units = size_to_units(size);
+	/* Alignments below UNIT_SIZE must not yield a zero divisor */
+	unsigned long align_units = max_t(unsigned long, 1, align / UNIT_SIZE);
+	void *ptr;
+	int first;
+	unsigned long flags;
+
+	BUG_ON(gfpflags & ~(GFP_RECLAIM_MASK | __GFP_ZERO));
+
+	spin_lock_irqsave(&cpu_alloc_map_lock, flags);
+
+	first = 1;
+	start = first_free;
+
+	for ( ; ; ) {
+		/* Search the whole map (nr_units), not just the request size */
+		start = find_next_zero_bit(cpu_alloc_map, nr_units, start);
+		if (start >= nr_units)
+			goto out_of_memory;
+
+		if (first)
+			first_free = start;
+
+		/*
+		 * Check alignment and that there is enough space after
+		 * the starting unit.
+		 */
+		if (start % align_units == 0 &&
+			find_next_bit(cpu_alloc_map, nr_units, start + 1) >=
+							start + units)
+				break;
+		start++;
+		first = 0;
+	}
+
+	if (first)
+		first_free = start + units;
+
+	if (start + units > nr_units)
+		goto out_of_memory;
+
+	set_map(start, units);
+	__count_vm_events(CPU_BYTES, units * UNIT_SIZE);
+
+	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
+
+	ptr = (void *)cpu_alloc_start + start * UNIT_SIZE;
+
+	if (gfpflags & __GFP_ZERO) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			memset(CPU_PTR(ptr, cpu), 0, size);
+	}
+
+	return ptr;
+
+out_of_memory:
+	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
+	return NULL;
+}
+EXPORT_SYMBOL(cpu_alloc);
+
+/*
+ * Free an object. The pointer must be a cpu pointer allocated
+ * via cpu_alloc.
+ */
+void cpu_free(void *start, unsigned long size)
+{
+	int units = size_to_units(size);
+	unsigned long index;
+	unsigned long p = (unsigned long)start;
+	unsigned long flags;
+
+	BUG_ON(p < cpu_alloc_start);
+	index = (p - cpu_alloc_start) / UNIT_SIZE;
+	/* Range check against the map size first so test_bit stays in bounds */
+	BUG_ON(index >= nr_units || !test_bit(index, cpu_alloc_map));
+
+	spin_lock_irqsave(&cpu_alloc_map_lock, flags);
+
+	clear_map(index, units);
+	__count_vm_events(CPU_BYTES, -units * UNIT_SIZE);
+	if (index < first_free)
+		first_free = index;
+
+	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
+}
+EXPORT_SYMBOL(cpu_free);
+
+/*
+ * Setup cpu_alloc. All boot time boot_cpu_alloc() must have been completed.
+ */
+void __init cpu_alloc_init(void)
+{
+	unsigned long size;
+	unsigned long reserve;
+	unsigned long per_cpu_start = (unsigned long)__per_cpu_start;
+
+	reserve = ALIGN(cpu_alloc_start, cache_line_size()) - per_cpu_start;
+
+	if (!cpu_alloc_end)
+		cpu_alloc_end = PERCPU_ENOUGH_ROOM;
+
+	size = cpu_alloc_end - reserve;
+	nr_units = size_to_units(size);
+	/* The map needs one bit per unit, not UNIT_SIZE bytes per unit */
+	cpu_alloc_map = alloc_bootmem(BITS_TO_LONGS(nr_units) * sizeof(long));
+	cpu_alloc_start = per_cpu_start + reserve;
+	printk("CPU_ALLOC: size=%lu reserved=%lu static=%lu unitsize=%lu units=%lu\n",
+		cpu_alloc_end, reserve, (unsigned long)__per_cpu_size, (unsigned long)UNIT_SIZE, nr_units);
+}
+
Index: linux-2.6/mm/vmstat.c =================================================================== --- linux-2.6.orig/mm/vmstat.c 2008-01-28 18:56:39.000000000 -0800 +++ linux-2.6/mm/vmstat.c 2008-01-28 19:05:09.000000000 -0800 @@ -642,6 +642,7 @@ static const char * const vmstat_text[] "allocstall", "pgrotated", + "cpu_bytes", #endif }; Index: linux-2.6/include/linux/percpu.h =================================================================== --- linux-2.6.orig/include/linux/percpu.h 2008-01-28 19:04:58.000000000 -0800 +++ linux-2.6/include/linux/percpu.h 2008-01-28 20:15:20.000000000 -0800 @@ -134,4 +134,59 @@ static inline void percpu_free(void *__p #define free_percpu(ptr) percpu_free((ptr)) #define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) + +/* + * cpu allocator definitions + * + * The cpu allocator allows allocating an array of objects on all processors. + * A single pointer can then be used to access the instance of the object + * on a particular processor. + * + * Cpu objects are typically small. The allocator packs them tightly + * to increase the chance on each access that a per cpu object is already + * cached. Alignments may be specified but the intent is to align the data + * properly due to cpu alignment constraints and not to avoid cacheline + * contention. Any holes left by aligning objects are filled up with smaller + * objects that are allocated later. 
+ *
+ * Cpu data can be allocated using CPU_ALLOC. The resulting pointer is
+ * pointing to the instance of the variable on cpu 0. It is generally an
+ * error to use the pointer directly unless we are running on cpu 0. So
+ * the use is valid during boot for example.
+ *
+ * The GFP flags have their usual function: __GFP_ZERO zeroes the object
+ * and other flags may be used to control reclaim behavior if the cpu
+ * areas have to be extended. However, zones cannot be selected nor
+ * can locality constraint flags be used.
+ *
+ * CPU_PTR() may be used to calculate the pointer for a specific processor.
+ * CPU_PTR is highly scalable since it simply adds the shifted value of
+ * smp_processor_id() to the base.
+ *
+ * Note: Synchronization is up to the caller. If preemption is disabled
+ * then it is generally safe to access cpu variables (unless they are also
+ * handled from an interrupt context).
+ */
+
+#define CPU_PTR(__p, __cpu)	percpu_ptr((__p), (__cpu))
+
+#define CPU_ALLOC(type, flags)	cpu_alloc(sizeof(type), (flags), \
+					__alignof__(type))
+#define CPU_FREE(pointer)	cpu_free((pointer), sizeof(*(pointer)))
+
+#define THIS_CPU(__p)	SHIFT_PERCPU_PTR((__p), my_cpu_offset)
+#define __THIS_CPU(__p)	SHIFT_PERCPU_PTR((__p), __my_cpu_offset)
+
+/*
+ * Raw calls
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfp, unsigned long align);
+void cpu_free(void *cpu_pointer, unsigned long size);
+
+/*
+ * Early boot allocator for per_cpu variables and special per cpu areas.
+ * Allocations are not tracked and cannot be freed.
+ */ +void *boot_cpu_alloc(unsigned long size, unsigned long align); + #endif /* __LINUX_PERCPU_H */ Index: linux-2.6/init/main.c =================================================================== --- linux-2.6.orig/init/main.c 2008-01-28 18:55:30.000000000 -0800 +++ linux-2.6/init/main.c 2008-01-28 19:07:23.000000000 -0800 @@ -89,6 +89,7 @@ extern void pidmap_init(void); extern void prio_tree_init(void); extern void radix_tree_init(void); extern void free_initmem(void); +extern void cpu_alloc_init(void); #ifdef CONFIG_ACPI extern void acpi_early_init(void); #else @@ -567,6 +568,7 @@ asmlinkage void __init start_kernel(void "enabled *very* early, fixing it\n"); local_irq_disable(); } + cpu_alloc_init(); sort_main_extable(); trap_init(); rcu_init();