8p systems < 2k for slab / slub ~16k for complete bootup But Dave wants to run 100000 ip tunnels with this. And we want our 16k cpu /1 k node systems (~200k in pageset structs) --- include/asm-x86/pgtable_64.h | 1 include/linux/cpu_alloc.h | 71 ++++++++ include/linux/vmstat.h | 2 mm/Kconfig | 32 +++ mm/Makefile | 3 mm/cpu_alloc.c | 371 +++++++++++++++++++++++++++++++++++++++++++ mm/vmstat.c | 1 7 files changed, 478 insertions(+), 3 deletions(-) Index: linux-2.6/mm/cpu_alloc.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/mm/cpu_alloc.c 2007-11-03 17:22:15.008818301 -0700 @@ -0,0 +1,371 @@ +/* + * Cpu allocator - Manage objects allocated for each processor + * + * (C) 2007 SGI, Christoph Lameter + * Basic implementation with allocation and free from a dedicated per + * cpu area. + * + * The per cpu allocator allows dynamic allocation of memory on all + * processors simultaneously. A bitmap is used to track used areas. + * The allocator implements tight packing to reduce cache footprint + * and increase speed since cacheline contention is typically not a concern + * for memory mainly used by a single cpu. Small objects will fill up gaps + * left by larger allocations that required alignments. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(cpu_alloc_map_lock); + +/* + * Basic allocation unit. A bit map is created to track the use of each + * UNIT_SIZE element in the cpu area. + */ +#define BLOCK_SIZE (1UL << (CONFIG_CPU_AREA_ALLOC_ORDER + PAGE_SHIFT)) +#define UNIT_SIZE sizeof(int) +#define UNITS_PER_BLOCK (BLOCK_SIZE / UNIT_SIZE) + +#ifdef CONFIG_CPU_AREA_VIRTUAL + +/* + * The maximum number of blocks is the maximum size of the + * cpu area for one processor divided by the size of an allocation + * block. 
+ */
+#define MAX_BLOCKS (1UL << (CONFIG_CPU_AREA_ORDER - \
+				CONFIG_CPU_AREA_ALLOC_ORDER))
+
+/* Allocation state. All updates are done with cpu_alloc_map_lock held. */
+static unsigned long *cpu_alloc_map = NULL;
+static int cpu_alloc_map_order = -1;	/* Page order of the map, -1: no map yet */
+static unsigned long active_blocks;	/* Number of blocks allocated on each cpu */
+static unsigned long units_free;	/* Number of available units */
+static unsigned long units_total;	/* Total available units */
+
+/*
+ * Allocate a block of memory to be used to provide cpu area memory
+ * or to extend the bitmap for the cpu map.
+ *
+ * The memory is taken from the page allocator on the given node. Returns
+ * the kernel address of the block or NULL if the allocation failed.
+ *
+ * NOTE(review): there is no bootmem fallback here, so this presumably
+ * must not be called before the page allocator is available - confirm.
+ */
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node)
+{
+	struct page *page = alloc_pages_node(node, flags, get_order(size));
+
+	if (!page)
+		return NULL;
+	return page_address(page);
+}
+
+/*
+ * Page table population helpers, one per page table level.
+ *
+ * Each helper looks up the entry for addr in init_mm and, if the entry is
+ * still empty, allocates a page on the given node and installs it. The
+ * entry is returned, or NULL if the page could not be allocated.
+ *
+ * At the pte level the new page is the cpu area memory itself and is only
+ * zeroed if the caller asked for it in flags.
+ */
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	if (pte_none(*pte)) {
+		pte_t entry;
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+
+		if (!p)
+			return NULL;
+		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+	return pte;
+}
+
+/*
+ * The pmd, pud and pgd helpers install pages that are used as page tables.
+ * Those must start out empty, so __GFP_ZERO is forced for them no matter
+ * what the caller passed in flags.
+ */
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	if (pmd_none(*pmd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE,
+					flags | __GFP_ZERO, node);
+
+		if (!p)
+			return NULL;
+		pmd_populate_kernel(&init_mm, pmd, p);
+	}
+	return pmd;
+}
+
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+
+	if (pud_none(*pud)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE,
+					flags | __GFP_ZERO, node);
+
+		if (!p)
+			return NULL;
+		pud_populate(&init_mm, pud, p);
+	}
+	return pud;
+}
+
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node)
+{
+	pgd_t *pgd = pgd_offset_k(addr);
+
+	if (pgd_none(*pgd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE,
+					flags | __GFP_ZERO, node);
+
+		if (!p)
+			return NULL;
+		pgd_populate(&init_mm, pgd, p);
+	}
+	return pgd;
+}
+
+/*
+ * Map base pages for the range start .. start + size, walking the kernel
+ * page tables page by page and filling in whatever is missing.
+ *
+ * Returns 0 on success or -ENOMEM if a page could not be allocated. Pages
+ * that were mapped before the failure stay mapped.
+ */
+int cpu_area_populate_basepages(void *start, unsigned long size,
+						gfp_t flags, int node)
+{
+	unsigned long addr = (unsigned long)start;
+	unsigned long end = addr + size;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		pgd = cpu_area_pgd_populate(addr, flags, node);
+		if (!pgd)
+			return -ENOMEM;
+		pud = cpu_area_pud_populate(pgd, addr, flags, node);
+		if (!pud)
+			return -ENOMEM;
+		pmd = cpu_area_pmd_populate(pud, addr, flags, node);
+		if (!pmd)
+			return -ENOMEM;
+		pte = cpu_area_pte_populate(pmd, addr, flags, node);
+		if (!pte)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * Default way of populating a piece of the cpu area. The symbol is weak
+ * so that another definition can replace the base page mapping.
+ */
+int __attribute__((weak)) cpu_area_populate(void *start, unsigned long size,
+					gfp_t flags, int node)
+{
+	return cpu_area_populate_basepages(start, size, flags, node);
+}
+
+/*
+ * Not enough memory in the cpu areas. Extend the areas on all processors
+ * by another block.
+ *
+ * Must be called with cpu_alloc_map_lock held. The lock is dropped while
+ * memory is allocated and is held again on every return path.
+ *
+ * Returns 0 if the cpu area has grown (either here or, while the lock was
+ * dropped, through another processor) and a negative errno otherwise.
+ */
+static int expand_cpu_area(gfp_t flags)
+{
+	unsigned long blocks = active_blocks;
+	unsigned long bits;
+	int cpu;
+	int err = -ENOMEM;
+	int map_order;
+	unsigned long *new_map = NULL;
+	void *start;
+
+	if (active_blocks == MAX_BLOCKS)
+		goto out;
+
+	spin_unlock(&cpu_alloc_map_lock);
+	/*
+	 * Determine the size of the bit map needed
+	 */
+	bits = (blocks + 1) * UNITS_PER_BLOCK;
+	map_order = get_order(DIV_ROUND_UP(bits, 8));
+	start = (void *)CPU_AREA_BASE + blocks * BLOCK_SIZE;
+
+	for_each_possible_cpu(cpu) {
+		err = cpu_area_populate(CPU_PTR(start, cpu), BLOCK_SIZE,
+			flags, cpu_to_node(cpu));
+
+		if (err) {
+			spin_lock(&cpu_alloc_map_lock);
+			goto out;
+		}
+	}
+
+	if (map_order > cpu_alloc_map_order) {
+		new_map = cpu_area_alloc_block(PAGE_SIZE << map_order,
+						flags | __GFP_ZERO, 0);
+		if (!new_map) {
+			/*
+			 * err is 0 after the populate loop. Report the
+			 * failure and retake the lock, the caller expects
+			 * to hold it when we return.
+			 */
+			err = -ENOMEM;
+			spin_lock(&cpu_alloc_map_lock);
+			goto out;
+		}
+	}
+
+	spin_lock(&cpu_alloc_map_lock);
+	/*
+	 * We dropped the lock. Another processor may have already extended
+	 * the cpu area size as needed.
+	 */
+	if (blocks != active_blocks) {
+		if (new_map)
+			free_pages((unsigned long)new_map,
+						map_order);
+		err = 0;
+		goto out;
+	}
+
+	if (new_map) {
+		/*
+		 * Need to extend the bitmap
+		 *
+		 * NOTE(review): the pages of the old map are not freed here.
+		 */
+		if (cpu_alloc_map)
+			memcpy(new_map, cpu_alloc_map,
+				PAGE_SIZE << cpu_alloc_map_order);
+		cpu_alloc_map = new_map;
+		cpu_alloc_map_order = map_order;
+	}
+	active_blocks++;
+	units_total += UNITS_PER_BLOCK;
+	units_free += UNITS_PER_BLOCK;
+	err = 0;
+out:
+	return err;
+}
+#else
+/*
+ * Static fallback configuration. The cpu areas are of a fixed size and
+ * cannot be extended. Such configurations are mainly useful for SMP on
+ * machines that do not have MMU support.
+ */
+#define MAX_BLOCKS 1
+
+u8 cpu_area[NR_CPUS * BLOCK_SIZE];
+static DECLARE_BITMAP(cpu_alloc_map, UNITS_PER_BLOCK);
+#define CPU_AREA_BASE ((unsigned long)cpu_area)
+static int units_free = UNITS_PER_BLOCK;
+#define units_total UNITS_PER_BLOCK
+
+/* The static cpu area cannot grow, so expansion always fails. */
+static inline int expand_cpu_area(gfp_t flags)
+{
+	return -ENOSYS;
+}
+#endif
+
+static int end_full_alloc;	/* 0 .. end_full_alloc are all allocated */
+static int begin_all_free;	/* begin_all_free .. units_total are all free */
+
+/*
+ * How many units are needed for an object of a given size
+ */
+static int size_to_units(unsigned long size)
+{
+	return DIV_ROUND_UP(size, UNIT_SIZE);
+}
+
+/*
+ * Mark an object as used in the cpu_alloc_map
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void set_map(int start, int length)
+{
+	while (length-- > 0)
+		__set_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Mark an area as freed.
+ *
+ * Must hold cpu_alloc_map_lock
+ */
+static void clear_map(int start, int length)
+{
+	while (length-- > 0)
+		__clear_bit(start++, cpu_alloc_map);
+}
+
+/*
+ * Allocate an object of a certain size
+ *
+ * Returns a per cpu pointer that must not be directly used.
+ *
+ * The size is rounded up to whole units of UNIT_SIZE. The allocation map
+ * is searched for the first run of free units that is long enough and
+ * starts on an align boundary. If there is none then the cpu areas are
+ * expanded and the search is repeated. With __GFP_ZERO the object is
+ * cleared on every possible cpu.
+ *
+ * Returns NULL for a zero size and if the cpu areas cannot be expanded
+ * any further.
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfpflags, unsigned long align)
+{
+	unsigned long start;
+	unsigned long end;
+	unsigned long align_units = align / UNIT_SIZE;
+	int units = size_to_units(size);
+	void *ptr;
+	int first;
+
+	BUG_ON(gfpflags & ~(GFP_RECLAIM_MASK | __GFP_ZERO));
+
+	/*
+	 * A run of zero units is never found by the search below. Trying
+	 * would only expand the cpu areas until they are exhausted.
+	 */
+	if (!units)
+		return NULL;
+
+	/*
+	 * Units are the smallest granularity of the map. An alignment below
+	 * UNIT_SIZE is met by any unit and must not end up as a modulo by
+	 * zero in the alignment check.
+	 */
+	if (!align_units)
+		align_units = 1;
+
+	spin_lock(&cpu_alloc_map_lock);
+restart:
+	/* first stays set as long as no free unit has been skipped */
+	first = 1;
+	start = end_full_alloc;
+	for ( ; ; ) {
+		while (start < units_total &&
+			test_bit(start, cpu_alloc_map))
+			start++;
+
+		if (first)
+			end_full_alloc = start;
+
+		if (start == units_total) {
+			/* expand_cpu_area returns with the lock held */
+			if (!expand_cpu_area(gfpflags))
+				goto restart;
+			spin_unlock(&cpu_alloc_map_lock);
+			return NULL;
+		}
+
+		/* Ok we found a free cell */
+		end = start + 1;
+		/* Alignment okay ? */
+		if (start % align_units == 0) {
+			/* See if we can find enough units for the object */
+			while (end < units_total &&
+				end - start < (unsigned long)units &&
+				!test_bit(end, cpu_alloc_map))
+				end++;
+			if (end - start == (unsigned long)units)
+				break;
+		}
+		start = end;
+		first = 0;
+	}
+
+	if (first)
+		end_full_alloc = end;
+	if (end > begin_all_free)
+		begin_all_free = end;
+
+	set_map(start, units);
+	units_free -= units;
+	__count_vm_events(CPU_BYTES, units * UNIT_SIZE);
+	spin_unlock(&cpu_alloc_map_lock);
+
+	ptr = (void *)CPU_AREA_BASE + start * UNIT_SIZE;
+	if (gfpflags & __GFP_ZERO) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			memset(CPU_PTR(ptr, cpu), 0, size);
+	}
+	return ptr;
+}
+EXPORT_SYMBOL(cpu_alloc);
+
+/*
+ * Free an object. The pointer must be a per cpu pointer allocated
+ * via cpu_alloc.
+ *
+ * The size determines how many units are released and should be the size
+ * that was passed to cpu_alloc for this object.
+ */
+void cpu_free(void *pcpu, unsigned long size)
+{
+	unsigned long start = (unsigned long)pcpu;
+	int units = size_to_units(size);
+	int index;
+
+	BUG_ON(start < CPU_AREA_BASE);
+	index = (start - CPU_AREA_BASE) / UNIT_SIZE;
+
+	/*
+	 * The map may be replaced when the cpu area is expanded, so it is
+	 * only inspected with the lock held. The range is checked before
+	 * the map is touched so that a bad pointer cannot index beyond it.
+	 */
+	spin_lock(&cpu_alloc_map_lock);
+	BUG_ON(index >= units_total || index + units > units_total);
+	BUG_ON(!test_bit(index, cpu_alloc_map));
+	clear_map(index, units);
+	units_free += units;
+	__count_vm_events(CPU_BYTES, -units * UNIT_SIZE);
+	/* end_full_alloc is a unit index, not an address */
+	if (index < end_full_alloc)
+		end_full_alloc = index;
+	spin_unlock(&cpu_alloc_map_lock);
+}
+EXPORT_SYMBOL(cpu_free);
+
+
Index: linux-2.6/include/linux/vmstat.h =================================================================== --- linux-2.6.orig/include/linux/vmstat.h 2007-11-03 16:11:32.890977379 -0700 +++ linux-2.6/include/linux/vmstat.h 2007-11-03 16:11:55.047227103 -0700 @@ -36,7 +36,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PS FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), PGINODESTEAL, SLABS_SCANNED, KSWAPD_STEAL, KSWAPD_INODESTEAL, - PAGEOUTRUN, ALLOCSTALL, PGROTATED, + PAGEOUTRUN, ALLOCSTALL, PGROTATED, CPU_BYTES, NR_VM_EVENT_ITEMS }; Index: linux-2.6/mm/Makefile =================================================================== --- linux-2.6.orig/mm/Makefile 2007-11-03 16:11:32.878977412 -0700 +++ linux-2.6/mm/Makefile 2007-11-03 16:11:55.051227258 -0700 @@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o page_alloc.o page-writeback.o pdflush.o \ readahead.o swap.o truncate.o vmscan.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o $(mmu-y) + page_isolation.o cpu_alloc.o $(mmu-y) obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o @@ -30,4 +30,3 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o - Index: linux-2.6/mm/vmstat.c =================================================================== --- linux-2.6.orig/mm/vmstat.c 
2007-11-03 16:11:32.886977305 -0700 +++ linux-2.6/mm/vmstat.c 2007-11-03 16:11:55.051227258 -0700 @@ -642,6 +642,7 @@ static const char * const vmstat_text[] "allocstall", "pgrotated", + "cpu_bytes", #endif }; Index: linux-2.6/include/linux/cpu_alloc.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6/include/linux/cpu_alloc.h 2007-11-03 17:17:59.271179201 -0700 @@ -0,0 +1,71 @@ +/* + * include/linux/cpu_alloc.h - cpu allocator definitions + * + * The cpu allocator allows allocating an array of objects on all processors. + * A single pointer can then be used to access the instance of the object + * on a particular processor. + * + * Cpu objects are typically small. The allocator packs them tightly + * to increase the chance on each access that a per cpu object is already + * cached. Alignments may be specified but the intent is to align the data + * properly due to cpu alignment constraints and not to avoid cacheline + * contention. Any holes left by aligning objects are filled up with smaller + * objects that are allocated later. + * + * Cpu data can be allocated using CPU_ALLOC. The resulting pointer is + * pointing to the instance of the variable on cpu 0. It is generally an + * error to use the pointer directly unless we are running on cpu 0. So + * the use is valid during boot for example. + * + * The GFP flags have their usual function: __GFP_ZERO zeroes the object + * and other flags may be used to control reclaim behavior if the cpu + * areas have to be extended. However, zones cannot be selected nor + * can locality constraint flags be used. + * + * CPU_PTR() may be used to calculate the pointer for a specific processor. + * CPU_PTR is highly scalable since it simply adds the shifted value of + * smp_processor_id() to the base. + * + * Note: Synchronization is up to caller. 
If preemption is disabled then
+ * it is generally safe to access cpu variables (unless they are also
+ * handled from an interrupt context).
+ *
+ * The cpu allocator falls back to slab operations for the !SMP case.
+ * If the cpu allocator is used during early boot before slab bootstrap
+ * is complete then the UP case must be handled in a special way.
+ */
+#ifndef _LINUX_CPU_ALLOC_H_
+#define _LINUX_CPU_ALLOC_H_
+
+/* Byte offset of the cpu area of processor __cpu from that of cpu 0 */
+#define CPU_OFFSET(__cpu) \
+	((unsigned long)(__cpu) << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))
+
+/*
+ * Translate the per cpu pointer __p into the pointer to the instance that
+ * belongs to processor __cpu. The result has the type of __p.
+ *
+ * The offset must be derived from the macro argument __cpu and not from
+ * whatever variable named "cpu" happens to be visible at the call site.
+ */
+#define CPU_PTR(__p, __cpu)	((__typeof__(__p))((void *)(__p) + \
+						CPU_OFFSET(__cpu)))
+
+/* Allocate / free a per cpu object, size and alignment come from the type */
+#define CPU_ALLOC(type, flags)	cpu_alloc(sizeof(type), (flags), \
+					__alignof__(type))
+#define CPU_FREE(pointer)	cpu_free((pointer), sizeof(*(pointer)))
+
+/*
+ * Raw calls
+ */
+void *cpu_alloc(unsigned long size, gfp_t gfp, unsigned long align);
+void cpu_free(void *cpu_pointer, unsigned long size);
+
+/* Functions for populating the per cpu areas mappings */
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node);
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+						gfp_t flags, int node);
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+						gfp_t flags, int node);
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+						gfp_t flags, int node);
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node);
+int cpu_area_populate_basepages(void *start, unsigned long size,
+						gfp_t flags, int node);
+int cpu_area_populate(void *start, unsigned long size,
+						gfp_t flags, int node);
+
+#endif /* _LINUX_CPU_ALLOC_H_ */
+
Index: linux-2.6/mm/Kconfig =================================================================== --- linux-2.6.orig/mm/Kconfig 2007-11-03 16:17:58.215727669 -0700 +++ linux-2.6/mm/Kconfig 2007-11-03 17:21:18.055976715 -0700 @@ -194,3 +194,35 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +# +# CPU area management +# +config CPU_AREA_VIRTUAL + bool"Virtually mapped cpu 
data area" + def_bool n + help + Enabling a virtual cpu area makes the size of the cpu area dynamic. + More memory is mapped into the cpu area if the system allocates too + much per cpu data. If this is off then a default static area of 64k + is reserved for cpu allocations which is sufficient for most needs. + +config CPU_AREA_ORDER + int "Maximum order of CPU area" + default "16" if CPU_AREA_VIRTUAL + default "4" if !CPU_AREA_VIRTUAL + help + Sets the maximum amount of memory that can be allocated via cpu_alloc. + The size is set in page order. The size set (times the maximum + number of processors) determines the amount of virtual memory that + is set aside for the per cpu areas. + +config CPU_AREA_ALLOC_ORDER + int "Allocation order for CPU area expansion" + default "0" if CPU_AREA_VIRTUAL + help + If the cpu_alloc area is running out of memory then the per cpu area + is expanded by the indicated amount of memory. Some architectures can + perform higher order mapping. In that case allocations need to be + performed at that page order in order to use that capability. + Index: linux-2.6/include/asm-x86/pgtable_64.h =================================================================== --- linux-2.6.orig/include/asm-x86/pgtable_64.h 2007-11-03 17:26:37.311477168 -0700 +++ linux-2.6/include/asm-x86/pgtable_64.h 2007-11-03 17:28:15.539226791 -0700 @@ -138,6 +138,7 @@ static inline pte_t ptep_get_and_clear_f #define VMALLOC_START _AC(0xffffc20000000000, UL) #define VMALLOC_END _AC(0xffffe1ffffffffff, UL) #define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define CPU_AREA_BASE _AC(0xfffff20000000000, UL) #define MODULES_VADDR _AC(0xffffffff88000000, UL) #define MODULES_END _AC(0xfffffffffff00000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR)