Subject: cpu alloc: x86 support 64 bit: Set up a cpu area that allows the use of up to 16MB for each processor. Cpu memory use can grow a bit. For example, if we assume that a pageset occupies 64 bytes of memory and we have 3 zones in each of 1024 nodes then we need 3 * 1k * 16k = 50 million pagesets or 3072 pagesets per processor. This results in a total of 3.2 GB of pagesets. Each cpu needs around 200k of cpu storage for the page allocator alone. So it is worth using a 2M huge mapping here. For the UP and SMP case map the area using 4k ptes. Typical use of per cpu data is around 16k for UP and SMP configurations. It goes up to 45k when the per cpu area is managed by cpu_alloc (see special x86_64 patchset). Allocating in 2M segments would be overkill. For NUMA map the area using 2M PMDs. A large NUMA system may use lots of cpu data for the page allocator data alone. We typically have large amounts of memory around on systems of that size. Using a 2M page size reduces TLB pressure for that case. Some numbers for envisioned maximum configurations of NUMA systems: 4k cpu configurations with 1k nodes: 4096 * 16MB = 64GB of virtual space. Maximum theoretical configuration 16384 processors 1k nodes: 16384 * 16MB = 256GB of virtual space. Both fit within the established limits. 32 bit: Set up a 256 kB area for the cpu areas below the FIXADDR area. The use of the cpu alloc area is pretty minimal on i386. An 8p system with no extras uses only ~8kb. So 256kb should be plenty.
A configuration that supports up to 8 processors takes up 2MB of the scarce virtual address space Signed-off-by: Christoph Lameter --- arch/x86/Kconfig | 15 +++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 1 + arch/x86/mm/init_32.c | 3 +++ arch/x86/mm/init_64.c | 38 ++++++++++++++++++++++++++++++++++++++ include/asm-x86/page_64.h | 3 ++- include/asm-x86/percpu_32.h | 2 ++ include/asm-x86/percpu_64.h | 2 ++ include/asm-x86/pgtable_32.h | 7 +++++-- 8 files changed, 68 insertions(+), 3 deletions(-) Index: linux-2.6/arch/x86/mm/init_64.c =================================================================== --- linux-2.6.orig/arch/x86/mm/init_64.c 2007-11-20 20:23:27.489806034 -0800 +++ linux-2.6/arch/x86/mm/init_64.c 2007-11-20 20:29:35.897256136 -0800 @@ -781,3 +781,41 @@ int __meminit vmemmap_populate(struct pa return 0; } #endif + +#ifdef CONFIG_NUMA +int __meminit cpu_area_populate(void *start, unsigned long size, + gfp_t flags, int node) +{ + unsigned long addr = (unsigned long)start; + unsigned long end = addr + size; + unsigned long next; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + for (; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = cpu_area_pgd_populate(addr, flags, node); + if (!pgd) + return -ENOMEM; + pud = cpu_area_pud_populate(pgd, addr, flags, node); + if (!pud) + return -ENOMEM; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + pte_t entry; + void *p = cpu_area_alloc_block(PMD_SIZE, flags, node); + if (!p) + return -ENOMEM; + + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + mk_pte_huge(entry); + set_pmd(pmd, __pmd(pte_val(entry))); + } + } + + return 0; +} +#endif Index: linux-2.6/arch/x86/Kconfig =================================================================== --- linux-2.6.orig/arch/x86/Kconfig 2007-11-20 20:23:27.501806383 -0800 +++ linux-2.6/arch/x86/Kconfig 2007-11-20 20:47:33.822639182 -0800 @@ -159,6 +159,21 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 + +config CPU_AREA_VIRTUAL + 
bool + default y + +config CPU_AREA_ORDER + int + default "6" if X86_32 + default "12" if X86_64 + +config CPU_AREA_ALLOC_ORDER + int + default "0" if !NUMA || X86_32 + default "9" if NUMA && X86_64 + source "init/Kconfig" menu "Processor type and features" Index: linux-2.6/arch/x86/mm/init_32.c =================================================================== --- linux-2.6.orig/arch/x86/mm/init_32.c 2007-11-20 20:23:27.493806572 -0800 +++ linux-2.6/arch/x86/mm/init_32.c 2007-11-20 20:29:35.978668720 -0800 @@ -674,6 +674,7 @@ void __init mem_init(void) #if 1 /* double-sanity-check paranoia */ printk("virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" + " cpu area: 0x%08lx - 0x%08lx (%4ld kb)\n" #ifdef CONFIG_HIGHMEM " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" #endif @@ -684,6 +685,8 @@ void __init mem_init(void) " .text : 0x%08lx - 0x%08lx (%4ld kB)\n", FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, + CPU_AREA_BASE, FIXADDR_START, + (FIXADDR_START - CPU_AREA_BASE) >> 10, #ifdef CONFIG_HIGHMEM PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, Index: linux-2.6/include/asm-x86/pgtable_32.h =================================================================== --- linux-2.6.orig/include/asm-x86/pgtable_32.h 2007-11-20 20:23:27.517806498 -0800 +++ linux-2.6/include/asm-x86/pgtable_32.h 2007-11-20 20:29:36.009506418 -0800 @@ -79,11 +79,14 @@ void paging_init(void); #define VMALLOC_START (((unsigned long) high_memory + \ 2*VMALLOC_OFFSET-1) & ~(VMALLOC_OFFSET-1)) #ifdef CONFIG_HIGHMEM -# define VMALLOC_END (PKMAP_BASE-2*PAGE_SIZE) +# define CPU_AREA_BASE (PKMAP_BASE - NR_CPUS * \ + (1 << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))) #else -# define VMALLOC_END (FIXADDR_START-2*PAGE_SIZE) +# define CPU_AREA_BASE (FIXADDR_START - NR_CPUS * \ + (1 << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))) #endif +#define VMALLOC_END (CPU_AREA_BASE - 2 * PAGE_SIZE) /* * _PAGE_PSE set in the page directory entry just means that * the page directory entry points 
directly to a 4MB-aligned block of Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S =================================================================== --- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S 2007-11-20 20:23:27.509806350 -0800 +++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S 2007-11-20 20:29:36.089469338 -0800 @@ -26,6 +26,7 @@ OUTPUT_FORMAT("elf32-i386", "elf32-i386" OUTPUT_ARCH(i386) ENTRY(phys_startup_32) jiffies = jiffies_64; +cpu_area = CPU_AREA_BASE; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ Index: linux-2.6/include/asm-x86/page_64.h =================================================================== --- linux-2.6.orig/include/asm-x86/page_64.h 2007-11-20 20:23:27.521806260 -0800 +++ linux-2.6/include/asm-x86/page_64.h 2007-11-20 20:39:44.175751424 -0800 @@ -104,8 +104,9 @@ extern unsigned long phys_base; #define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) #define KERNEL_TEXT_SIZE (40*1024*1024) -#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) +#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) #define PAGE_OFFSET __PAGE_OFFSET +#define CPU_AREA_BASE _AC(0xffffe20000000000, UL) #ifndef __ASSEMBLY__ Index: linux-2.6/include/asm-x86/percpu_64.h =================================================================== --- linux-2.6.orig/include/asm-x86/percpu_64.h 2007-11-20 20:23:27.529806323 -0800 +++ linux-2.6/include/asm-x86/percpu_64.h 2007-11-20 20:29:36.161690091 -0800 @@ -2,6 +2,8 @@ #define _ASM_X8664_PERCPU_H_ #include +#define cpu_area ((void *)CPU_AREA_BASE) + /* Same as asm-generic/percpu.h, except that we store the per cpu offset in the PDA. 
Longer term the PDA and every per cpu variable should be just put into a single section and referenced directly Index: linux-2.6/include/asm-x86/percpu_32.h =================================================================== --- linux-2.6.orig/include/asm-x86/percpu_32.h 2007-11-20 20:23:27.537806270 -0800 +++ linux-2.6/include/asm-x86/percpu_32.h 2007-11-20 20:29:36.166006343 -0800 @@ -28,6 +28,8 @@ #else /* ...!ASSEMBLY */ +char cpu_area[]; + /* * PER_CPU finds an address of a per-cpu variable. *