X86_64: Fixed base cpu area

Put the base cpu area at a fixed location that is reachable from the
kernel text segment. This avoids a lot of offset calculations and will
allow the removal of the arrays of pointers pointing to per cpu areas.

The choice of location is a bit awkward right now: I stuffed it between
the kernel and the modules. The optimal layout would be to rearrange
kernel text and modules so that the cpu area can be placed behind the
modules area, but that area is currently at the end of the address
space. Something like this:

1. Kernel text
2. Modules
3. cpu area for processor 0 (canonical per cpu pointers) (CPU_AREA_BASE)
4. cpu areas for the other processors

Areas 1-3 must be within 2GB so that 32 bit offsets can reach all
kernel variables. The per cpu offsets of per cpu variables can then be
calculated at link time by ld instead of the current runtime
calculations.

Area 4 needs to be pretty large to support 16k cpus at 16M per cpu
each: 256GB is needed. Maybe reserve a terabyte for this area just to
be safe? Then we would need to shift the kernel and the modules area
down.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

---
 arch/x86/kernel/setup64.c        |    6 +++---
 arch/x86/kernel/vmlinux_64.lds.S |    4 +++-
 include/linux/percpu.h           |    9 +++++----
 mm/cpu_alloc.c                   |   13 ++++++-------
 4 files changed, 17 insertions(+), 15 deletions(-)

Index: linux-2.6/arch/x86/kernel/vmlinux_64.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_64.lds.S	2007-11-18 09:21:14.290783076 -0800
+++ linux-2.6/arch/x86/kernel/vmlinux_64.lds.S	2007-11-18 09:24:14.873783142 -0800
@@ -6,6 +6,7 @@
 
 #include <asm-generic/vmlinux.lds.h>
 #include <asm/page.h>
+#include <asm/percpu.h>
 
 #undef i386	/* in case the preprocessor is a 32bit one */
 
@@ -16,6 +17,7 @@ jiffies_64 = jiffies;
 _proxy_pda = 1;
 PHDRS {
 	text PT_LOAD FLAGS(5);	/* R_E */
+	percpu PT_LOAD FLAGS(4);
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
@@ -203,7 +205,7 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
-  PERCPU(4096)
+  FIXED_ADDR_PERCPU(0, 4096)
 
   . = ALIGN(4096);
   __init_end = .;

Index: linux-2.6/arch/x86/kernel/setup64.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/setup64.c	2007-11-18 09:21:42.245033143 -0800
+++ linux-2.6/arch/x86/kernel/setup64.c	2007-11-18 09:24:14.889645156 -0800
@@ -96,8 +96,8 @@ void __init setup_per_cpu_areas(void)
 	/* Copy section for each CPU (we discard the original) */
 	base = boot_cpu_alloc(PERCPU_ENOUGH_ROOM);
-	if (!base)
-		panic("Cannot allocate cpu data\n");
+	if (base)
+		panic("Cannot allocate cpu data at offset 0\n");
 
 	printk(KERN_INFO "PERCPU: Allocating %lu bytes of per cpu data\n",
 		PERCPU_ENOUGH_ROOM);
 
@@ -111,7 +111,7 @@
 		 */
 		cpu_pda(i)->data_offset = base_for_cpu - __per_cpu_start;
 
-		memcpy(base_for_cpu, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		memcpy(base_for_cpu, __load_per_cpu_start, __per_cpu_end - __per_cpu_start);
 		pda_for_cpu = &per_cpu(pda, i);
 
 		/* Relocate the pda */

Index: linux-2.6/mm/cpu_alloc.c
===================================================================
--- linux-2.6.orig/mm/cpu_alloc.c	2007-11-18 09:21:34.381533132 -0800
+++ linux-2.6/mm/cpu_alloc.c	2007-11-18 09:24:14.889645156 -0800
@@ -210,8 +210,7 @@ static int expand_cpu_area(gfp_t flags)
 
 	map_order = get_order(DIV_ROUND_UP(bits, 8));
 	BUG_ON(map_order >= MAX_ORDER);
-	start = cpu_area + \
-		(blocks << (PAGE_SHIFT + CONFIG_CPU_AREA_ALLOC_ORDER));
+	start = (void *)(blocks << (PAGE_SHIFT + CONFIG_CPU_AREA_ALLOC_ORDER));
 
 	for_each_possible_cpu(cpu) {
 		err = cpu_area_populate(CPU_PTR(start, cpu), ALLOC_SIZE,
@@ -285,7 +284,7 @@ void * __init boot_cpu_alloc(unsigned lo
 	expand_cpu_area(BOOT_ALLOC);
 	units_reserved += units;
 	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
-	return cpu_area + x * UNIT_SIZE;
+	return (void *)(x * UNIT_SIZE);
 }
 
 #else
@@ -408,7 +407,7 @@ restart:
 
 	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
 
-	ptr = cpu_area + (start + units_reserved) * UNIT_SIZE;
+	ptr = (void *)((start + units_reserved) * UNIT_SIZE);
 
 	if (gfpflags & __GFP_ZERO) {
 		int cpu;
@@ -433,11 +432,11 @@ void cpu_free(void *start, unsigned long
 {
 	int units = size_to_units(size);
 	int index;
-	u8 *p = start;
+	unsigned long p = (unsigned long)start;
 	unsigned long flags;
 
-	BUG_ON(p < (cpu_area + units_reserved * UNIT_SIZE));
-	index = (p - cpu_area) / UNIT_SIZE - units_reserved;
+	BUG_ON(p < units_reserved * UNIT_SIZE);
+	index = p / UNIT_SIZE - units_reserved;
 
 	BUG_ON(!test_bit(index, cpu_alloc_map) ||
 		index >= units_total - units_reserved);

Index: linux-2.6/include/linux/percpu.h
===================================================================
--- linux-2.6.orig/include/linux/percpu.h	2007-11-18 09:21:42.225624645 -0800
+++ linux-2.6/include/linux/percpu.h	2007-11-18 09:24:59.153533098 -0800
@@ -66,18 +66,19 @@ DECLARE_PER_CPU(cpumask_t, cpu_mask);
  * handled from an interrupt context).
  */
 
+#define __CPU_OFFSET(__cpu) \
+	(CPU_AREA_BASE + ((unsigned long)(__cpu) << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT)))
+
 #ifdef CONFIG_DEBUG_VM
 #define CPU_OFFSET(__cpu) ({						\
 	BUG_ON(!cpu_isset((__cpu), cpu_possible_map));			\
 	if (system_state == SYSTEM_RUNNING)				\
 		WARN_ON(!cpu_isset((__cpu), cpu_online_map));		\
-	(((unsigned long)(__cpu) <<					\
-		(CONFIG_CPU_AREA_ORDER + PAGE_SHIFT)));			\
+	(__CPU_OFFSET(__cpu));						\
 })
 #else
-#define CPU_OFFSET(__cpu)						\
-	((unsigned long)(__cpu) << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))
+#define CPU_OFFSET(__cpu)	__CPU_OFFSET(__cpu)
 #endif
 
 #define CPU_PTR(__p, __cpu)	((__typeof__(__p))((void *)(__p) +	\
 						CPU_OFFSET(__cpu)))
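
To make the address arithmetic concrete, here is a minimal standalone
userspace sketch (not part of the patch) of the __CPU_OFFSET()
calculation introduced above, together with the sizing estimate from
the description. The values chosen for PAGE_SHIFT, CPU_AREA_ORDER and
CPU_AREA_BASE are illustrative assumptions only, not what the kernel
configuration actually uses:

/*
 * Sketch of the fixed-base per cpu offset calculation.
 * All constants are placeholder assumptions for illustration.
 */
#include <stdio.h>

#define PAGE_SHIFT	12			/* 4K pages */
#define CPU_AREA_ORDER	12			/* 2^(12+12) = 16M per cpu (assumed) */
#define CPU_AREA_BASE	0xffffffff84000000UL	/* hypothetical base between kernel and modules */

/* Mirrors the __CPU_OFFSET() macro added in the percpu.h hunk */
#define CPU_OFFSET(cpu) \
	(CPU_AREA_BASE + ((unsigned long)(cpu) << (CPU_AREA_ORDER + PAGE_SHIFT)))

int main(void)
{
	unsigned long per_cpu_size = 1UL << (CPU_AREA_ORDER + PAGE_SHIFT);

	/* cpu 0's area sits at the base itself: canonical per cpu pointers */
	printf("cpu 0 area:   %#lx\n", CPU_OFFSET(0));
	printf("cpu 1 area:   %#lx\n", CPU_OFFSET(1));
	printf("per cpu size: %luM\n", per_cpu_size >> 20);

	/* Sizing from the description: 16k cpus at 16M each */
	printf("16k cpus:     %luG\n", (16384 * per_cpu_size) >> 30);
	return 0;
}

With these assumed values, cpu 0's area coincides with CPU_AREA_BASE
(the canonical per cpu pointers), each further cpu is one 16M stride
higher, and 16384 cpus at 16M each come to exactly 256GB, matching the
figure given in the description.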