---
 include/linux/mm.h |   13 ++
 mm/Kconfig         |   10 +
 mm/cpu_alloc.c     |  287 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 299 insertions(+), 11 deletions(-)

Index: linux-2.6.24-rc2-mm1/mm/cpu_alloc.c
===================================================================
--- linux-2.6.24-rc2-mm1.orig/mm/cpu_alloc.c	2007-11-14 17:46:18.130799764 -0800
+++ linux-2.6.24-rc2-mm1/mm/cpu_alloc.c	2007-11-14 17:59:44.755547045 -0800
@@ -17,6 +17,12 @@
 #include
 #include
 #include
+#include
+#include
+#include			/* i386 definition of init_mm */
+#include			/* i386 dependency on highmem config */
+#include
+#include
 
 /*
  * Basic allocation unit. A bit map is created to track the use of each
@@ -24,7 +30,7 @@
  */
 #define UNIT_SIZE sizeof(int)
-#define UNITS (ALLOC_SIZE / UNIT_SIZE)
+#define UNITS_PER_BLOCK (ALLOC_SIZE / UNIT_SIZE)
 
 /*
  * How many units are needed for an object of a given size
@@ -40,6 +46,249 @@ static int size_to_units(unsigned long s
 static DEFINE_SPINLOCK(cpu_alloc_map_lock);
 static unsigned long units_reserved;	/* Units reserved by boot allocations */
 
+#ifdef CONFIG_CPU_AREA_VIRTUAL
+
+/*
+ * Virtualized cpu area. The cpu area can be extended if more space is needed.
+ */
+
+#define cpu_area ((u8 *)(CPU_AREA_BASE))
+#define ALLOC_SIZE (1UL << (CONFIG_CPU_AREA_ALLOC_ORDER + PAGE_SHIFT))
+#define BOOT_ALLOC (1 << __GFP_BITS_SHIFT)
+
+
+/*
+ * The maximum number of blocks is the maximum size of the
+ * cpu area for one processor divided by the size of an allocation
+ * block.
+ */
+#define MAX_BLOCKS (1UL << (CONFIG_CPU_AREA_ORDER - \
+			CONFIG_CPU_AREA_ALLOC_ORDER))
+
+
+static unsigned long *cpu_alloc_map = NULL;
+static int cpu_alloc_map_order = -1;	/* Size of the bitmap in page order */
+static unsigned long active_blocks;	/* Number of blocks allocated on each cpu */
+static unsigned long units_total;	/* Total units that are managed */
+/*
+ * Allocate a block of memory to be used to provide cpu area memory
+ * or to extend the bitmap for the cpu map.
+ */
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node)
+{
+	if (!(flags & BOOT_ALLOC)) {
+		struct page *page = alloc_pages_node(node,
+				flags, get_order(size));
+
+		if (page)
+			return page_address(page);
+		return NULL;
+	} else
+		return __alloc_bootmem_node(NODE_DATA(node), size, size,
+				__pa(MAX_DMA_ADDRESS));
+}
+
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+	if (pte_none(*pte)) {
+		pte_t entry;
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return 0;
+		entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
+		set_pte_at(&init_mm, addr, pte, entry);
+	}
+	return pte;
+}
+
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pmd_t *pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return 0;
+		pmd_populate_kernel(&init_mm, pmd, p);
+	}
+	return pmd;
+}
+
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+						gfp_t flags, int node)
+{
+	pud_t *pud = pud_offset(pgd, addr);
+	if (pud_none(*pud)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return 0;
+		pud_populate(&init_mm, pud, p);
+	}
+	return pud;
+}
+
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node)
+{
+	pgd_t *pgd = pgd_offset_k(addr);
+	if (pgd_none(*pgd)) {
+		void *p = cpu_area_alloc_block(PAGE_SIZE, flags, node);
+		if (!p)
+			return 0;
+		pgd_populate(&init_mm, pgd, p);
+	}
+	return pgd;
+}
+
+int cpu_area_populate_basepages(void *start, unsigned long size,
+						gfp_t flags, int node)
+{
+	unsigned long addr = (unsigned long)start;
+	unsigned long end = addr + size;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	for (; addr < end; addr += PAGE_SIZE) {
+		pgd = cpu_area_pgd_populate(addr, flags, node);
+		if (!pgd)
+			return -ENOMEM;
+		pud = cpu_area_pud_populate(pgd, addr, flags, node);
+		if (!pud)
+			return -ENOMEM;
+		pmd = cpu_area_pmd_populate(pud, addr, flags, node);
+		if (!pmd)
+			return -ENOMEM;
+		pte = cpu_area_pte_populate(pmd, addr, flags, node);
+		if (!pte)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+/*
+ * If no other population function is defined then this function will stand
+ * in and provide the capability to map PAGE_SIZE pages into the cpu area.
+ */
+int __attribute__((weak)) cpu_area_populate(void *start, unsigned long size,
+						gfp_t flags, int node)
+{
+	return cpu_area_populate_basepages(start, size, flags, node);
+}
+
+/*
+ * Extend the areas on all processors. This function may be called repeatedly
+ * until we have enough space to accommodate a newly allocated object.
+ *
+ * Must hold the cpu_alloc_map_lock on entry. Will drop the lock and then
+ * regain it.
+ */
+static int expand_cpu_area(gfp_t flags)
+{
+	unsigned long blocks = active_blocks;
+	unsigned long bits;
+	int cpu;
+	int err = -ENOMEM;
+	int map_order;
+	unsigned long *new_map = NULL;
+	void *start;
+
+	if (active_blocks == MAX_BLOCKS)
+		goto out;
+
+	spin_unlock(&cpu_alloc_map_lock);
+	if (flags & __GFP_WAIT)
+		local_irq_enable();
+
+	/*
+	 * Determine the size of the bit map needed
+	 */
+	bits = (blocks + 1) * UNITS_PER_BLOCK - units_reserved;
+
+	map_order = get_order(DIV_ROUND_UP(bits, 8));
+	BUG_ON(map_order >= MAX_ORDER);
+	start = cpu_area +
+		(blocks << (PAGE_SHIFT + CONFIG_CPU_AREA_ALLOC_ORDER));
+
+	for_each_possible_cpu(cpu) {
+		err = cpu_area_populate(CPU_PTR(start, cpu), ALLOC_SIZE,
+					flags, cpu_to_node(cpu));
+
+		if (err) {
+			spin_lock(&cpu_alloc_map_lock);
+			goto out;
+		}
+	}
+
+	if (map_order > cpu_alloc_map_order) {
+		new_map = cpu_area_alloc_block(PAGE_SIZE << map_order,
+						flags | __GFP_ZERO, 0);
+		if (!new_map)
+			goto out;
+	}
+
+	if (flags & __GFP_WAIT)
+		local_irq_disable();
+	spin_lock(&cpu_alloc_map_lock);
+
+	/*
+	 * We dropped the lock. Another processor may have already extended
+	 * the cpu area size as needed.
+	 */
+	if (blocks != active_blocks) {
+		if (new_map)
+			free_pages((unsigned long)new_map,
+					map_order);
+		err = 0;
+		goto out;
+	}
+
+	if (new_map) {
+		/*
+		 * Need to extend the bitmap
+		 */
+		if (cpu_alloc_map)
+			memcpy(new_map, cpu_alloc_map,
+				PAGE_SIZE << cpu_alloc_map_order);
+		cpu_alloc_map = new_map;
+		cpu_alloc_map_order = map_order;
+	}
+
+	active_blocks++;
+	units_total += UNITS_PER_BLOCK;
+	err = 0;
+out:
+	return err;
+}
+
+void * __init boot_cpu_alloc(unsigned long size)
+{
+	unsigned long flags;
+	unsigned long x = units_reserved;
+	unsigned long units = size_to_units(size);
+
+	/*
+	 * Locking is really not necessary during boot
+	 * but expand_cpu_area() unlocks and relocks.
+	 * If we do not perform locking here then
+	 *
+	 * 1. The cpu_alloc_map_lock is locked when
+	 *    we exit boot causing a hang on the next cpu_alloc().
+	 * 2. lockdep will get upset if we do not consistently
+	 *    handle things.
+	 */
+	spin_lock_irqsave(&cpu_alloc_map_lock, flags);
+	while (units_reserved + units > units_total)
+		expand_cpu_area(BOOT_ALLOC);
+	units_reserved += units;
+	spin_unlock_irqrestore(&cpu_alloc_map_lock, flags);
+	return cpu_area + x * UNIT_SIZE;
+}
+#else
+
 /*
  * Static configuration. The cpu areas are of a fixed size and
  * cannot be extended. Such configurations are mainly useful on
@@ -51,16 +300,24 @@ static unsigned long units_reserved; /*
 #define ALLOC_SIZE (1UL << (CONFIG_CPU_AREA_ORDER + PAGE_SHIFT))
 static u8 cpu_area[NR_CPUS * ALLOC_SIZE];
 
-static DECLARE_BITMAP(cpu_alloc_map, UNITS);
+static DECLARE_BITMAP(cpu_alloc_map, UNITS_PER_BLOCK);
+#define cpu_alloc_map_order CONFIG_CPU_AREA_ORDER
+#define units_total UNITS_PER_BLOCK
+
+static inline int expand_cpu_area(gfp_t flags)
+{
+	return -ENOSYS;
+}
 
 void * __init boot_cpu_alloc(unsigned long size)
 {
 	unsigned long x = units_reserved;
 
 	units_reserved += size_to_units(size);
-	BUG_ON(units_reserved > UNITS);
+	BUG_ON(units_reserved > units_total);
 	return cpu_area + x * UNIT_SIZE;
 }
+#endif
 
 static int first_free;		/* First known free unit */
 
@@ -98,20 +355,30 @@ void *cpu_alloc(unsigned long size, gfp_
 	int units = size_to_units(size);
 	void *ptr;
 	int first;
+	unsigned long map_size;
 	unsigned long flags;
 
 	BUG_ON(gfpflags & ~(GFP_RECLAIM_MASK | __GFP_ZERO));
 
 	spin_lock_irqsave(&cpu_alloc_map_lock, flags);
+restart:
+	if (cpu_alloc_map_order >= 0)
+		map_size = PAGE_SIZE << cpu_alloc_map_order;
+	else
+		map_size = 0;
+
 	first = 1;
 	start = first_free;
 
 	for ( ; ; ) {
-		start = find_next_zero_bit(cpu_alloc_map, ALLOC_SIZE, start);
-		if (start >= UNITS - units_reserved)
+		start = find_next_zero_bit(cpu_alloc_map, map_size, start);
+		if (start >= units_total - units_reserved) {
+			if (!expand_cpu_area(gfpflags))
+				goto restart;
 			goto out_of_memory;
+		}
 
 		if (first)
 			first_free = start;
@@ -121,7 +388,7 @@ void *cpu_alloc(unsigned long size, gfp_
 		 * the starting unit.
 		 */
 		if ((start + units_reserved) % (align / UNIT_SIZE) == 0 &&
-			find_next_bit(cpu_alloc_map, ALLOC_SIZE, start + 1)
+			find_next_bit(cpu_alloc_map, map_size, start + 1)
							>= start + units)
 			break;
 		start++;
@@ -131,8 +398,10 @@ void *cpu_alloc(unsigned long size, gfp_
 	if (first)
 		first_free = start + units;
 
-	if (start + units > UNITS - units_reserved)
-		goto out_of_memory;
+	while (start + units > units_total - units_reserved) {
+		if (expand_cpu_area(gfpflags))
+			goto out_of_memory;
+	}
 
 	set_map(start, units);
 	__count_vm_events(CPU_BYTES, units * UNIT_SIZE);
@@ -170,7 +439,7 @@ void cpu_free(void *start, unsigned long
 	BUG_ON(p < (cpu_area + units_reserved * UNIT_SIZE));
 	index = (p - cpu_area) / UNIT_SIZE - units_reserved;
 	BUG_ON(!test_bit(index, cpu_alloc_map) ||
-		index >= UNITS - units_reserved);
+		index >= units_total - units_reserved);
 
 	spin_lock_irqsave(&cpu_alloc_map_lock, flags);
 
Index: linux-2.6.24-rc2-mm1/include/linux/mm.h
===================================================================
--- linux-2.6.24-rc2-mm1.orig/include/linux/mm.h	2007-11-14 17:46:18.158801481 -0800
+++ linux-2.6.24-rc2-mm1/include/linux/mm.h	2007-11-14 17:52:27.356047054 -0800
@@ -1185,5 +1185,18 @@ int vmemmap_populate_basepages(struct pa
 			unsigned long pages, int node);
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 
+pgd_t *cpu_area_pgd_populate(unsigned long addr, gfp_t flags, int node);
+pud_t *cpu_area_pud_populate(pgd_t *pgd, unsigned long addr,
+						gfp_t flags, int node);
+pmd_t *cpu_area_pmd_populate(pud_t *pud, unsigned long addr,
+						gfp_t flags, int node);
+pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
+						gfp_t flags, int node);
+void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node);
+int cpu_area_populate_basepages(void *start, unsigned long size,
+						gfp_t flags, int node);
+int cpu_area_populate(void *start, unsigned long size,
+						gfp_t flags, int node);
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
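
Note: cpu_area_populate() is only declared above; mm/cpu_alloc.c provides a
weak default that simply calls cpu_area_populate_basepages(). As an
illustrative sketch (not part of this patch), an architecture-specific
override built from the helpers exported here could look like this:

int cpu_area_populate(void *start, unsigned long size,
				gfp_t flags, int node)
{
	/* Fall back to mapping the cpu area with PAGE_SIZE ptes. */
	return cpu_area_populate_basepages(start, size, flags, node);
}

A real override would more likely map the area with larger pages, much as
the arch-specific vmemmap_populate() implementations do.
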
Index: linux-2.6.24-rc2-mm1/mm/Kconfig
===================================================================
--- linux-2.6.24-rc2-mm1.orig/mm/Kconfig	2007-11-14 17:46:18.146800745 -0800
+++ linux-2.6.24-rc2-mm1/mm/Kconfig	2007-11-14 17:52:27.408047053 -0800
@@ -197,7 +197,13 @@ config VIRT_TO_BUS
 
 config CPU_AREA_ORDER
 	int "Maximum size (order) of CPU area"
-	default "0"
+	default "10" if CPU_AREA_VIRTUAL
+	default "0" if !CPU_AREA_VIRTUAL
 	help
 	  Sets the maximum amount of memory that can be allocated via cpu_alloc
-	  The size is set in page order, so 0 = PAGE_SIZE, 1 = PAGE_SIZE << 1 etc.
+	  The size is set in page order. The size set (times the maximum
+	  number of processors) determines the amount of virtual memory that
+	  is set aside for the per cpu areas if cpu areas are virtualized, or
+	  the amount of memory allocated in the bss segment for non-virtualized
+	  cpu areas.
+
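
For illustration only, here is a hedged sketch of a caller of the allocator
extended by this patch. cpu_alloc(), cpu_free() and CPU_PTR() come from the
cpu_alloc series (CPU_PTR() is used by expand_cpu_area() above); the
three-argument cpu_alloc() prototype with an alignment parameter is inferred
from the align handling in cpu_alloc(), and the demo_stat structure and
functions are invented for the example.

struct demo_stat {
	unsigned long events;
};

static struct demo_stat *demo_stats;

static int __init demo_init(void)
{
	int cpu;

	/* One demo_stat instance per possible cpu, zeroed on allocation. */
	demo_stats = cpu_alloc(sizeof(struct demo_stat),
				GFP_KERNEL | __GFP_ZERO,
				__alignof__(struct demo_stat));
	if (!demo_stats)
		return -ENOMEM;

	/* Each cpu's instance is reached through CPU_PTR(). */
	for_each_possible_cpu(cpu)
		CPU_PTR(demo_stats, cpu)->events = 0;
	return 0;
}

static void demo_exit(void)
{
	cpu_free(demo_stats, sizeof(struct demo_stat));
}

With the new defaults (CPU_AREA_VIRTUAL and CPU_AREA_ORDER 10), each
processor gets 2^10 pages, i.e. 4 MB of virtual space with 4 KB pages, and
expand_cpu_area() backs that space with real memory one ALLOC_SIZE block at
a time only as allocations require it.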