Move virtual memory map into variable page size area

Using higher order pages reduces TLB pressure. This patch moves the
virtual memory map into an area with a larger page size, thereby
reducing the number of TLB entries required to access the memmap array.

The default is to use a 1 Megabyte page size for the memmap. With 1 MB
of page structs we can map 16k pages, which is 256 megabytes. So if we
have 4 GB of RAM per node (hopefully properly aligned) then we can map
the complete memmap of a node with one 16 MB huge page, using only a
single TLB entry instead of the 1024 needed right now.

Signed-off-by: Christoph Lameter

Index: linux-2.6.18-mm3/arch/ia64/mm/init.c
===================================================================
--- linux-2.6.18-mm3.orig/arch/ia64/mm/init.c	2006-10-07 20:02:56.625629747 -0700
+++ linux-2.6.18-mm3/arch/ia64/mm/init.c	2006-10-07 21:02:02.363858872 -0700
@@ -463,6 +463,17 @@ retry_pte:
 	return hole_next_pfn - pgdat->node_start_pfn;
 }
 
+#ifdef CONFIG_VIRTUAL_MEM_MAP_HUGE
+#define VMEM_MAP_PAGE_SIZE	(1UL << hpage_shift)
+#else
+#define VMEM_MAP_PAGE_SIZE	PAGE_SIZE
+#endif
+
+static void * __init alloc_vmem_page(int node, unsigned long size)
+{
+	return __alloc_bootmem_node(NODE_DATA(node), size, size, __pa(MAX_DMA_ADDRESS));
+}
+
 int __init
 create_mem_map_page_table (u64 start, u64 end, void *arg)
 {
@@ -477,27 +488,41 @@ create_mem_map_page_table (u64 start, u6
 	map_start = virt_to_page(start);
 	map_end   = virt_to_page(end);
 
-	start_page = (unsigned long) map_start & PAGE_MASK;
-	end_page = PAGE_ALIGN((unsigned long) map_end);
+	start_page = (unsigned long) map_start & ~(VMEM_MAP_PAGE_SIZE - 1);
+	end_page = ALIGN((unsigned long) map_end, VMEM_MAP_PAGE_SIZE);
 	node = paddr_to_nid(__pa(start));
 
-	for (address = start_page; address < end_page; address += PAGE_SIZE) {
-		pgd = pgd_offset_k(address);
+	for (address = start_page; address < end_page; address += VMEM_MAP_PAGE_SIZE) {
+#ifdef CONFIG_VIRTUAL_MEM_MAP_HUGE
+		unsigned long taddr = htlbpage_to_page(address);
+		/* Keep region so that lookups can properly occur */
+		pgd = pgd_offset(&init_mm, taddr);
+#else
+		unsigned long taddr = address;
+		pgd = pgd_offset_k(taddr);
+#endif
 		if (pgd_none(*pgd))
-			pgd_populate(&init_mm, pgd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
-		pud = pud_offset(pgd, address);
+			pgd_populate(&init_mm, pgd, alloc_vmem_page(node, PAGE_SIZE));
+		pud = pud_offset(pgd, taddr);
 
 		if (pud_none(*pud))
-			pud_populate(&init_mm, pud, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
-		pmd = pmd_offset(pud, address);
+			pud_populate(&init_mm, pud, alloc_vmem_page(node, PAGE_SIZE));
+		pmd = pmd_offset(pud, taddr);
 
 		if (pmd_none(*pmd))
-			pmd_populate_kernel(&init_mm, pmd, alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE));
-		pte = pte_offset_kernel(pmd, address);
+			pmd_populate_kernel(&init_mm, pmd, alloc_vmem_page(node, PAGE_SIZE));
+		pte = pte_offset_kernel(pmd, taddr);
+
+		if (pte_none(*pte)) {
+			unsigned long addr;
 
-		if (pte_none(*pte))
-			set_pte(pte, pfn_pte(__pa(alloc_bootmem_pages_node(NODE_DATA(node), PAGE_SIZE)) >> PAGE_SHIFT,
-				PAGE_KERNEL));
+			addr = __pa(alloc_vmem_page(node, VMEM_MAP_PAGE_SIZE));
+			set_pte(pte, mk_pte_phys(addr, PAGE_KERNEL));
+			printk(KERN_CRIT "Virtual mmap range %lx-%lx page @%lx:%lx:%lx pte=%lx size=%lu node=%d\n", start, end, address, taddr, addr, pte_val(*pte), VMEM_MAP_PAGE_SIZE, node);
+		}
+		else
+			printk(KERN_CRIT "Virtual mmap %lx-%lx @%lx node %d already present.\n",
+				start, end, address, node);
 	}
 	return 0;
 }
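For reference, here is a quick back-of-the-envelope check of the coverage figures
quoted in the changelog. This is a standalone user-space sketch, not kernel code;
it assumes a 64-byte struct page (the 1 << STRUCT_PAGE_ORDER bound used in page.h
below) and the ia64 default 16 KB base page size.

/*
 * User-space sketch of the memmap coverage arithmetic.
 * Assumptions: 64-byte struct page, 16 KB base pages, 64-bit unsigned long.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long struct_page_size = 1UL << 6;	/* 1 << STRUCT_PAGE_ORDER */
	const unsigned long base_page_size = 16UL << 10;	/* default ia64 PAGE_SIZE */
	const unsigned long vmem_page_sizes[] = {
		16UL << 10,	/* base pages, the !VIRTUAL_MEM_MAP_HUGE case */
		1UL << 20,	/* 1 MB, HPAGE_SHIFT_DEFAULT of 20 */
		16UL << 20,	/* 16 MB, the example from the changelog */
	};
	const unsigned long node_mem = 4UL << 30;		/* example: 4 GB node */
	unsigned long memmap = node_mem / base_page_size * struct_page_size;
	int i;

	for (i = 0; i < 3; i++) {
		unsigned long structs = vmem_page_sizes[i] / struct_page_size;
		unsigned long covered = structs * base_page_size;

		printf("%8lu KB memmap page -> %7lu struct pages -> %5lu MB of memory covered\n",
			vmem_page_sizes[i] >> 10, structs, covered >> 20);
	}

	/* The changelog example: a 4 GB node has a 16 MB memmap. */
	printf("4 GB node: memmap = %lu MB = %lu base-page mappings or %lu x 16 MB huge mappings\n",
		memmap >> 20, memmap / base_page_size, memmap / (16UL << 20));
	return 0;
}

The last line reproduces the changelog example: the 16 MB memmap of a 4 GB node
needs 1024 base-page mappings but only a single 16 MB huge mapping.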
Index: linux-2.6.18-mm3/include/asm-ia64/page.h
===================================================================
--- linux-2.6.18-mm3.orig/include/asm-ia64/page.h	2006-10-07 20:02:56.622700240 -0700
+++ linux-2.6.18-mm3/include/asm-ia64/page.h	2006-10-07 21:02:02.984914299 -0700
@@ -53,7 +53,11 @@
 #ifdef CONFIG_HUGETLB_PAGE
 # define HPAGE_REGION_BASE	RGN_BASE(RGN_HPAGE)
 # define HPAGE_SHIFT	hpage_shift
+#ifdef CONFIG_VIRTUAL_MEM_MAP_HUGE
+# define HPAGE_SHIFT_DEFAULT	20	/* Reduce memory overhead for virtual mem_map */
+#else
 # define HPAGE_SHIFT_DEFAULT	28	/* check ia64 SDM for architecture supported size */
+#endif
 # define HPAGE_SIZE	(__IA64_UL_CONST(1) << HPAGE_SHIFT)
 # define HPAGE_MASK	(~(HPAGE_SIZE - 1))
 
@@ -99,23 +103,40 @@ do { \
 
 #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
 
+#ifdef CONFIG_VIRTUAL_MEM_MAP
 /*
  * STRUCT_PAGE_ORDER is needed to approximate the size of struct page
  * that is unknown at this point. struct page must be smaller than
  * 1 << STRUCT_PAGE_ORDER.
  */
 #define STRUCT_PAGE_ORDER 6
-
-#define VIRTUAL_MEM_MAP		(RGN_BASE(RGN_GATE) + 0x200000000UL)
 #define VIRTUAL_MEM_MAP_SIZE	(1UL << (IA64_MAX_PHYS_BITS - PAGE_SHIFT +\
 					STRUCT_PAGE_ORDER))
+#ifdef CONFIG_VIRTUAL_MEM_MAP_HUGE
+/*
+ * Use huge pages for the virtual memory map. Since we have a separate
+ * huge page region we can use the whole range and leave VMALLOC
+ * untouched.
+ */
+#define VIRTUAL_MEM_MAP_REGION	RGN_HPAGE
+#define VIRTUAL_MEM_MAP		RGN_BASE(VIRTUAL_MEM_MAP_REGION)
+#define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
+
+#else
+/*
+ * Place the virtual memory map in the VMALLOC area, reducing the
+ * available address space of 128 TB by 8 TB.
+ */
+#define VIRTUAL_MEM_MAP_REGION	RGN_GATE
+#define VIRTUAL_MEM_MAP		(RGN_BASE(VIRTUAL_MEM_MAP_REGION) + 0x200000000UL)
 #define VMALLOC_START		(VIRTUAL_MEM_MAP + VIRTUAL_MEM_MAP_SIZE)
+#endif
+
 #define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 9)))
 
 #define virt_addr_valid(kaddr)	pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
 
-#ifdef CONFIG_VIRTUAL_MEM_MAP
 extern int ia64_pfn_valid (unsigned long pfn);
 #elif defined(CONFIG_FLATMEM)
 # define ia64_pfn_valid(pfn)	1
Index: linux-2.6.18-mm3/arch/ia64/mm/fault.c
===================================================================
--- linux-2.6.18-mm3.orig/arch/ia64/mm/fault.c	2006-10-07 16:07:34.001025348 -0700
+++ linux-2.6.18-mm3/arch/ia64/mm/fault.c	2006-10-07 21:02:02.364835375 -0700
@@ -65,6 +65,12 @@ mapped_kernel_page_is_present (unsigned
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 
+#ifdef CONFIG_VIRTUAL_MEM_MAP_HUGE
+	if (REGION_NUMBER(address) == RGN_HPAGE && address >= VIRTUAL_MEM_MAP) {
+		address = htlbpage_to_page(address);
+		pgd = pgd_offset(&init_mm, address);
+	} else
+#endif
 	pgd = pgd_offset_k(address);
 	if (pgd_none(*pgd) || pgd_bad(*pgd))
 		return 0;
@@ -105,13 +111,14 @@ ia64_do_page_fault (unsigned long addres
 
 #ifdef CONFIG_VIRTUAL_MEM_MAP
 	/*
-	 * If fault is in region 5 and we are in the kernel, we may already
-	 * have the mmap_sem (pfn_valid macro is called during mmap). There
-	 * is no vma for region 5 addr's anyway, so skip getting the semaphore
-	 * and go directly to the exception handling code.
+	 * If fault is in the VIRTUAL_MEM_MAP region and we are in the kernel,
+	 * we may already have the mmap_sem (pfn_valid macro is called during
+	 * mmap). There is no vma for the VIRTUAL_MEM_MAP region anyway, so skip
+	 * getting the semaphore and go directly to the exception handling
+	 * code.
 	 */
-	if ((REGION_NUMBER(address) == 5) && !user_mode(regs))
+	if (REGION_NUMBER(address) == RGN_GATE && !user_mode(regs))
 		goto bad_area_no_up;
 #endif
 
@@ -256,8 +263,10 @@ ia64_do_page_fault (unsigned long addres
 	 * translation, which fixed the problem.  So, we check to see if the translation is
 	 * valid, and return if it is.
 	 */
-	if (REGION_NUMBER(address) == 5 && mapped_kernel_page_is_present(address))
-		return;
+	if ((REGION_NUMBER(address) == RGN_GATE ||
+	     REGION_NUMBER(address) == VIRTUAL_MEM_MAP_REGION) &&
+	    mapped_kernel_page_is_present(address))
+		return;
 
 	if (ia64_done_with_exception(regs))
 		return;
Index: linux-2.6.18-mm3/arch/ia64/Kconfig
===================================================================
--- linux-2.6.18-mm3.orig/arch/ia64/Kconfig	2006-10-07 16:07:33.655343509 -0700
+++ linux-2.6.18-mm3/arch/ia64/Kconfig	2006-10-07 21:02:02.985890801 -0700
@@ -363,8 +363,13 @@ config NODES_SHIFT
 config ARCH_POPULATES_NODE_MAP
 	def_bool y
 
-# VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP are functionally equivalent.
-# VIRTUAL_MEM_MAP has been retained for historical reasons.
+# VIRTUAL_MEM_MAP and FLAT_NODE_MEM_MAP may be functionally equivalent but
+# the overhead of FLAT_NODE_MEM_MAP is much higher. It is even worse for
+# a SPARSEMEM configuration that needs indirections through multiple tables
+# for elementary VM operations.
+#
+# VIRTUAL_MEM_MAP is the best choice for handling large sparse memory maps.
+#
 config VIRTUAL_MEM_MAP
 	bool "Virtual mem map"
 	depends on !SPARSEMEM
@@ -376,6 +381,23 @@ config VIRTUAL_MEM_MAP
 	  require the DISCONTIGMEM option for your machine.  If you are
 	  unsure, say Y.
 
+config VIRTUAL_MEM_MAP_HUGE
+	depends on VIRTUAL_MEM_MAP
+	bool "Virtual mem map uses huge pages"
+	help
+	  By default we map the virtual memory map using the default page
+	  size and take a part of VMALLOC space for the map. This option
+	  makes the virtual memory map use huge pages as a base and moves
+	  the virtual memory map out of the VMALLOC space. This has the
+	  effect of decreasing the number of TLB entries needed to access
+	  the virtual memory map.
+	  The default huge page size is decreased from 256M to 16M in
+	  order to reduce overhead. A 4M huge page can map ~4GB memory.
+	  A 16k page can map ~4 Megabytes of memory.
+	  Note that changing the huge page size via a boot option will
+	  also change the base page size for the virtual memory map.
+	  Excessively large huge page sizes may lead to wasted memory.
+
 config HOLES_IN_ZONE
 	bool
 	default y if VIRTUAL_MEM_MAP
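
To put a rough number on the waste that the last paragraph of the help text warns
about, here is another user-space sketch (same assumptions as before: 64-byte
struct page, 16 KB base pages) of how much of the memmap backing store stays
unused once it is rounded up to the vmemmap page size:

/*
 * User-space sketch of the rounding waste per node memmap.
 * Assumptions: 64-byte struct page, 16 KB base pages, 64-bit unsigned long.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long struct_page_size = 1UL << 6;	/* 1 << STRUCT_PAGE_ORDER */
	const unsigned long base_page_size = 16UL << 10;	/* default ia64 PAGE_SIZE */
	const unsigned long node_mem_mb[] = { 1024, 4096, 65536 };	/* per-node memory */
	const unsigned long vmem_page_sizes[] = { 1UL << 20, 1UL << 24, 1UL << 28 };	/* 1M/16M/256M */
	int i, j;

	for (i = 0; i < 3; i++) {
		unsigned long memmap = (node_mem_mb[i] << 20) / base_page_size * struct_page_size;

		for (j = 0; j < 3; j++) {
			unsigned long vp = vmem_page_sizes[j];
			unsigned long alloc = (memmap + vp - 1) / vp * vp;	/* round up */

			printf("%6lu MB node: memmap %4lu MB, %4lu MB vmemmap pages -> %4lu MB allocated (%4lu MB unused)\n",
				node_mem_mb[i], memmap >> 20, vp >> 20,
				alloc >> 20, (alloc - memmap) >> 20);
		}
	}
	return 0;
}

This is also why the patch lowers HPAGE_SHIFT_DEFAULT when CONFIG_VIRTUAL_MEM_MAP_HUGE
is selected: with a 256 MB huge page, small nodes would waste most of the allocation.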