From apw@shadowen.org Tue May 22 15:48:19 2007
Date: Wed, 23 May 2007 00:01:03 +0100
From: Andy Whitcroft
To: linux-mm@kvack.org
Cc: linux-arch@vger.kernel.org, Nick Piggin, Christoph Lameter, Mel Gorman,
	Andy Whitcroft
Subject: IA64: SPARSEMEM_VMEMMAP 4M page size support

This implements 4M page sized vmemmap support for IA64.  This is
important because the traditional vmemmap on IA64 uses the base page
size for its TLB mappings.  For a typical 8GB node on IA64 we need about
2^(33 - 14 + 6) = 2^25 bytes = 32MB of page structs.  Mapped with the
base page size we end up with 2^(25 - 14) = 2^11 = 2048 page table
entries.  This patch reduces that to eight 4MB TLB entries, a factor of
256 fewer TLB entries for the virtual memory map.

We modify the alt_dtlb_miss handler to branch to a vmemmap TLB lookup
function if bit 60 is set.  The vmemmap will start with 0xF000xxx, so it
is going to be very distinctive in dumps and can easily be distinguished
from 0xE000xxx (kernel 1-1 area) and 0xA000xxx (kernel text, data and
vmalloc).

We use a two level page table to do lookups for the vmemmap TLBs.  We
want to cover 8 petabytes, so we need to handle at least 53 address
bits.  22 bits (4MB) are covered by a vmemmap block and we need roughly
5 bits for a page struct:

	53 - 14 - 22 + 5 = 22

so we need about 2^22 block entries.  If we also use PAGE_SIZE pages for
the first level page table, then 22 - 11 = 11 bits are left for the top
level page, i.e. 2^11 eight byte entries = 16k.  This is the base page
size, so it is very convenient.

The fault handler only has to do two lookups, in contrast to four for
the current vmalloc/vmemmap implementation.  The 16k implementation does
have the advantage that it can use the VHPT walker, though.

[apw@shadowen.org: style fixups]
From: Christoph Lameter
Signed-off-by: Christoph Lameter
Signed-off-by: Andy Whitcroft
Acked-by: Mel Gorman
---

Index: linux-2.6.22-rc2/arch/ia64/Kconfig
===================================================================
--- linux-2.6.22-rc2.orig/arch/ia64/Kconfig	2007-05-22 23:09:35.000000000 -0700
+++ linux-2.6.22-rc2/arch/ia64/Kconfig	2007-05-22 23:09:54.000000000 -0700
@@ -359,6 +359,16 @@ config SPARSEMEM_VMEMMAP
 	def_bool y
 	depends on SPARSEMEM
 
+config ARCH_POPULATES_SPARSEMEM_VMEMMAP
+	bool "Use 4M pages for the virtual memory map"
+	default n
+	depends on SPARSEMEM_VMEMMAP
+	help
+	  Enables large page virtual memmap support.  Each virtual memmap
+	  page will be 4MB in size.  Such a vmemmap block can cover 1GB
+	  of memory.  If nodes are smaller than 1GB then we may have overlap
+	  issues and the page structs may end up on the wrong nodes.
+
 config ARCH_DISCONTIGMEM_DEFAULT
 	def_bool y if (IA64_SGI_SN2 || IA64_GENERIC || IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB)
 	depends on ARCH_DISCONTIGMEM_ENABLE
Index: linux-2.6.22-rc2/arch/ia64/kernel/ivt.S
===================================================================
--- linux-2.6.22-rc2.orig/arch/ia64/kernel/ivt.S	2007-05-22 23:09:35.000000000 -0700
+++ linux-2.6.22-rc2/arch/ia64/kernel/ivt.S	2007-05-22 23:09:54.000000000 -0700
@@ -9,6 +9,8 @@
  *	Suresh Siddha
  *	Kenneth Chen
  *	Fenghua Yu
+ * Copyright (C) 2007 Silicon Graphics Inc.
+ *	Christoph Lameter
  *
  * 00/08/23	Asit Mallick	TLB handling for SMP
  * 00/12/20	David Mosberger-Tang	DTLB/ITLB handler now uses virtual PT.
@@ -391,9 +393,11 @@ ENTRY(alt_dtlb_miss)
 	tbit.z p12,p0=r16,61			// access to region 6?
 	mov r25=PERCPU_PAGE_SHIFT << 2
 	mov r26=PERCPU_PAGE_SIZE
-	nop.m 0
-	nop.b 0
+	tbit.nz p6,p0=r16,53			// Access to VMEMMAP?
+(p6)	br.cond.dptk do_vmemmap
 	;;
+dtlb_continue:
+	.pred.rel "mutex", p11, p10
 (p10)	mov r19=IA64_KR(PER_CPU_DATA)
 (p11)	and r19=r19,r16				// clear non-ppn fields
 	extr.u r23=r21,IA64_PSR_CPL0_BIT,2	// extract psr.cpl
@@ -416,6 +420,39 @@ ENTRY(alt_dtlb_miss)
 (p7)	itc.d r19				// insert the TLB entry
 	mov pr=r31,-1
 	rfi
+
+do_vmemmap:
+	//
+	// VMEMMAP_SIZE lookup via vmemmap_table for
+	// the virtual memory map.
+	//
+	tbit.nz p6,p0=r16,54		// more top bits set?
+(p6)	br.cond.spnt dtlb_continue	// then it's mmu bootstrap
+	;;
+	rsm psr.dt			// switch to using physical addressing
+	extr.u r25=r16, VMEMMAP_SHIFT + PAGE_SHIFT - 3, PAGE_SHIFT - 3
+	extr.u r19=r16, VMEMMAP_SHIFT, PAGE_SHIFT - 3
+	;;
+	srlz.d
+	LOAD_PHYSICAL(p0, r26, vmemmap_l1)
+	shl r25=r25,3
+	;;
+	add r26=r26,r25			// Index into vmemmap table
+	shl r19=r19,3
+	;;
+	ld8 r25=[r26]			// Get address of l1
+	;;
+	add r25=r25,r19			// Index into l1
+	;;
+	ld8 r25=[r25]
+	;;
+	cmp.eq p6,p0=r25,r0		// Valid?
+(p6)	br.cond.spnt page_fault		// Page not present
+	or r19=r25,r17			// insert PTE control bits into r19
+	;;
+	itc.d r19			// insert the TLB entry
+	mov pr=r31,-1
+	rfi
 END(alt_dtlb_miss)
 
 	.org ia64_ivt+0x1400
Index: linux-2.6.22-rc2/arch/ia64/mm/discontig.c
===================================================================
--- linux-2.6.22-rc2.orig/arch/ia64/mm/discontig.c	2007-05-22 23:09:35.000000000 -0700
+++ linux-2.6.22-rc2/arch/ia64/mm/discontig.c	2007-05-22 23:09:54.000000000 -0700
@@ -8,6 +8,8 @@
  *	Russ Anderson
  *	Jesse Barnes
  *	Jack Steiner
+ * Copyright (C) 2007 sgi
+ *	Christoph Lameter
  */
 
 /*
@@ -44,6 +46,90 @@ struct early_node_data {
 	unsigned long max_pfn;
 };
 
+#ifdef CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP
+/*
+ * The vmemmap_table contains the number of the 4M page used to map
+ * that section of the virtual memmap.
+ *
+ * We support 53 address bits, 14 bits are used for the page size.  This
+ * leaves 39 bits (512G) for the pfn.  Using page structs the memmap is
+ * going to take up a bit less than 16TB of virtual space.
+ *
+ * We are mapping these 16TB using a 4M granule size, which makes us end
+ * up with a bit less than 4 million entries.  These are in turn in 2048
+ * groups in 16k pages.  The top level page also has 2048 entries.
+ */
+
+#define VMEMMAP_SIZE	ALIGN((1UL << (MAX_PHYSMEM_BITS - PAGE_SHIFT)) \
+				* sizeof(struct page), VMEMMAP_BLOCK_SIZE)
+
+/*
+ * Each vmemmap_table entry contains the physical address of an l1 16k
+ * page that in turn contains the physical addresses of a VMEMMAP_SIZE
+ * block.
+ * If a pointer is NULL then that entry is not populated yet.
+ */
+unsigned long vmemmap_l1[VMEMMAP_SIZE >> (VMEMMAP_SHIFT + PAGE_SHIFT - 3)];
+
+int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node)
+{
+	unsigned long phys_start = __pa(start) & ~VMEMMAP_FLAG;
+	unsigned long phys_end = __pa(start + nr) & ~VMEMMAP_FLAG;
+	unsigned long addr = phys_start & ~(VMEMMAP_BLOCK_SIZE - 1);
+	unsigned long end = ALIGN(phys_end, VMEMMAP_BLOCK_SIZE);
+
+	for (; addr < end; addr += VMEMMAP_BLOCK_SIZE) {
+		unsigned int index_l2 =
+			(addr >> VMEMMAP_SHIFT) & ((1 << (PAGE_SHIFT - 3)) - 1);
+
+		unsigned int index_l1 =
+			addr >> (VMEMMAP_SHIFT + PAGE_SHIFT - 3);
+
+		unsigned long *p1 = &vmemmap_l1[index_l1];
+		unsigned long *p2;
+
+		if (*p1)
+			p2 = (unsigned long *)__va(*p1);
+		else {
+			/* Level 2 block allocation */
+			p2 = vmemmap_alloc_block(PAGE_SIZE, node);
+			if (!p2)
+				return -ENOMEM;
+
+			*p1 = __pa(p2);
+		}
+		p2 += index_l2;
+
+		if (*p2) {
+			int actual_node;
+
+			actual_node = early_pfn_to_nid(*p2 >> PAGE_SHIFT);
+			if (actual_node != node)
+				printk(KERN_WARNING "Virtual memory segments "
+					"on node %d instead of %d\n",
					actual_node, node);
+		} else {
+			void *block =
+				vmemmap_alloc_block(VMEMMAP_BLOCK_SIZE, node);
+
+			if (!block)
+				return -ENOMEM;
+
+			*p2 = __pa(block);
+
+			printk(KERN_INFO "[%p-%p] page_structs=%lu "
+				"node=%d entry=%lu/%lu\n", start, block, nr,
+				node, addr >> VMEMMAP_SHIFT,
+				VMEMMAP_SIZE >> VMEMMAP_SHIFT);
+		}
+	}
+	return 0;
+}
+#else
+/* Satisfy reference in arch/ia64/kernel/ivt.S */
+unsigned long vmemmap_l1[0];
+#endif
+
 static struct early_node_data mem_data[MAX_NUMNODES] __initdata;
 static nodemask_t memory_less_mask __initdata;
Index: linux-2.6.22-rc2/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.22-rc2.orig/include/asm-ia64/pgtable.h	2007-05-22 23:09:35.000000000 -0700
+++ linux-2.6.22-rc2/include/asm-ia64/pgtable.h	2007-05-22 23:09:54.000000000 -0700
@@ -147,6 +147,10 @@
 #define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
 
+#define VMEMMAP_FLAG		(1UL << 53)
+#define VMEMMAP_SHIFT		22
+#define VMEMMAP_BLOCK_SIZE	(1UL << VMEMMAP_SHIFT)
+
 # ifndef __ASSEMBLY__
 
 #include 	/* for mm_struct */
@@ -236,7 +240,8 @@ ia64_phys_addr_valid (unsigned long addr
 # define VMALLOC_END		vmalloc_end
   extern unsigned long vmalloc_end;
 #else
-#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP)
+#if defined(CONFIG_SPARSEMEM) && defined(CONFIG_SPARSEMEM_VMEMMAP) && \
+	!defined(CONFIG_ARCH_POPULATES_SPARSEMEM_VMEMMAP)
 /* SPARSEMEM_VMEMMAP uses half of vmalloc... */
 # define VMALLOC_END		(RGN_BASE(RGN_GATE) + (1UL << (4*PAGE_SHIFT - 10)))
 # define vmemmap		((struct page *)VMALLOC_END)
@@ -559,6 +564,7 @@ do { \
 # endif /* CONFIG_VIRTUAL_MEM_MAP */
 # endif /* !__ASSEMBLY__ */
 
+#define vmemmap		((struct page *)(RGN_BASE(RGN_KERNEL) | VMEMMAP_FLAG))
 /*
  * Identity-mapped regions use a large page size.  We'll call such large pages
  * "granules".  If you can think of a better name that's unambiguous, let me
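
Not part of the patch, but perhaps useful while reviewing: a small userspace
sketch of the arithmetic described in the changelog, under the same
assumptions it uses (16k base pages, 4M vmemmap blocks, 8-byte table entries,
a struct page of roughly 64 bytes).  The names below are illustrative only
and are not the kernel symbols.

#include <stdio.h>

#define PAGE_SHIFT	14				/* 16k base pages */
#define VMEMMAP_SHIFT	22				/* 4M vmemmap blocks */
#define PTRS_PER_PAGE	(1UL << (PAGE_SHIFT - 3))	/* 8-byte entries per 16k page */

int main(void)
{
	unsigned long node_bytes = 1UL << 33;		/* example: one 8GB node */
	/* memmap size for that node, assuming ~64 byte (2^6) page structs */
	unsigned long memmap_bytes = (node_bytes >> PAGE_SHIFT) << 6;
	unsigned long offset = 123UL << 20;		/* arbitrary offset into the memmap */
	unsigned long index_l2 = (offset >> VMEMMAP_SHIFT) & (PTRS_PER_PAGE - 1);
	unsigned long index_l1 = offset >> (VMEMMAP_SHIFT + PAGE_SHIFT - 3);

	printf("memmap for 8GB node: %lu MB\n", memmap_bytes >> 20);
	printf("16k TLB entries:     %lu\n", memmap_bytes >> PAGE_SHIFT);
	printf("4M TLB entries:      %lu\n", memmap_bytes >> VMEMMAP_SHIFT);
	printf("offset %#lx -> index_l1 %lu, index_l2 %lu\n",
		offset, index_l1, index_l2);
	return 0;
}

Built with a plain "gcc -Wall", it prints the 32MB / 2048-entry / 8-entry
figures quoted in the changelog and shows how an offset into the memmap
splits into the two table indices used by the lookup.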