IA64: Variable Kernel Page size support This patch adds the capability to manage pages of varying sizes for the kernel in region 4. We switch off the VHPT walker for region 4 and then add special processing in the nested_dtlb_miss handler in order to be able to handle page tables and tlbs of various sizes. We allow 7 page sizes 1..7 and encode that in the address above the area used to index into the mapped area. This is typically bit 47-49 (3 level page tables) or (58-60) (4 level page tables). So the address format for 4 level page tables to address pages of varying sizes becomes REGION |PS |PGD |PUD |PMD |PTE |OFFSET For 3 level page tables we need to keep the gap in the proper place to also be able to accommodate Itanium processors that only support 50 virtual address bits. REGION |GAP |PS |PGD |PMD |PTE |OFFSET Supported page sizes: Each page size region supports an address space of 58 bit (4 level) or 47 bit (3 level): Index Shift Size 3 level map size ------------------------------------- 0 hpage ? (preserve existing huge page semantics) 1 18 256k 2 20 1M 3 22 4M 4 24 16M 5 26 64M 6 28 256M 7 30 1GB Separate page tables are kept for each region (we do not ever need the full number of pgd entries since the pages are much larger). The separate 7 page tables are pushed into a regular full sized pgd directory that is segmented into 8 pieces. This modification is performance-wise almost neutral. Changes are not invasive since we use a rarely used handler to implement the variable page sizes. The huge pages can no longer use the VHPT walker but then there is also not a high frequency of huge page TLB misses compared to regular sized pages. 
Signed-off-by: Christoph Lameter <clameter@sgi.com> Index: linux-2.6.19-rc1-mm1/arch/ia64/kernel/ivt.S =================================================================== --- linux-2.6.19-rc1-mm1.orig/arch/ia64/kernel/ivt.S 2006-10-04 21:57:05.000000000 -0500 +++ linux-2.6.19-rc1-mm1/arch/ia64/kernel/ivt.S 2006-10-10 21:19:37.922309776 -0500 @@ -375,7 +375,6 @@ ENTRY(alt_dtlb_miss) mov r21=cr.ipsr mov r31=pr ;; -#ifdef CONFIG_DISABLE_VHPT shr.u r22=r16,61 // get the region number into r21 ;; cmp.gt p8,p0=6,r22 // access to region 0-5 @@ -385,7 +384,6 @@ ENTRY(alt_dtlb_miss) (p8) mov cr.iha=r17 (p8) mov r29=b0 // save b0 (p8) br.cond.dptk dtlb_fault -#endif extr.u r23=r21,IA64_PSR_CPL0_BIT,2 // extract psr.cpl and r22=IA64_ISR_CODE_MASK,r20 // get the isr.code field tbit.nz p6,p7=r20,IA64_ISR_SP_BIT // is speculation bit on? @@ -414,7 +412,9 @@ END(alt_dtlb_miss) // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45) ENTRY(nested_dtlb_miss) /* - * In the absence of kernel bugs, we get here when the virtually mapped linear + * We get here for REGION 4 since the VHPT is off for REGION 4. Special magic + * can be performed here if bits above RGN_MAP_SHIFT are set to work with + * various page sizes. We may also get here if the * page table is accessed non-speculatively (e.g., in the Dirty-bit, Instruction * Access-bit, or Data Access-bit faults). 
If the DTLB entry for the virtual page * table is missing, a nested TLB miss fault is triggered and control is @@ -439,27 +439,83 @@ ENTRY(nested_dtlb_miss) mov r19=IA64_KR(PT_BASE) // get the page table base address shl r21=r16,3 // shift bit 60 into sign bit mov r18=cr.itir - ;; shr.u r17=r16,61 // get the region number into r17 - extr.u r18=r18,2,6 // get the faulting page size + movl r22=(1< + // 1 256k + // 2 1M + // 3 4M + // 4 16M + // 5 64M + // 6 256M + // 7 1GB + ;; + mov r21=IA64_VKP_FIRST_SIZE-2 + extr r22=r16,IA64_RGN_MAP_SHIFT,3 // Get page size bits + srlz.d + LOAD_PHYSICAL(p0, r19, sizes_pg_dir) // And the right pg dir + ;; + dep r19=r22,r19,PAGE_SHIFT-3,3 // Fix up pgd to point to subsection + dep r16=0,r16,IA64_RGN_MAP_SHIFT,3 // Clear page size bits + ;; + shladd r22=r22,1,r21 // KVP_INDEX_TO_SHIFT(...) + ;; + dep r18=r22,r18,2,6 // Fix up itir + ;; + mov cr.itir=r18 // Set new page size + ;; + mov r18=r22 // page size + ;; + add r22=-PAGE_SHIFT,r18 // page order + ;; add r18=PGDIR_SHIFT-PAGE_SHIFT,r18 + shr.u r22=r16,r22 // page number within region ;; - shr.u r22=r16,r22 - shr.u r18=r16,r18 -(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + shr.u r18=r16,r18 // Calculate PGD offset + mov r21=r0 // Make sure test succeeds + ;; + dep r17=r18,r19,3,(PAGE_SHIFT-6) // r17=pgd_offset for region 5 + br.cond.spnt.many nested_dtlb // Continue +kernel_pgd: srlz.d - LOAD_PHYSICAL(p6, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir - + LOAD_PHYSICAL(p0, r19, swapper_pg_dir) // region 5 is rooted at swapper_pg_dir + ;; +default_pagesize: + extr.u r18=r18,2,6 // get the faulting page size + ;; + add r22=-PAGE_SHIFT,r18 // page order + ;; + add r18=PGDIR_SHIFT-PAGE_SHIFT+3,r18 // Compensated shift for r21 + shr.u r22=r16,r22 + ;; + shr.u r18=r21,r18 +(p7) dep r17=r17,r19,(PAGE_SHIFT-3),3 // put region number bits in place + ;; .pred.rel "mutex", p6, p7 (p6) shr.u r21=r21,PGDIR_SHIFT+PAGE_SHIFT (p7) shr.u 
r21=r21,PGDIR_SHIFT+PAGE_SHIFT-3 - ;; (p6) dep r17=r18,r19,3,(PAGE_SHIFT-3) // r17=pgd_offset for region 5 (p7) dep r17=r18,r17,3,(PAGE_SHIFT-6) // r17=pgd_offset for region[0-4] + ;; +nested_dtlb: cmp.eq p7,p6=0,r21 // unused address bits all zeroes? #ifdef CONFIG_PGTABLE_4 shr.u r18=r22,PUD_SHIFT // shift pud index into position Index: linux-2.6.19-rc1-mm1/arch/ia64/kernel/head.S =================================================================== --- linux-2.6.19-rc1-mm1.orig/arch/ia64/kernel/head.S 2006-10-04 21:57:05.000000000 -0500 +++ linux-2.6.19-rc1-mm1/arch/ia64/kernel/head.S 2006-10-10 21:19:21.819349761 -0500 @@ -174,6 +174,15 @@ empty_zero_page: swapper_pg_dir: .skip PAGE_SIZE + // + // Special pg_dir for variable kernel page sizes. The table is + // segmented into 8 sections of equal size that provide the lookups + // for each supported page size. + // + .global sizes_pg_dir +sizes_pg_dir: + .skip PAGE_SIZE + .rodata halt_msg: stringz "Halting kernel\n" Index: linux-2.6.19-rc1-mm1/include/asm-ia64/pgtable.h =================================================================== --- linux-2.6.19-rc1-mm1.orig/include/asm-ia64/pgtable.h 2006-10-04 21:57:05.000000000 -0500 +++ linux-2.6.19-rc1-mm1/include/asm-ia64/pgtable.h 2006-10-10 21:19:37.943796016 -0500 @@ -147,6 +147,37 @@ #define PAGE_KERNEL __pgprot(__DIRTY_BITS | _PAGE_PL_0 | _PAGE_AR_RWX) #define PAGE_KERNELRX __pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX) + +/* + * Definitions to support various sizes of kernel pages + * that can be used to reduce TLB pressure. 
+ * + * Supported large page shifts and sizes are: + * + * 18(256k) 20(1M) 22(4M) 24(16M) 26(64M) 28(256M) 30(1GB) + */ +#define VKP_FIRST_SHIFT 18 /* First supported page size (256k) */ + +#define VKP_SHIFT_TO_INDEX(x) (((x)-VKP_FIRST_SHIFT) / 2 + __IA64_UL_CONST(1)) + +#define VKP_AREA(shift) (RGN_BASE(RGN_HPAGE) + \ + (VKP_SHIFT_TO_INDEX(shift) << RGN_MAP_SHIFT)) + +/* Extract various things from a VKP address */ +#define VKP_ADDR_TO_INDEX(addr) (((addr) >> RGN_MAP_SHIFT) & 7UL) +#define VKP_ADDR_TO_SHIFT(addr) (VKP_ADDR_TO_INDEX(addr) * 2UL + VKP_FIRST_SHIFT - 2UL) + +#define VKP_ADDR_TO_OFFSET(addr) ((addr) & (RGN_MAP_LIMIT -1UL)) +#define VKP_ADDR_TO_AREA(addr) ((addr) & ~(RGN_MAP_LIMIT -1UL)) + +#define VKP_PAGE_TO_PAGE(addr) (VKP_ADDR_TO_OFFSET(addr) >> (VKP_ADDR_TO_SHIFT(addr) - PAGE_SHIFT) | \ + VKP_ADDR_TO_AREA(addr)) + +#define VKP_VALID(addr) (REGION_NUMBER(addr) == RGN_HPAGE && VKP_ADDR_TO_INDEX(addr)) + +#define pgd_offset_vkp(addr) (sizes_pg_dir + (VKP_ADDR_TO_INDEX(addr) << (PAGE_SHIFT-6)) +\ + ((VKP_ADDR_TO_OFFSET(addr) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))) + # ifndef __ASSEMBLY__ #include <linux/sched.h> /* for mm_struct */ @@ -451,6 +482,7 @@ pte_same (pte_t a, pte_t b) #define update_mmu_cache(vma, address, pte) do { } while (0) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; +extern pgd_t sizes_pg_dir[PTRS_PER_PGD]; extern void paging_init (void); /* Index: linux-2.6.19-rc1-mm1/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux-2.6.19-rc1-mm1.orig/arch/ia64/kernel/asm-offsets.c 2006-10-04 21:57:05.000000000 -0500 +++ linux-2.6.19-rc1-mm1/arch/ia64/kernel/asm-offsets.c 2006-10-10 21:19:21.868182124 -0500 @@ -268,4 +268,8 @@ void foo(void) DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + BLANK(); + + DEFINE(IA64_VKP_FIRST_SIZE, VKP_FIRST_SHIFT); + DEFINE(IA64_RGN_MAP_SHIFT, 
RGN_MAP_SHIFT); } Index: linux-2.6.19-rc1-mm1/include/asm-ia64/mmu_context.h =================================================================== --- linux-2.6.19-rc1-mm1.orig/include/asm-ia64/mmu_context.h 2006-10-04 21:57:05.000000000 -0500 +++ linux-2.6.19-rc1-mm1/include/asm-ia64/mmu_context.h 2006-10-10 21:19:21.887715069 -0500 @@ -142,7 +142,11 @@ reload_context (nv_mm_context_t context) rr1 = rr0 + 1*rid_incr; rr2 = rr0 + 2*rid_incr; rr3 = rr0 + 3*rid_incr; - rr4 = rr0 + 4*rid_incr; + /* + * The VHPT walker must be disabled in region 4 for variable page + * size address magic in nested_dtlb_miss (ivt.S) to work. + */ + rr4 = ((rid << 8) | (PAGE_SHIFT << 2)) + 4*rid_incr; #ifdef CONFIG_HUGETLB_PAGE rr4 = (rr4 & (~(0xfcUL))) | (old_rr4 & 0xfc);