Redo hugetlb locking to address reservation problems.

This reworks hugepage locking so that huge pages scale properly.
There are three levels of locking:

1. mmap_sem: A read lock guarantees the page table structure. A write
   lock prohibits any modifications to the page tables.

2. page_table_lock: Used to guarantee the integrity of the higher level
   page tables (pgd, pud, pmd).

3. pte_lock: We add a pte_lock that is used to lock the lowest level of
   the page tables.

Another lock exists, hugetlb_lock, which protects the global hugetlb
variables and the per node lists.

With the pte_lock we can lock a pte and thereby guarantee that no other
attempt to install a pte is made at the same address. This fixes the
concurrent reservation problem and significantly reduces the locking
overhead on the huge page paths.

(A standalone user-space sketch of the pte bit-lock pattern is appended
after the patch for illustration.)

Signed-off-by: Christoph Lameter

Index: linux-2.6.19-rc6-mm2/mm/hugetlb.c
===================================================================
--- linux-2.6.19-rc6-mm2.orig/mm/hugetlb.c	2006-12-04 19:49:45.000000000 -0800
+++ linux-2.6.19-rc6-mm2/mm/hugetlb.c	2006-12-04 21:17:56.000000000 -0800
@@ -32,6 +32,21 @@
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static pte_t huge_pte_lock(pte_t *p) {
+	pte_t result;
+
+	while (!ptep_trylock(p, &result))
+		cpu_relax();
+	return result;
+}
+
+static void huge_pte_unlock(pte_t *p)
+{
+	BUG_ON(!(pte_val(*p) & _PAGE_PTE_LOCK));
+	smp_wmb();
+	ptep_unlock(p);
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
 	int i;
@@ -333,28 +348,31 @@
 		src_pte = huge_pte_offset(src, addr);
 		if (!src_pte)
			continue;
+		entry = huge_pte_lock(src_pte);
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
-		spin_lock(&dst->page_table_lock);
-		spin_lock(&src->page_table_lock);
-		if (!pte_none(*src_pte)) {
+		huge_pte_lock(dst_pte);
+		if (!pte_none(entry)) {
 			if (cow)
 				ptep_set_wrprotect(src, addr, src_pte);
-			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			set_huge_pte_at(dst, addr, dst_pte, entry);
+			set_locked_huge_pte_at(dst, addr, dst_pte, entry);
 		}
-		spin_unlock(&src->page_table_lock);
-		spin_unlock(&dst->page_table_lock);
+		huge_pte_unlock(src_pte);
+		huge_pte_unlock(dst_pte);
 	}
 	return 0;
 
 nomem:
+	huge_pte_unlock(src_pte);
 	return -ENOMEM;
 }
 
+/*
+ * Must hold write lock on mmap_sem
+ */
 void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			    unsigned long end)
 {
@@ -375,7 +393,6 @@
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
-	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
@@ -391,7 +408,6 @@
 		page = pte_page(pte);
 		list_add(&page->lru, &page_list);
 	}
-	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
@@ -399,6 +415,9 @@
 	}
 }
 
+/*
+ * Must hold write lock on mmap_sem
+ */
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 			  unsigned long end)
 {
@@ -441,9 +460,7 @@
 		return VM_FAULT_OOM;
 	}
 
-	spin_unlock(&mm->page_table_lock);
 	copy_huge_page(new_page, old_page, address);
-	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
 	if (likely(pte_same(*ptep, pte))) {
@@ -473,8 +490,7 @@
 			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
 
 	/*
-	 * Use page lock to guard against racing truncation
-	 * before we get page_table_lock.
+	 * Use page lock to guard against racing truncation.
 	 */
 retry:
 	page = find_lock_page(mapping, idx);
@@ -507,31 +523,26 @@
 		lock_page(page);
 	}
 
-	spin_lock(&mm->page_table_lock);
 	size = i_size_read(mapping->host) >> HPAGE_SHIFT;
 	if (idx >= size)
 		goto backout;
 
 	ret = VM_FAULT_MINOR;
-	if (!pte_none(*ptep))
-		goto backout;
 
 	new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
 				&& (vma->vm_flags & VM_SHARED)));
-	set_huge_pte_at(mm, address, ptep, new_pte);
+	set_locked_huge_pte_at(mm, address, ptep, new_pte);
 
 	if (write_access && !(vma->vm_flags & VM_SHARED)) {
 		/* Optimization, do the COW without a second fault */
 		ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
 	}
 
-	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
 	return ret;
 
 backout:
-	spin_unlock(&mm->page_table_lock);
 	hugetlb_put_quota(mapping);
 	unlock_page(page);
 	put_page(page);
@@ -544,35 +555,24 @@
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
-	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
-	/*
-	 * Serialize hugepage allocation and instantiation, so that we don't
-	 * get spurious allocation failures if two CPUs race to instantiate
-	 * the same page in the page cache.
-	 */
-	mutex_lock(&hugetlb_instantiation_mutex);
-	entry = *ptep;
+	entry = huge_pte_lock(ptep);
+
 	if (pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		mutex_unlock(&hugetlb_instantiation_mutex);
-		return ret;
+		goto out;
 	}
 
 	ret = VM_FAULT_MINOR;
 
-	spin_lock(&mm->page_table_lock);
-	/* Check for a racing update before calling hugetlb_cow */
-	if (likely(pte_same(entry, *ptep)))
-		if (write_access && !pte_write(entry))
-			ret = hugetlb_cow(mm, vma, address, ptep, entry);
-	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&hugetlb_instantiation_mutex);
-
+	if (write_access && !pte_write(entry))
+		ret = hugetlb_cow(mm, vma, address, ptep, entry);
+out:
+	huge_pte_unlock(ptep);
 	return ret;
 }
 
@@ -584,7 +584,6 @@
 	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
 		struct page *page;
@@ -599,9 +598,7 @@
 		if (!pte || pte_none(*pte)) {
 			int ret;
 
-			spin_unlock(&mm->page_table_lock);
 			ret = hugetlb_fault(mm, vma, vaddr, 0);
-			spin_lock(&mm->page_table_lock);
 			if (ret == VM_FAULT_MINOR)
 				continue;
 
@@ -635,7 +632,6 @@
 			goto same_page;
 		}
 	}
-	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
 	*position = vaddr;
 
@@ -654,7 +650,6 @@
 	flush_cache_range(vma, address, end);
 
 	spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
-	spin_lock(&mm->page_table_lock);
 	for (; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (!ptep)
Index: linux-2.6.19-rc6-mm2/include/asm-ia64/pgtable.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/asm-ia64/pgtable.h	2006-12-04 19:49:45.000000000 -0800
+++ linux-2.6.19-rc6-mm2/include/asm-ia64/pgtable.h	2006-12-04 19:51:35.000000000 -0800
@@ -56,6 +56,7 @@
 #define _PAGE_D			(1 << _PAGE_D_BIT)	/* page dirty bit */
 #define _PAGE_PPN_MASK		(((__IA64_UL(1) << IA64_MAX_PHYS_BITS) - 1) & ~0xfffUL)
 #define _PAGE_ED		(__IA64_UL(1) << 52)	/* exception deferral */
+#define _PAGE_PTE_LOCK		(__IA64_UL(1) << 62)
 #define _PAGE_PROTNONE		(__IA64_UL(1) << 63)
 
 /* Valid only for a PTE with the present bit cleared: */
@@ -229,6 +230,8 @@
  */
 #define set_pte(ptep, pteval)	(*(ptep) = (pteval))
 #define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_locked_pte_at(mm,addr,ptep,pteval) set_pte(ptep,\
+				(pteval|_PAGE_PTE_LOCK))
 
 #define VMALLOC_START		(RGN_BASE(RGN_GATE) + 0x200000000UL)
 #ifdef CONFIG_VIRTUAL_MEM_MAP
@@ -442,6 +445,25 @@
 #endif
 }
 
+static inline int ptep_trylock(pte_t *ptep, pte_t *p)
+{
+	unsigned long new, old;
+
+	do {
+		old = ptep->pte;
+		if (old & _PAGE_PTE_LOCK)
+			return 0;
+		new = old | _PAGE_PTE_LOCK;
+	} while (cmpxchg((unsigned long *) ptep, old, new) != old);
+	p->pte = old;
+	return 1;
+}
+
+static inline void ptep_unlock(pte_t *ptep)
+{
+	ptep->pte &= ~_PAGE_PTE_LOCK;
+}
+
 static inline int
 pte_same (pte_t a, pte_t b)
 {
Index: linux-2.6.19-rc6-mm2/include/asm-i386/pgtable.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/asm-i386/pgtable.h	2006-12-04 19:54:18.000000000 -0800
+++ linux-2.6.19-rc6-mm2/include/asm-i386/pgtable.h	2006-12-04 21:21:59.000000000 -0800
@@ -118,7 +118,7 @@
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
 #define _PAGE_UNUSED1	0x200	/* available for programmer */
 #define _PAGE_UNUSED2	0x400
-#define _PAGE_UNUSED3	0x800
+#define _PAGE_PTE_LOCK	0x800	/* Locking of a pte entry */
 
 /* If _PAGE_PRESENT is clear, we use these: */
 #define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
@@ -508,6 +508,31 @@
 #define GET_IOSPACE(pfn)		0
 #define GET_PFN(pfn)			(pfn)
 
+#ifndef __ASSEMBLY__
+#define set_locked_huge_pte_at(mm,addr,ptep,pteval) set_pte(ptep,\
+				__pte(pte_val(pteval)|_PAGE_PTE_LOCK))
+
+static inline int ptep_trylock(pte_t *ptep, pte_t *p)
+{
+	unsigned long new, old;
+
+	do {
+		old = ptep->pte_low;
+		if (old & _PAGE_PTE_LOCK)
+			return 0;
+		new = old | _PAGE_PTE_LOCK;
+	} while (cmpxchg(&ptep->pte_low, old, new) != old);
+	p->pte_low = old;
+	return 1;
+}
+
+static inline void ptep_unlock(pte_t *ptep)
+{
+	ptep->pte_low &= ~_PAGE_PTE_LOCK;
+}
+
+#endif
+
 #include <asm-generic/pgtable.h>
 
 #endif /* _I386_PGTABLE_H */
Index: linux-2.6.19-rc6-mm2/include/asm-x86_64/pgtable.h
===================================================================
--- linux-2.6.19-rc6-mm2.orig/include/asm-x86_64/pgtable.h	2006-12-04 20:17:32.000000000 -0800
+++ linux-2.6.19-rc6-mm2/include/asm-x86_64/pgtable.h	2006-12-04 20:24:28.000000000 -0800
@@ -157,7 +157,7 @@
 #define _PAGE_PSE	0x080	/* 2MB page */
 #define _PAGE_FILE	0x040	/* nonlinear file mapping, saved PTE; unset:swap */
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry */
-
+#define _PAGE_PTE_LOCK	0x200	/* Lock Pte */
 #define _PAGE_PROTNONE	0x080	/* If not present */
 #define _PAGE_NX        (1UL<<_PAGE_BIT_NX)
 
@@ -447,4 +447,26 @@
 #define __HAVE_ARCH_PTE_SAME
 #include <asm-generic/pgtable.h>
 
+#define set_locked_pte_at(mm,addr,ptep,pteval) set_pte(ptep,\
+				(pteval|_PAGE_PTE_LOCK))
+
+static inline int ptep_trylock(pte_t *ptep, pte_t *p)
+{
+	unsigned long new, old;
+
+	do {
+		old = ptep->pte;
+		if (old & _PAGE_PTE_LOCK)
+			return 0;
+		new = old | _PAGE_PTE_LOCK;
+	} while (cmpxchg((unsigned long *) ptep, old, new) != old);
+	p->pte = old;
+	return 1;
+}
+
+static inline void ptep_unlock(pte_t *ptep)
+{
+	ptep->pte &= ~_PAGE_PTE_LOCK;
+}
+
 #endif /* _X86_64_PGTABLE_H */
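
For anyone who wants to experiment with the idea outside the kernel, here is a
minimal user-space sketch of the same bit-lock pattern: a lock bit carried in
the pte word itself, taken with a compare-and-swap retry loop (like
ptep_trylock above) and dropped by clearing the bit. Everything in it
(fake_pte_t, FAKE_PTE_LOCK, fake_ptep_trylock, the GCC __sync builtins
standing in for cmpxchg/cpu_relax) is made up for illustration and is not
part of the patch.

/*
 * Illustration only -- not part of the patch.  User-space model of the
 * pte bit-lock: bit 62 of a 64-bit "pte" word acts as the lock.
 * Build with: gcc -O2 -pthread fake_pte_lock.c
 */
#include <stdio.h>
#include <stdint.h>
#include <pthread.h>

#define FAKE_PTE_LOCK	(1ULL << 62)	/* stand-in for _PAGE_PTE_LOCK */

typedef uint64_t fake_pte_t;

/* Try to set the lock bit; on success return 1 and the old pte in *old. */
static int fake_ptep_trylock(fake_pte_t *ptep, fake_pte_t *old)
{
	fake_pte_t prev, new;

	do {
		prev = *ptep;
		if (prev & FAKE_PTE_LOCK)
			return 0;		/* someone else holds the bit */
		new = prev | FAKE_PTE_LOCK;
	} while (!__sync_bool_compare_and_swap(ptep, prev, new));
	*old = prev;
	return 1;
}

/* Spin until the bit is ours, like huge_pte_lock() in the patch. */
static fake_pte_t fake_pte_lock(fake_pte_t *ptep)
{
	fake_pte_t old;

	while (!fake_ptep_trylock(ptep, &old))
		;			/* the kernel would cpu_relax() here */
	return old;
}

static void fake_pte_unlock(fake_pte_t *ptep)
{
	__sync_fetch_and_and(ptep, ~FAKE_PTE_LOCK);
}

static fake_pte_t pte;			/* one "pte" shared by all threads */
static long installs;			/* how many threads installed a value */

static void *fault_thread(void *arg)
{
	fake_pte_t old = fake_pte_lock(&pte);

	if (old == 0) {
		/* Only the first thread to lock an empty pte installs it,
		 * keeping the lock bit set (cf. set_locked_huge_pte_at). */
		pte = (fake_pte_t)(uintptr_t)arg | FAKE_PTE_LOCK;
		__sync_fetch_and_add(&installs, 1);
	}
	fake_pte_unlock(&pte);
	return NULL;
}

int main(void)
{
	pthread_t tid[8];
	int i;

	for (i = 0; i < 8; i++)
		pthread_create(&tid[i], NULL, fault_thread,
			       (void *)(uintptr_t)(i + 1));
	for (i = 0; i < 8; i++)
		pthread_join(tid[i], NULL);

	/* Exactly one thread should have won the race to install the pte. */
	printf("installs=%ld pte=%#llx\n", installs,
	       (unsigned long long)(pte & ~FAKE_PTE_LOCK));
	return 0;
}

The point of keeping the lock in the pte itself is that no separate lock
structure or extra cache line is touched and contention is confined to the
single huge page being instantiated, which is what lets the patch drop the
page_table_lock and the instantiation mutex in the hunks above.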