Index: linux-2.6.19-rc1-mm1/mm/hugetlb.c
===================================================================
--- linux-2.6.19-rc1-mm1.orig/mm/hugetlb.c	2006-10-18 10:49:03.590165563 -0500
+++ linux-2.6.19-rc1-mm1/mm/hugetlb.c	2006-10-18 20:01:58.163430051 -0500
@@ -27,11 +27,19 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+
+
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * Number of pages taken from the pool that we have not processed yet.
+ * This is used to avoid falsely reporting OOM conditions.
+ */
+static atomic_t tentative_pages;
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
 	int i;
@@ -121,9 +129,10 @@ static int alloc_fresh_huge_page(void)
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
-				unsigned long addr)
+				unsigned long addr, int *concurrent)
 {
 	struct page *page;
+	*concurrent = 0;
 
 	spin_lock(&hugetlb_lock);
 	if (vma->vm_flags & VM_MAYSHARE)
@@ -131,10 +140,12 @@ static struct page *alloc_huge_page(stru
 	else if (free_huge_pages <= resv_huge_pages)
 		goto fail;
 
+	*concurrent = !atomic_read(&tentative_pages);
+	smp_rmb();
 	page = dequeue_huge_page(vma, addr);
 	if (!page)
 		goto fail;
-
+	atomic_inc(&tentative_pages);
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
@@ -409,6 +420,7 @@ static int hugetlb_cow(struct mm_struct
 {
 	struct page *old_page, *new_page;
 	int avoidcopy;
+	int concurrent;
 
 	old_page = pte_page(pte);
 
@@ -421,11 +433,11 @@ static int hugetlb_cow(struct mm_struct
 	}
 
 	page_cache_get(old_page);
-	new_page = alloc_huge_page(vma, address);
+	new_page = alloc_huge_page(vma, address, &concurrent);
 
 	if (!new_page) {
 		page_cache_release(old_page);
-		return VM_FAULT_OOM;
+		return concurrent ? VM_FAULT_MINOR : VM_FAULT_OOM;
 	}
 
 	spin_unlock(&mm->page_table_lock);
@@ -442,6 +454,7 @@ static int hugetlb_cow(struct mm_struct
 	}
 	page_cache_release(new_page);
 	page_cache_release(old_page);
+	atomic_dec(&tentative_pages);
 	return VM_FAULT_MINOR;
 }
 
@@ -454,6 +467,8 @@ int hugetlb_no_page(struct mm_struct *mm
 	struct page *page;
 	struct address_space *mapping;
 	pte_t new_pte;
+	int new = 0;
+	int concurrent;
 
 	mapping = vma->vm_file->f_mapping;
 	idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
@@ -468,12 +483,13 @@ retry:
 	if (!page) {
 		if (hugetlb_get_quota(mapping))
 			goto out;
-		page = alloc_huge_page(vma, address);
+		page = alloc_huge_page(vma, address, &concurrent);
 		if (!page) {
 			hugetlb_put_quota(mapping);
-			ret = VM_FAULT_OOM;
+			ret = concurrent ? VM_FAULT_MINOR : VM_FAULT_OOM;
 			goto out;
 		}
+		new = 1;
 		clear_huge_page(page, address);
 
 		if (vma->vm_flags & VM_SHARED) {
@@ -483,8 +499,11 @@ retry:
 			if (err) {
 				put_page(page);
 				hugetlb_put_quota(mapping);
-				if (err == -EEXIST)
+				if (err == -EEXIST) {
+					if (new)
+						atomic_dec(&tentative_pages);
 					goto retry;
+				}
 				goto out;
 			}
 		} else
@@ -512,6 +531,8 @@ retry:
 	spin_unlock(&mm->page_table_lock);
 	unlock_page(page);
 out:
+	if (new)
+		atomic_dec(&tentative_pages);
 	return ret;
 
 backout:
@@ -528,22 +549,14 @@ int hugetlb_fault(struct mm_struct *mm,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
-	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
-	/*
-	 * Serialize hugepage allocation and instantiation, so that we don't
-	 * get spurious allocation failures if two CPUs race to instantiate
-	 * the same page in the page cache.
-	 */
-	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
 	if (pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		mutex_unlock(&hugetlb_instantiation_mutex);
 		return ret;
 	}
 
@@ -555,7 +568,6 @@ int hugetlb_fault(struct mm_struct *mm,
 		if (write_access && !pte_write(entry))
 			ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
-	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
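
For reference, here is a minimal userspace sketch of the idea the patch implements: keep an atomic count of pages taken from the pool but not yet instantiated, and when an allocation fails while such in-flight pages exist, report a retryable fault instead of OOM. This is not kernel code; it uses C11 atomics, and the names (pool_alloc, fault, free_pages, RETRY/OOM) are illustrative stand-ins for dequeue_huge_page(), hugetlb_no_page()/hugetlb_cow(), the huge page free list, and VM_FAULT_MINOR/VM_FAULT_OOM.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum fault_result { OK, RETRY, OOM };

static atomic_int free_pages = 1;	/* pool with a single huge page */
static atomic_int tentative_pages;	/* taken from pool, not yet instantiated */

static bool pool_alloc(bool *concurrent)
{
	/* Racing allocations are still in flight if the counter is non-zero. */
	*concurrent = atomic_load(&tentative_pages) != 0;

	if (atomic_fetch_sub(&free_pages, 1) <= 0) {
		atomic_fetch_add(&free_pages, 1);	/* pool was empty: undo */
		return false;
	}
	atomic_fetch_add(&tentative_pages, 1);
	return true;
}

static enum fault_result fault(void)
{
	bool concurrent;

	if (!pool_alloc(&concurrent))
		/* Only report OOM when no in-flight page might come back. */
		return concurrent ? RETRY : OOM;

	/* ... instantiate the page ...; it is no longer tentative. */
	atomic_fetch_sub(&tentative_pages, 1);
	return OK;
}

int main(void)
{
	printf("first fault:  %d\n", fault());	/* 0 == OK  */
	printf("second fault: %d\n", fault());	/* 2 == OOM */
	return 0;
}

The point mirrored from the patch is the ordering: sample the tentative count before trying the pool, and only drop it once the page is fully instantiated or handed back, so a concurrent faulter that loses the race sees "retry" rather than a false OOM.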