Subject: allocate memory in khugepaged outside of mmap_sem write mode From: Andrea Arcangeli This tries to be more friendly to filesystems in userland, whose userland backends allocate memory in the I/O paths and could deadlock if khugepaged holds the backend's mmap_sem in write mode while allocating memory. Memory allocation may wait for writeback I/O completion from the daemon, which may itself be blocked taking the mmap_sem in read mode if a page fault happens and the daemon wasn't using mlock for the memory required for the I/O submission and completion. Signed-off-by: Andrea Arcangeli --- diff --git a/mm/huge_memory.c b/mm/huge_memory.c --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1676,9 +1676,34 @@ static void collapse_huge_page(struct mm VM_BUG_ON(address & ~HPAGE_PMD_MASK); #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); + new_page = *hpage; #else VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. 
+ */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } #endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); /* * Prevent all access to pagetables with the exception of @@ -1716,18 +1741,6 @@ static void collapse_huge_page(struct mm if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; -#ifndef CONFIG_NUMA - new_page = *hpage; -#else - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); - if (unlikely(!new_page)) { - *hpage = ERR_PTR(-ENOMEM); - goto out; - } -#endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) - goto out_put_page; - anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -1755,7 +1768,7 @@ static void collapse_huge_page(struct mm spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out_put_page; + goto out; } /* @@ -1794,15 +1807,15 @@ static void collapse_huge_page(struct mm *hpage = NULL; #endif khugepaged_pages_collapsed++; -out: +out_up_write: up_write(&mm->mmap_sem); return; -out_put_page: +out: #ifdef CONFIG_NUMA put_page(new_page); #endif - goto out; + goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1861,10 +1874,9 @@ static int khugepaged_scan_pmd(struct mm ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - up_read(&mm->mmap_sem); + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage); - } out: return ret; }