From: Rik van Riel Originally by Nick Piggin Remove mlocked pages from the LRU using "unevictable infrastructure" during mmap(), munmap(), mremap() and truncate(). Try to move back to normal LRU lists on munmap() when last mlocked mapping removed. Remove PageMlocked() status when page truncated from file. [akpm@linux-foundation.org: cleanup] [kamezawa.hiroyu@jp.fujitsu.com: fix double unlock_page()] [kosaki.motohiro@jp.fujitsu.com: split LRU: munlock rework] Signed-off-by: Nick Piggin Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton --- mm/fremap.c | 27 ++++++-- mm/internal.h | 7 +- mm/mlock.c | 156 +++++++++++------------------------------------- mm/mmap.c | 70 ++++++++++++++++----- mm/mremap.c | 8 +- mm/truncate.c | 4 + 6 files changed, 128 insertions(+), 144 deletions(-) diff -puN mm/fremap.c~mmap-handle-mlocked-pages-during-map-remap-unmap mm/fremap.c --- a/mm/fremap.c~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/fremap.c @@ -21,6 +21,8 @@ #include #include +#include "internal.h" + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(uns spin_unlock(&mapping->i_mmap_lock); } + if (vma->vm_flags & VM_LOCKED) { + /* + * drop PG_Mlocked flag for over-mapped range + */ + unsigned int saved_flags = vma->vm_flags; + munlock_vma_pages_range(vma, start, start + size); + vma->vm_flags = saved_flags; + } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; + if (vma->vm_flags & VM_LOCKED) { + /* + * might be mapping previously unmapped range of file + */ + mlock_vma_pages_range(vma, start, start + size); + } else { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } - make_pages_present(start, start+size); } /* @@ -240,4 +258,3 @@ out: return err; } - diff -puN mm/internal.h~mmap-handle-mlocked-pages-during-map-remap-unmap mm/internal.h --- a/mm/internal.h~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/internal.h @@ -63,7 +63,12 @@ static inline unsigned long page_order(s extern int mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern void munlock_vma_pages_all(struct vm_area_struct *vma); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} #ifdef CONFIG_UNEVICTABLE_LRU /* diff -puN mm/mlock.c~mmap-handle-mlocked-pages-during-map-remap-unmap mm/mlock.c --- a/mm/mlock.c~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/mlock.c @@ -120,18 +120,33 @@ static void munlock_vma_page(struct page * vma->vm_mm->mmap_sem must be held for write. */ static int __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int mlock) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ - int write = !!(vma->vm_flags & VM_WRITE); int nr_pages = (end - start) / PAGE_SIZE; int ret; + int gup_flags = 0; - VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK); - VM_BUG_ON(start < vma->vm_start || end > vma->vm_end); - VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && + (atomic_read(&mm->mm_users) != 0)); + + /* + * mlock: don't page populate if page has PROT_NONE permission. + * munlock: the pages always do munlock althrough + * its has PROT_NONE permission. + */ + if (!mlock) + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + + if (vma->vm_flags & VM_WRITE) + gup_flags |= GUP_FLAGS_WRITE; lru_add_drain_all(); /* push cached pages to LRU */ @@ -146,9 +161,9 @@ static int __mlock_vma_pages_range(struc * disable migration of this page. However, page may * still be truncated out from under us. */ - ret = get_user_pages(current, mm, addr, + ret = __get_user_pages(current, mm, addr, min_t(int, nr_pages, ARRAY_SIZE(pages)), - write, 0, pages, NULL); + gup_flags, pages, NULL); /* * This can happen for, e.g., VM_NONLINEAR regions before * a page has been allocated and mapped at a given offset, @@ -178,8 +193,12 @@ static int __mlock_vma_pages_range(struc * by the elevated reference, we need only check for * page truncation (file-cache only). */ - if (page->mapping) - mlock_vma_page(page); + if (page->mapping) { + if (mlock) + mlock_vma_page(page); + else + munlock_vma_page(page); + } unlock_page(page); put_page(page); /* ref from get_user_pages() */ @@ -197,119 +216,19 @@ static int __mlock_vma_pages_range(struc return 0; /* count entire vma as locked_vm */ } -/* - * private structure for munlock page table walk - */ -struct munlock_page_walk { - struct vm_area_struct *vma; - pmd_t *pmd; /* for migration_entry_wait() */ -}; - -/* - * munlock normal pages for present ptes - */ -static int __munlock_pte_handler(pte_t *ptep, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct munlock_page_walk *mpw = walk->private; - swp_entry_t entry; - struct page *page; - pte_t pte; - -retry: - pte = *ptep; - /* - * If it's a swap pte, we might be racing with page migration. - */ - if (unlikely(!pte_present(pte))) { - if (!is_swap_pte(pte)) - goto out; - entry = pte_to_swp_entry(pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mpw->vma->vm_mm, mpw->pmd, addr); - goto retry; - } - goto out; - } - - page = vm_normal_page(mpw->vma, addr, pte); - if (!page) - goto out; - - lock_page(page); - if (!page->mapping) { - unlock_page(page); - goto retry; - } - munlock_vma_page(page); - unlock_page(page); - -out: - return 0; -} - -/* - * Save pmd for pte handler for waiting on migration entries - */ -static int __munlock_pmd_handler(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct munlock_page_walk *mpw = walk->private; - - mpw->pmd = pmd; - return 0; -} - - -/* - * munlock a range of pages in the vma using standard page table walk. - * - * vma->vm_mm->mmap_sem must be held for write. - */ -static void __munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - struct mm_struct *mm = vma->vm_mm; - struct munlock_page_walk mpw = { - .vma = vma, - }; - struct mm_walk munlock_page_walk = { - .pmd_entry = __munlock_pmd_handler, - .pte_entry = __munlock_pte_handler, - .private = &mpw, - .mm = mm, - }; - - VM_BUG_ON(start & ~PAGE_MASK || end & ~PAGE_MASK); - VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); - VM_BUG_ON(start < vma->vm_start); - VM_BUG_ON(end > vma->vm_end); - - lru_add_drain_all(); /* push cached pages to LRU */ - walk_page_range(start, end, &munlock_page_walk); - lru_add_drain_all(); /* to update stats */ -} - #else /* CONFIG_UNEVICTABLE_LRU */ /* * Just make pages present if VM_LOCKED. No-op if unlocking. */ static int __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int mlock) { - if (vma->vm_flags & VM_LOCKED) + if (mlock && (vma->vm_flags & VM_LOCKED)) make_pages_present(start, end); return 0; } - -/* - * munlock a range of pages in the vma -- no-op. - */ -static void __munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} #endif /* CONFIG_UNEVICTABLE_LRU */ /* @@ -332,7 +251,7 @@ int mlock_vma_pages_range(struct vm_area is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { downgrade_write(&mm->mmap_sem); - nr_pages = __mlock_vma_pages_range(vma, start, end); + nr_pages = __mlock_vma_pages_range(vma, start, end, 1); up_read(&mm->mmap_sem); /* vma can change or disappear */ @@ -361,12 +280,13 @@ no_mlock: /* - * munlock all pages in vma. For munmap() and exit(). + * munlock all pages in the vma range. For mremap(), munmap() and exit(). */ -void munlock_vma_pages_all(struct vm_area_struct *vma) +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { vma->vm_flags &= ~VM_LOCKED; - __munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); + __mlock_vma_pages_range(vma, start, end, 0); } /* @@ -443,7 +363,7 @@ success: */ downgrade_write(&mm->mmap_sem); - ret = __mlock_vma_pages_range(vma, start, end); + ret = __mlock_vma_pages_range(vma, start, end, 1); if (ret > 0) { mm->locked_vm -= ret; ret = 0; @@ -469,7 +389,7 @@ success: * while. Should we downgrade the semaphore for both lock * AND unlock ? */ - __munlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, 0); } out: diff -puN mm/mmap.c~mmap-handle-mlocked-pages-during-map-remap-unmap mm/mmap.c --- a/mm/mmap.c~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/mmap.c @@ -970,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file return -EPERM; vm_flags |= VM_LOCKED; } + /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -1133,10 +1134,12 @@ munmap_back: * The VM_SHARED test is necessary because shmem_zero_setup * will create the file object for a shared anonymous map below. */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } /* * Determine the object being mapped and call the appropriate @@ -1218,10 +1221,14 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + int nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1694,8 +1701,11 @@ find_extend_vma(struct mm_struct *mm, un return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) - make_pages_present(addr, prev->vm_end); + if (prev->vm_flags & VM_LOCKED) { + int nr_pages = mlock_vma_pages_range(prev, addr, prev->vm_end); + if (nr_pages < 0) + return NULL; /* vma gone! */ + } return prev; } #else @@ -1721,8 +1731,11 @@ find_extend_vma(struct mm_struct * mm, u start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) - make_pages_present(addr, start); + if (vma->vm_flags & VM_LOCKED) { + int nr_pages = mlock_vma_pages_range(vma, addr, start); + if (nr_pages < 0) + return NULL; /* vma gone! */ + } return vma; } #endif @@ -1908,6 +1921,18 @@ int do_munmap(struct mm_struct *mm, unsi vma = prev? prev->vm_next: mm->mmap; /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) + munlock_vma_pages_all(tmp); + tmp = tmp->vm_next; + } + } + + /* * Remove the vma's, and unmap the actual pages */ detach_vmas_to_be_unmapped(mm, vma, prev, end); @@ -2019,8 +2044,9 @@ unsigned long do_brk(unsigned long addr, return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) goto out; /* @@ -2042,8 +2068,9 @@ unsigned long do_brk(unsigned long addr, out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + int nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages >= 0) + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; } return addr; } @@ -2054,7 +2081,7 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2062,6 +2089,15 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + vma = mm->mmap; lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); diff -puN mm/mremap.c~mmap-handle-mlocked-pages-during-map-remap-unmap mm/mremap.c --- a/mm/mremap.c~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/mremap.c @@ -24,6 +24,8 @@ #include #include +#include "internal.h" + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_ if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); + mlock_vma_pages_range(new_vma, new_addr + old_len, + new_addr + new_len); } return new_addr; @@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long ad vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - make_pages_present(addr + old_len, + mlock_vma_pages_range(vma, addr + old_len, addr + new_len); } ret = addr; diff -puN mm/truncate.c~mmap-handle-mlocked-pages-during-map-remap-unmap mm/truncate.c --- a/mm/truncate.c~mmap-handle-mlocked-pages-during-map-remap-unmap +++ a/mm/truncate.c @@ -18,6 +18,7 @@ #include #include /* grr. try_to_release_page, do_invalidatepage */ +#include "internal.h" /** @@ -103,6 +104,7 @@ truncate_complete_page(struct address_sp cancel_dirty_page(page, PAGE_CACHE_SIZE); + clear_page_mlock(page); remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ @@ -127,6 +129,7 @@ invalidate_complete_page(struct address_ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; + clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -352,6 +355,7 @@ invalidate_complete_page2(struct address if (PageDirty(page)) goto failed; + clear_page_mlock(page); BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); _