From andrea@qumranet.com Tue Apr  8 08:57:00 2008
Date: Tue, 08 Apr 2008 17:44:05 +0200
From: Andrea Arcangeli <andrea@qumranet.com>
To: Christoph Lameter
Cc: akpm@linux-foundation.org, Nick Piggin, Steve Wise, Peter Zijlstra,
	linux-mm@kvack.org, Kanoj Sarcar, Roland Dreier, Jack Steiner,
	linux-kernel@vger.kernel.org, Avi Kivity,
	kvm-devel@lists.sourceforge.net, Robin Holt,
	general@lists.openfabrics.org, Hugh Dickins
Subject: [PATCH 2 of 9] Core of mmu notifiers

# HG changeset patch
# User Andrea Arcangeli
# Date 1207666462 -7200
# Node ID baceb322b45ed43280654dac6c964c9d3d8a936f
# Parent  ec6d8f91b299cf26cce5c3d49bb25d35ee33c137
Core of mmu notifiers.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Signed-off-by: Nick Piggin
Signed-off-by: Christoph Lameter

---

 include/linux/mm_types.h     |    3 
 include/linux/mmu_notifier.h |  153 +++++++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                |    2 
 mm/Kconfig                   |    4 +
 mm/Makefile                  |    1 
 mm/filemap_xip.c             |    2 
 mm/fremap.c                  |    3 
 mm/hugetlb.c                 |    3 
 mm/memory.c                  |   31 +++++++-
 mm/mmap.c                    |    2 
 mm/mmu_notifier.c            |  126 +++++++++++++++++++++++++++++++++++
 mm/mprotect.c                |    3 
 mm/mremap.c                  |    6 +
 mm/rmap.c                    |   28 ++++++-
 14 files changed, 356 insertions(+), 11 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2008-04-17 18:37:58.000000000 -0700
+++ linux-2.6/include/linux/mm_types.h	2008-04-17 18:48:50.000000000 -0700
@@ -225,6 +225,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_MMU_NOTIFIER
+	struct hlist_head mmu_notifier_list;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */

Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/mmu_notifier.h	2008-04-17 18:48:50.000000000 -0700
@@ -0,0 +1,153 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier;
+struct mmu_notifier_ops;
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_ops {
+	/*
+	 * Called when nobody can register any more notifiers in the
+	 * mm and after the "mn" notifier itself has already been
+	 * disarmed.
+	 */
+	void (*release)(struct mmu_notifier *mn,
+			struct mm_struct *mm);
+
+	/*
+	 * clear_flush_young is called after the VM test-and-clears
+	 * the young/accessed bitflag in the pte. This way the VM
+	 * provides proper aging for accesses to the page through the
+	 * secondary MMUs, not only for those through the Linux pte.
+	 */
+	int (*clear_flush_young)(struct mmu_notifier *mn,
+				 struct mm_struct *mm,
+				 unsigned long address);
+
+	/*
+	 * Before this is invoked any secondary MMU is still allowed
+	 * to read/write the page previously pointed to by the Linux
+	 * pte, because the old page hasn't been freed yet. If
+	 * required, set_page_dirty has to be called internally by
+	 * this method.
+	 */
+	void (*invalidate_page)(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address);
+
+	/*
+	 * invalidate_range_start() and invalidate_range_end() must be
+	 * paired. Multiple invalidate_range_start/ends may be nested
+	 * or called concurrently.
+	 */
+	void (*invalidate_range_start)(struct mmu_notifier *mn,
+				       struct mm_struct *mm,
+				       unsigned long start, unsigned long end);
+	void (*invalidate_range_end)(struct mmu_notifier *mn,
+				     struct mm_struct *mm,
+				     unsigned long start, unsigned long end);
+};
+
+struct mmu_notifier {
+	struct hlist_node hlist;
+	const struct mmu_notifier_ops *ops;
+};
+
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return unlikely(!hlist_empty(&mm->mmu_notifier_list));
+}
+
+extern int mmu_notifier_register(struct mmu_notifier *mn,
+				 struct mm_struct *mm);
+extern int mmu_notifier_unregister(struct mmu_notifier *mn,
+				   struct mm_struct *mm);
+extern void __mmu_notifier_release(struct mm_struct *mm);
+extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					    unsigned long address);
+extern void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+					   unsigned long address);
+extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end);
+
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_release(mm);
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+						 unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_flush_young(mm, address);
+	return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+						unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_page(mm, address);
+}
+
+static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_start(mm, start, end);
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_invalidate_range_end(mm, start, end);
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+	INIT_HLIST_HEAD(&mm->mmu_notifier_list);
+}
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+						 unsigned long address)
+{
+	return 0;
+}
+
+static inline void mmu_notifier_invalidate_page(struct mm_struct *mm,
+						unsigned long address)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
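
(For clarity, here is how a secondary-MMU driver would consume the API
above. This is an illustrative sketch only, not part of the patch: the
example_* names are invented, and a real user would tear down its own
shadow page tables inside the handlers.)

static void example_invalidate_page(struct mmu_notifier *mn,
				    struct mm_struct *mm,
				    unsigned long address)
{
	/* drop the secondary-MMU translation for this virtual address */
}

static void example_release(struct mmu_notifier *mn,
			    struct mm_struct *mm)
{
	/* the mm is going away: stop using it and drop all translations */
}

static const struct mmu_notifier_ops example_ops = {
	.release	 = example_release,
	.invalidate_page = example_invalidate_page,
};

static struct mmu_notifier example_notifier = {
	.ops = &example_ops,
};

static int example_start_tracking(void)
{
	/* no mmap_sem or other VM lock may be held here */
	return mmu_notifier_register(&example_notifier, current->mm);
}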

Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2008-04-17 18:38:46.000000000 -0700
+++ linux-2.6/kernel/fork.c	2008-04-17 18:48:50.000000000 -0700
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -362,6 +363,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
 		return mm;
 	}

Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig	2008-04-17 18:37:58.000000000 -0700
+++ linux-2.6/mm/Kconfig	2008-04-17 18:48:50.000000000 -0700
@@ -193,3 +193,7 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier, for paging KVM/RDMA"

Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile	2008-04-17 18:37:58.000000000 -0700
+++ linux-2.6/mm/Makefile	2008-04-17 18:48:50.000000000 -0700
@@ -33,4 +33,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o

Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2008-04-17 18:38:46.000000000 -0700
+++ linux-2.6/mm/filemap_xip.c	2008-04-17 18:48:50.000000000 -0700
@@ -199,6 +199,8 @@ __xip_unmap (struct address_space * mapp
 		dec_mm_counter(mm, file_rss);
 		BUG_ON(pte_dirty(pteval));
 		pte_unmap_unlock(pte, ptl);
+		/* must invalidate_page _before_ freeing the page */
+		mmu_notifier_invalidate_page(mm, address);
 		page_cache_release(page);
 	}
 }

Index: linux-2.6/mm/fremap.c
===================================================================
--- linux-2.6.orig/mm/fremap.c	2008-04-17 18:38:46.000000000 -0700
+++ linux-2.6/mm/fremap.c	2008-04-17 18:48:50.000000000 -0700
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -214,7 +215,9 @@ asmlinkage long sys_remap_file_pages(uns
 		up_write(&mapping->i_mmap_sem);
 	}
 
+	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
+	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
 		if (unlikely(has_write_lock)) {
 			downgrade_write(&mm->mmap_sem);

Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c	2008-04-17 18:38:46.000000000 -0700
+++ linux-2.6/mm/hugetlb.c	2008-04-17 18:48:50.000000000 -0700
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -799,6 +800,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -819,6 +821,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
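
(The range hooks added above, and the ones that follow in memory.c,
mprotect.c, mremap.c and rmap.c, all repeat the same bracketing
convention. Schematically, as a sketch only with an invented function
name:)

static void example_zap_range(struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	/*
	 * After this returns, secondary MMUs have dropped their
	 * translations for [start, end) and won't establish new ones
	 * until the matching _end below runs.
	 */
	mmu_notifier_invalidate_range_start(mm, start, end);

	/* ... clear the Linux ptes and flush the primary TLB ... */
	/* ... the pages backing the range can now be freed safely ... */

	/* allow secondary-MMU faults on the range again */
	mmu_notifier_invalidate_range_end(mm, start, end);
}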

Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-04-17 18:39:00.000000000 -0700
+++ linux-2.6/mm/memory.c	2008-04-17 18:48:50.000000000 -0700
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -617,6 +618,9 @@ int copy_page_range(struct mm_struct *ds
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_start(src_mm, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -627,6 +631,11 @@ int copy_page_range(struct mm_struct *ds
 						vma, addr, next))
 			return -ENOMEM;
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_invalidate_range_end(src_mm,
+						  vma->vm_start, end);
+
 	return 0;
 }
@@ -829,6 +838,7 @@ unsigned long unmap_vmas(struct vm_area_
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
 	fullmm = tlb->fullmm;
+	mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
@@ -873,6 +883,7 @@ unsigned long unmap_vmas(struct vm_area_
 		}
 	}
 	tlb_finish_mmu(tlb, start_addr, end_addr);
+	mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
 	return start;	/* which is now the end (or restart) address */
 }
@@ -1453,10 +1464,11 @@ int apply_to_page_range(struct mm_struct
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long start = addr, end = addr + size;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1464,6 +1476,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1603,9 +1616,10 @@ static int do_wp_page(struct mm_struct *
 		 */
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-		page_cache_release(old_page);
+		new_page = NULL;
 		if (!pte_same(*page_table, orig_pte))
 			goto unlock;
+		page_cache_release(old_page);
 
 		page_mkwrite = 1;
 	}
@@ -1621,6 +1635,7 @@ static int do_wp_page(struct mm_struct *
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
+		old_page = new_page = NULL;
 		goto unlock;
 	}
@@ -1677,12 +1692,18 @@ gotten:
 	} else
 		mem_cgroup_uncharge_page(new_page);
 
-	if (new_page)
+unlock:
+	pte_unmap_unlock(page_table, ptl);
+
+	if (new_page) {
+		if (new_page == old_page)
+			/* cow happened, notify before releasing old_page */
+			mmu_notifier_invalidate_page(mm, address);
 		page_cache_release(new_page);
+	}
 	if (old_page)
 		page_cache_release(old_page);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+
 	if (dirty_page) {
 		if (vma->vm_file)
 			file_update_time(vma->vm_file);

Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-04-17 18:44:23.000000000 -0700
+++ linux-2.6/mm/mmap.c	2008-04-17 18:48:50.000000000 -0700
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -2033,6 +2034,7 @@ void exit_mmap(struct mm_struct *mm)
 	unsigned long end;
 
 	/* mm's last user has gone, and its about to be pulled down */
+	mmu_notifier_release(mm);
 	arch_exit_mmap(mm);
 
 	lru_add_drain();
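
(The do_wp_page reordering above guarantees that ->invalidate_page
runs after the COW copy but before old_page is released. A notifier
that keeps writable secondary mappings would use that window to
transfer its dirty bit, along these lines; example_zap_translation is
a made-up stand-in for whatever removes the driver's own mapping:)

/* invented helper: removes the driver's translation, returns the page
   that was mapped (or NULL) and whether the secondary pte was writable */
extern struct page *example_zap_translation(struct mmu_notifier *mn,
					    unsigned long address,
					    int *writable);

static void example_invalidate_page_dirty(struct mmu_notifier *mn,
					  struct mm_struct *mm,
					  unsigned long address)
{
	int writable;
	struct page *page;

	page = example_zap_translation(mn, address, &writable);

	/*
	 * The caller still holds a reference on the page here, so it
	 * can't have been freed yet and set_page_dirty is safe, as
	 * required by the invalidate_page documentation above.
	 */
	if (page && writable)
		set_page_dirty(page);
}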

Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/mm/mmu_notifier.c	2008-04-17 18:48:50.000000000 -0700
@@ -0,0 +1,126 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *  		Christoph Lameter
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/err.h>
+
+/*
+ * No synchronization. This function can only be called when a single
+ * process remains and performs the teardown.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+
+	while (unlikely(!hlist_empty(&mm->mmu_notifier_list))) {
+		mn = hlist_entry(mm->mmu_notifier_list.first,
+				 struct mmu_notifier,
+				 hlist);
+		hlist_del(&mn->hlist);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young
+ * can unmap the address and return 1 or 0 depending on whether the
+ * mapping previously existed.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+				     unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+	int young = 0;
+
+	hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->clear_flush_young)
+			young |= mn->ops->clear_flush_young(mn, mm, address);
+	}
+
+	return young;
+}
+
+void __mmu_notifier_invalidate_page(struct mm_struct *mm,
+				    unsigned long address)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_page)
+			mn->ops->invalidate_page(mn, mm, address);
+	}
+}
+
+void __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_start)
+			mn->ops->invalidate_range_start(mn, mm, start, end);
+	}
+}
+
+void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
+				  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+	struct hlist_node *n;
+
+	hlist_for_each_entry(mn, n, &mm->mmu_notifier_list, hlist) {
+		if (mn->ops->invalidate_range_end)
+			mn->ops->invalidate_range_end(mn, mm, start, end);
+	}
+}
+
+/*
+ * Must not hold mmap_sem nor any other VM-related lock when calling
+ * this registration function.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	int rc;
+
+	rc = mm_lock(mm);
+	if (rc)
+		return rc;
+	hlist_add_head(&mn->hlist, &mm->mmu_notifier_list);
+	mm_unlock(mm);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
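
(Usage sketch for registering against a possibly-foreign mm; the
mm_lock()/mm_unlock() calls used by mmu_notifier_register come from
another patch in this series, and struct example_dev is invented
here. The mm_users pin matches the unregister rule documented just
below:)

struct example_dev {
	struct mmu_notifier mn;
};

static int example_attach(struct example_dev *dev, struct mm_struct *mm)
{
	int err;

	atomic_inc(&mm->mm_users);	/* keep ->release from racing us */
	dev->mn.ops = &example_ops;	/* example_ops as sketched earlier */
	err = mmu_notifier_register(&dev->mn, mm);
	if (err)
		mmput(mm);		/* drop the pin on failure */
	return err;
}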
+/*
+ * mm_users can't go down to zero while mmu_notifier_unregister()
+ * runs, or it can race with ->release. So an mm_users pin must
+ * be held by the caller (if mm can be different from current->mm).
+ */
+int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	int rc;
+
+	BUG_ON(!atomic_read(&mm->mm_users));
+
+	rc = mm_lock(mm);
+	if (unlikely(rc))
+		return rc;
+	hlist_del(&mn->hlist);
+	mm_unlock(mm);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);

Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c	2008-04-17 18:37:58.000000000 -0700
+++ linux-2.6/mm/mprotect.c	2008-04-17 18:48:50.000000000 -0700
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
 #include
@@ -198,10 +199,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;

Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c	2008-04-17 18:38:46.000000000 -0700
+++ linux-2.6/mm/mremap.c	2008-04-17 18:48:50.000000000 -0700
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -74,7 +75,11 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
+	old_start = old_addr;
+	mmu_notifier_invalidate_range_start(vma->vm_mm,
+					    old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -114,6 +119,7 @@ static void move_ptes(struct vm_area_str
 	spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
+	mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
 	if (mapping)
 		up_write(&mapping->i_mmap_sem);
 }
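
(On the notifier side, the rule that invalidate_range_start/end may
nest or run concurrently is typically handled with a counter; a sketch
with invented names, similar in spirit to what a KVM-like user needs:)

struct example_smmu {
	struct mmu_notifier	mn;
	spinlock_t		lock;
	int			invalidate_count;	/* nested _start calls */
};

static void example_range_start(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long start, unsigned long end)
{
	struct example_smmu *s = container_of(mn, struct example_smmu, mn);

	spin_lock(&s->lock);
	s->invalidate_count++;	/* secondary faults must stall now */
	/* ... drop all secondary translations inside [start, end) ... */
	spin_unlock(&s->lock);
}

static void example_range_end(struct mmu_notifier *mn,
			      struct mm_struct *mm,
			      unsigned long start, unsigned long end)
{
	struct example_smmu *s = container_of(mn, struct example_smmu, mn);

	spin_lock(&s->lock);
	s->invalidate_count--;	/* faults may resume once this hits zero */
	spin_unlock(&s->lock);
}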

Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-04-17 18:39:11.000000000 -0700
+++ linux-2.6/mm/rmap.c	2008-04-17 18:48:50.000000000 -0700
@@ -49,6 +49,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 
@@ -283,7 +284,7 @@ static int page_referenced_one(struct pa
 	unsigned long address;
 	pte_t *pte;
 	spinlock_t *ptl;
-	int referenced = 0;
+	int referenced = 0, clear_flush_young = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
@@ -296,8 +297,11 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
-		referenced++;
+	} else {
+		clear_flush_young = 1;
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
+	}
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
@@ -307,6 +311,10 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
+
+	if (clear_flush_young)
+		referenced += mmu_notifier_clear_flush_young(mm, address);
+
 out:
 	return referenced;
 }
@@ -473,6 +481,10 @@ static int page_mkclean_one(struct page
 	}
 
 	pte_unmap_unlock(pte, ptl);
+
+	if (ret)
+		mmu_notifier_invalidate_page(mm, address);
+
 out:
 	return ret;
 }
@@ -725,8 +737,7 @@ static int try_to_unmap_one(struct page
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
+	if (!migration && (vma->vm_flags & VM_LOCKED)) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -788,6 +799,8 @@ static int try_to_unmap_one(struct page
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+	if (ret != SWAP_FAIL)
+		mmu_notifier_invalidate_page(mm, address);
 out:
 	return ret;
 }
@@ -826,7 +839,7 @@ static void try_to_unmap_cluster(unsigne
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
-	unsigned long end;
+	unsigned long start, end;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -847,6 +860,8 @@ static void try_to_unmap_cluster(unsigne
 	if (!pmd_present(*pmd))
 		return;
 
+	start = address;
+	mmu_notifier_invalidate_range_start(mm, start, end);
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/* Update high watermark before we lower rss */
@@ -879,6 +894,7 @@ static void try_to_unmap_cluster(unsigne
 
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	mmu_notifier_invalidate_range_end(mm, start, end);
 }
 
 static int try_to_unmap_anon(struct page *page, int migration)
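
(Finally, the aging side that page_referenced_one now reaches; a
sketch, with example_test_and_clear_young standing in for however the
driver tracks accessed bits in its own pte format:)

/* invented helper: test-and-clear the driver's accessed bit */
extern int example_test_and_clear_young(struct mmu_notifier *mn,
					unsigned long address);

static int example_clear_flush_young(struct mmu_notifier *mn,
				     struct mm_struct *mm,
				     unsigned long address)
{
	/*
	 * Return 1 if the secondary MMU observed an access since the
	 * last call, 0 otherwise. Hardware without an accessed bit
	 * can instead zap the translation and report whether one
	 * existed, as noted above __mmu_notifier_clear_flush_young.
	 */
	return example_test_and_clear_young(mn, address);
}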