Subject: mmu_notifier core logic

This patch implements a notifier for device drivers that establish their own
references to pages (KVM, GRU, XPmem, RDMA/Infiniband, DMA engines, etc.).
These references are unknown to the VM (in some sense these subsystems have
their own MMU, hence "mmu notifier"). The callbacks allow the device driver to
release its external references when the VM requests it. This enables swapping
and page migration, and allows remapping, permission changes and so on for
externally mapped memory. It also becomes possible to avoid pinning or
mlocking pages (commonly done to stop the VM from unmapping device mapped
pages).

A device driver must subscribe to a process using

	mmu_notifier_register(struct mmu_notifier *, struct mm_struct *)

The VM will then perform callbacks for operations that unmap pages or change
page permissions in that address space. When the process terminates, the
release() method is called.

Callbacks are performed before and after the unmapping action of the VM:

	start()	before
	end()	after

The device driver must hold off establishing new references to pages in the
specified range between a call to start() and the subsequent call to end().
This allows the VM to ensure that no concurrent driver accesses are performed
on an address range while it is remapping or unmapping that range.

Signed-off-by: Andrea Arcangeli
Signed-off-by: Christoph Lameter

---
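For reviewers, a minimal sketch (illustration only, not part of the patch) of
how a driver with its own secondary MMU might consume this API. It relies only
on the interfaces introduced below -- mmu_notifier_register() and the
release()/start()/end() callbacks; the my_dev_* names and the driver-side
bookkeeping are hypothetical. The key contract is that between start() and the
matching end() the driver must not establish new references to pages in the
range.

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mmu_notifier.h>

struct my_dev_mmu {
	struct mmu_notifier mn;	/* embedded; recovered via container_of() */
	/* ... driver-private shadow page table state ... */
};

static void my_dev_start(struct mmu_notifier *mn, struct mm_struct *mm,
			 unsigned long start, unsigned long end)
{
	struct my_dev_mmu *d = container_of(mn, struct my_dev_mmu, mn);

	/*
	 * Tear down all secondary mappings in [start, end) and block the
	 * creation of new ones until the matching end() callback.
	 */
	(void)d;
}

static void my_dev_end(struct mmu_notifier *mn, struct mm_struct *mm,
		       unsigned long start, unsigned long end)
{
	/* The range is consistent again: new secondary mappings may be set up. */
}

static void my_dev_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
	/* The process is exiting: drop every secondary mapping for this mm. */
}

static const struct mmu_notifier_ops my_dev_ops = {
	.release	= my_dev_release,
	.start		= my_dev_start,
	.end		= my_dev_end,
};

/* Must be called without mmap_sem or any other VM lock held. */
static int my_dev_attach(struct my_dev_mmu *d)
{
	d->mn.ops = &my_dev_ops;
	return mmu_notifier_register(&d->mn, current->mm);
}

Note that mmu_notifier_register() takes mmap_sem and all i_mmap_sem /
anon_vma semaphores of the mm via mm_lock(), which is why the caller must not
hold any of them.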
 include/linux/mm_types.h     |    3 
 include/linux/mmu_notifier.h |  167 ++++++++++++++++++++++++++++++++++++
 kernel/fork.c                |    2 
 mm/Kconfig                   |    4 
 mm/Makefile                  |    1 
 mm/filemap_xip.c             |    3 
 mm/hugetlb.c                 |    3 
 mm/memory.c                  |   45 +++++++--
 mm/mmap.c                    |    2 
 mm/mmu_notifier.c            |  196 +++++++++++++++++++++++++++++++++++++++++++
 mm/mprotect.c                |    3 
 mm/mremap.c                  |    5 +
 mm/rmap.c                    |   33 +++++--
 mm/slub.c                    |    2 
 14 files changed, 452 insertions(+), 17 deletions(-)

Index: linux-2.6/include/linux/mm_types.h
===================================================================
--- linux-2.6.orig/include/linux/mm_types.h	2008-04-21 22:45:24.000000000 -0700
+++ linux-2.6/include/linux/mm_types.h	2008-04-21 22:45:36.000000000 -0700
@@ -225,6 +225,9 @@ struct mm_struct {
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 	struct mem_cgroup *mem_cgroup;
 #endif
+#ifdef CONFIG_MMU_NOTIFIER
+	struct list_head mmu_notifier_list;
+#endif
 };
 
 #endif /* _LINUX_MM_TYPES_H */
Index: linux-2.6/include/linux/mmu_notifier.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/mmu_notifier.h	2008-04-21 22:45:36.000000000 -0700
@@ -0,0 +1,167 @@
+#ifndef _LINUX_MMU_NOTIFIER_H
+#define _LINUX_MMU_NOTIFIER_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm_types.h>
+
+struct mmu_notifier;
+struct mmu_notifier_ops;
+
+#ifdef CONFIG_MMU_NOTIFIER
+
+struct mmu_notifier_ops {
+	/*
+	 * Called after all other threads have terminated and the executing
+	 * thread is the only remaining execution thread. There are no
+	 * users of the mm_struct remaining.
+	 */
+	void (*release)(struct mmu_notifier *mn, struct mm_struct *mm);
+
+	/*
+	 * clear_flush_young is called after the VM test-and-clears the
+	 * young/accessed bitflag in the pte. This way the VM provides
+	 * proper aging for accesses to the page through the secondary
+	 * MMUs and not only for those through the Linux pte.
+	 */
+	int (*clear_flush_young)(struct mmu_notifier *mn, struct mm_struct *mm,
+				 unsigned long address);
+
+	/*
+	 * start() and end() must be paired and are called only while the
+	 * mmap_sem and/or the semaphores protecting the reverse maps are
+	 * held. Both functions may sleep. The subsystem must guarantee that
+	 * no additional references to pages in the range are established
+	 * between the call to start() and the matching call to end().
+	 *
+	 * Invalidation of multiple concurrent ranges may be permitted by
+	 * the driver, or the driver may exclude other invalidations from
+	 * proceeding by blocking in new start() callbacks that overlap
+	 * invalidations already in progress. Either way, additional
+	 * references to the range may only be established again once all
+	 * end() callbacks have been called.
+	 *
+	 * start() is called when all pages in the range are still mapped and
+	 * have at least a refcount of one.
+	 *
+	 * end() is called when all pages in the range have been unmapped
+	 * and the refcount has been dropped.
+	 *
+	 * The VM will remove the page table entries and potentially the
+	 * page between start() and end(). If the page must not be freed
+	 * because of pending I/O or other circumstances then the start()
+	 * callback (or the initial mapping by the driver) must make sure
+	 * that the refcount is kept elevated.
+	 *
+	 * If the driver increases the refcount when the pages are initially
+	 * mapped into an address space then either start() or end() may
+	 * decrease the refcount. If the refcount is decreased on start()
+	 * then the VM can free pages as page table entries are removed.
+	 * If the refcount is only dropped on end() then the driver itself
+	 * will drop the last refcount. Pages will no longer be referenced
+	 * by the address space but may still be referenced by the driver
+	 * until the last refcount is dropped.
+	 */
+	void (*start)(struct mmu_notifier *mn, struct mm_struct *mm,
+		      unsigned long start, unsigned long end);
+	void (*end)(struct mmu_notifier *mn, struct mm_struct *mm,
+		    unsigned long start, unsigned long end);
+};
+
+/*
+ * The notifier chains are protected by mmap_sem and/or the reverse map
+ * semaphores. Notifier chains are only changed when all reverse map
+ * semaphores and the mmap_sem lock are taken.
+ *
+ * Therefore notifier chains can only be traversed when either
+ *
+ * 1. mmap_sem is held.
+ * 2. One of the reverse map locks is held (i_mmap_sem or anon_vma->sem).
+ * 3. No other thread can concurrently access the list (release).
+ */
+struct mmu_notifier {
+	struct list_head list;
+	const struct mmu_notifier_ops *ops;
+};
+
+static inline int mm_has_notifiers(struct mm_struct *mm)
+{
+	return unlikely(!list_empty(&mm->mmu_notifier_list));
+}
+
+extern int mmu_notifier_register(struct mmu_notifier *mn,
+				 struct mm_struct *mm);
+extern int mmu_notifier_unregister(struct mmu_notifier *mn,
+				   struct mm_struct *mm);
+extern void __mmu_notifier_release(struct mm_struct *mm);
+extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+					    unsigned long address);
+extern void __mmu_notifier_start(struct mm_struct *mm,
+				 unsigned long start, unsigned long end);
+extern void __mmu_notifier_end(struct mm_struct *mm,
+			       unsigned long start, unsigned long end);
+
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_release(mm);
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+						 unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_flush_young(mm, address);
+	return 0;
+}
+
+static inline void mmu_notifier_start(struct mm_struct *mm,
+				      unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_start(mm, start, end);
+}
+
+static inline void mmu_notifier_end(struct mm_struct *mm,
+				    unsigned long start, unsigned long end)
+{
+	if (mm_has_notifiers(mm))
+		__mmu_notifier_end(mm, start, end);
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+	INIT_LIST_HEAD(&mm->mmu_notifier_list);
+}
+
+#else /* CONFIG_MMU_NOTIFIER */
+
+static inline void mmu_notifier_release(struct mm_struct *mm)
+{
+}
+
+static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
+						 unsigned long address)
+{
+	return 0;
+}
+
+static inline void mmu_notifier_start(struct mm_struct *mm,
+				      unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_end(struct mm_struct *mm,
+				    unsigned long start, unsigned long end)
+{
+}
+
+static inline void mmu_notifier_mm_init(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_MMU_NOTIFIER */
+
+#endif /* _LINUX_MMU_NOTIFIER_H */
Index: linux-2.6/kernel/fork.c
===================================================================
--- linux-2.6.orig/kernel/fork.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/kernel/fork.c	2008-04-21 22:45:36.000000000 -0700
@@ -53,6 +53,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
@@ -362,6 +363,7 @@ static struct mm_struct * mm_init(struct
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
 		return mm;
 	}
Index: linux-2.6/mm/Kconfig
===================================================================
--- linux-2.6.orig/mm/Kconfig	2008-04-21 22:42:31.000000000 -0700
+++ linux-2.6/mm/Kconfig	2008-04-21 22:45:36.000000000 -0700
@@ -193,3 +193,7 @@ config NR_QUICK
 config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
+
+config MMU_NOTIFIER
+	def_bool y
+	bool "MMU notifier for devices/subsystems mapping memory"
Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile	2008-04-21 22:42:31.000000000 -0700
+++ linux-2.6/mm/Makefile	2008-04-21 22:45:36.000000000 -0700
@@ -33,4 +33,5 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
Index: linux-2.6/mm/filemap_xip.c
===================================================================
--- linux-2.6.orig/mm/filemap_xip.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/filemap_xip.c	2008-04-21 22:45:36.000000000 -0700
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 /*
  * We do use our own empty page to avoid interference with other users
@@ -192,6 +193,7 @@ __xip_unmap (struct address_space * mapp
 		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 		pte = page_check_address(page, mm, address, &ptl);
 		if (pte) {
+			mmu_notifier_start(mm, address, address + PAGE_SIZE);
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
@@ -200,6 +202,7 @@ __xip_unmap (struct address_space * mapp
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
 			page_cache_release(page);
+			mmu_notifier_end(mm, address, address + PAGE_SIZE);
 		}
 	}
 	up_read(&mapping->i_mmap_sem);
Index: linux-2.6/mm/hugetlb.c
===================================================================
--- linux-2.6.orig/mm/hugetlb.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/hugetlb.c	2008-04-21 22:45:36.000000000 -0700
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
 #include
@@ -799,6 +800,7 @@ void __unmap_hugepage_range(struct vm_ar
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	mmu_notifier_start(mm, start, end);
 	spin_lock(&mm->page_table_lock);
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
@@ -819,6 +821,7 @@ void __unmap_hugepage_range(struct vm_ar
 	}
 	spin_unlock(&mm->page_table_lock);
 	flush_tlb_range(vma, start, end);
+	mmu_notifier_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
 		list_del(&page->lru);
 		put_page(page);
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/memory.c	2008-04-21 22:45:36.000000000 -0700
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
@@ -601,6 +602,7 @@ int copy_page_range(struct mm_struct *ds
 	unsigned long next;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int ret = 0;
 
 	/*
 	 * Don't copy ptes where a page fault will fill them correctly.
@@ -610,12 +612,15 @@ int copy_page_range(struct mm_struct *ds
 	 */
 	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
 		if (!vma->anon_vma)
-			return 0;
+			goto out;
 	}
 
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_start(src_mm, addr, end);
+
 	dst_pgd = pgd_offset(dst_mm, addr);
 	src_pgd = pgd_offset(src_mm, addr);
 	do {
@@ -623,10 +628,17 @@ int copy_page_range(struct mm_struct *ds
 		if (pgd_none_or_clear_bad(src_pgd))
 			continue;
 		if (copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
-						vma, addr, next))
-			return -ENOMEM;
+						vma, addr, next)) {
+			ret = -ENOMEM;
+			break;
+		}
 	} while (dst_pgd++, src_pgd++, addr = next, addr != end);
-	return 0;
+
+	if (is_cow_mapping(vma->vm_flags))
+		mmu_notifier_end(src_mm, vma->vm_start, end);
+
+out:
+	return ret;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -829,6 +841,7 @@ unsigned long unmap_vmas(struct vm_area_
 	tlb = tlb_gather_mmu(mm, 0);
 	update_hiwater_rss(mm);
 	fullmm = tlb->fullmm;
+	mmu_notifier_start(mm, start_addr, end_addr);
 
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
 		unsigned long end;
@@ -881,6 +894,7 @@ unsigned long unmap_vmas(struct vm_area_
 	}
 	tlb_finish_mmu(tlb, start_addr, end_addr);
 out:
+	mmu_notifier_end(mm, start_addr, end_addr);
 	return start;	/* which is now the end (or restart) address */
 }
 
@@ -1369,6 +1383,7 @@ int remap_pfn_range(struct vm_area_struc
 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
+	mmu_notifier_start(mm, addr, end);
 	flush_cache_range(vma, addr, end);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1377,6 +1392,7 @@ int remap_pfn_range(struct vm_area_struc
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_end(mm, addr, end);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1461,9 +1477,11 @@ int apply_to_page_range(struct mm_struct
 	pgd_t *pgd;
 	unsigned long next;
 	unsigned long end = addr + size;
+	unsigned long start = addr;
 	int err;
 
 	BUG_ON(addr >= end);
+	mmu_notifier_start(mm, start, end);
 	pgd = pgd_offset(mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
@@ -1471,6 +1489,7 @@ int apply_to_page_range(struct mm_struct
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
+	mmu_notifier_end(mm, start, end);
 	return err;
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -1610,9 +1629,12 @@ static int do_wp_page(struct mm_struct *
 		 */
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+		new_page = NULL;
+		if (!pte_same(*page_table, orig_pte)) {
+			pte_unmap_unlock(page_table, ptl);
+			goto check_dirty;
+		}
 		page_cache_release(old_page);
-		if (!pte_same(*page_table, orig_pte))
-			goto unlock;
 
 		page_mkwrite = 1;
 	}
@@ -1628,7 +1650,9 @@ static int do_wp_page(struct mm_struct *
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
-		goto unlock;
+		old_page = new_page = NULL;
+		pte_unmap_unlock(page_table, ptl);
+		goto check_dirty;
 	}
 
 	/*
@@ -1653,6 +1677,7 @@ gotten:
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
+	mmu_notifier_start(mm, address, address + PAGE_SIZE);
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
 		if (old_page) {
@@ -1684,12 +1709,14 @@ gotten:
 	} else
 		mem_cgroup_uncharge_page(new_page);
 
+	pte_unmap_unlock(page_table, ptl);
+	mmu_notifier_end(mm, address, address + PAGE_SIZE);
+check_dirty:
 	if (new_page)
 		page_cache_release(new_page);
 	if (old_page)
 		page_cache_release(old_page);
-unlock:
-	pte_unmap_unlock(page_table, ptl);
+
 	if (dirty_page) {
 		if (vma->vm_file)
 			file_update_time(vma->vm_file);
Index: linux-2.6/mm/mmap.c
===================================================================
--- linux-2.6.orig/mm/mmap.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/mmap.c	2008-04-21 22:45:36.000000000 -0700
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
@@ -2032,6 +2033,7 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	arch_exit_mmap(mm);
 
+	mmu_notifier_release(mm);
 	lru_add_drain();
 	flush_cache_mm(mm);
Index: linux-2.6/mm/mmu_notifier.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/mm/mmu_notifier.c	2008-04-21 22:50:28.000000000 -0700
@@ -0,0 +1,196 @@
+/*
+ *  linux/mm/mmu_notifier.c
+ *
+ *  Copyright (C) 2008  Qumranet, Inc.
+ *  Copyright (C) 2008  SGI
+ *             Christoph Lameter
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2. See
+ *  the COPYING file in the top-level directory.
+ */
+
+#include <linux/mmu_notifier.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+/*
+ * No synchronization. This function can only be called when only a single
+ * process remains that performs the teardown.
+ */
+void __mmu_notifier_release(struct mm_struct *mm)
+{
+	struct mmu_notifier *mn;
+
+	while (unlikely(!list_empty(&mm->mmu_notifier_list))) {
+		mn = list_entry(mm->mmu_notifier_list.next,
+				struct mmu_notifier,
+				list);
+		list_del(&mn->list);
+		if (mn->ops->release)
+			mn->ops->release(mn, mm);
+	}
+}
+
+/*
+ * If no young bitflag is supported by the hardware, ->clear_flush_young can
+ * unmap the address and return 1 or 0 depending on whether the mapping
+ * previously existed or not.
+ */
+int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
+				     unsigned long address)
+{
+	struct mmu_notifier *mn;
+	int young = 0;
+
+	list_for_each_entry(mn, &mm->mmu_notifier_list, list) {
+		if (mn->ops->clear_flush_young)
+			young |= mn->ops->clear_flush_young(mn, mm, address);
+	}
+
+	return young;
+}
+
+void __mmu_notifier_start(struct mm_struct *mm,
+			  unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+
+	list_for_each_entry(mn, &mm->mmu_notifier_list, list) {
+		if (mn->ops->start)
+			mn->ops->start(mn, mm, start, end);
+	}
+}
+
+void __mmu_notifier_end(struct mm_struct *mm,
+			unsigned long start, unsigned long end)
+{
+	struct mmu_notifier *mn;
+
+	list_for_each_entry(mn, &mm->mmu_notifier_list, list) {
+		if (mn->ops->end)
+			mn->ops->end(mn, mm, start, end);
+	}
+}
+
+static int cmp_rwsem(const void *a, const void *b)
+{
+	struct rw_semaphore * const *pa = a;
+	struct rw_semaphore * const *pb = b;
+	struct rw_semaphore *va = *pa;
+	struct rw_semaphore *vb = *pb;
+
+	if (va == vb)
+		return 0;
+	if (va > vb)
+		return 1;
+	return -1;
+}
+
+static void scan_locks(struct mm_struct *mm, struct rw_semaphore **locks,
+			int anon, int lock)
+{
+	struct vm_area_struct *vma;
+	struct rw_semaphore *last;
+	int i = 0;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (anon) {
+			if (vma->anon_vma)
+				locks[i++] = &vma->anon_vma->sem;
+		} else {
+			if (vma->vm_file && vma->vm_file->f_mapping)
+				locks[i++] = &vma->vm_file->f_mapping->i_mmap_sem;
+		}
+	}
+
+	if (!i)
+		return;
+
+	sort(locks, i, sizeof(struct rw_semaphore *), cmp_rwsem, NULL);
+
+	last = NULL;
+	while (i-- > 0) {
+		/* Multiple vmas may use the same lock. */
+		if (last != locks[i]) {
+			if (lock)
+				down_write(locks[i]);
+			else
+				up_write(locks[i]);
+
+			last = locks[i];
+		}
+	}
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults. The holder
+ * must not hold any mm related lock. A single task can't take more
+ * than one mm lock in a row or it would deadlock.
+ */
+static void mm_lock(struct mm_struct *mm, struct rw_semaphore **locks)
+{
+	down_write(&mm->mmap_sem);
+	scan_locks(mm, locks, 0, 1);
+	scan_locks(mm, locks, 1, 1);
+}
+
+static void mm_unlock(struct mm_struct *mm, struct rw_semaphore **locks)
+{
+	scan_locks(mm, locks, 0, 0);
+	scan_locks(mm, locks, 1, 0);
+	up_write(&mm->mmap_sem);
+}
+
+/*
+ * Must not hold mmap_sem nor any other VM related lock when calling
+ * this registration function.
+ */
+int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct rw_semaphore **locks;
+
+	locks = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count);
+	if (!locks)
+		return -ENOMEM;
+
+	mm_lock(mm, locks);
+	list_add(&mn->list, &mm->mmu_notifier_list);
+	mm_unlock(mm, locks);
+
+	vfree(locks);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_register);
+
+/*
+ * mm_users can't go down to zero while mmu_notifier_unregister()
+ * runs or it can race with ->release. So a mm_users pin must
+ * be taken by the caller (if mm can be different from current->mm).
+ */
+int mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+	struct rw_semaphore **locks;
+
+	BUG_ON(!atomic_read(&mm->mm_users));
+
+	locks = vmalloc(sizeof(struct rw_semaphore *) * mm->map_count);
+	if (!locks)
+		return -ENOMEM;
+
+	mm_lock(mm, locks);
+	list_del(&mn->list);
+	mm_unlock(mm, locks);
+
+	vfree(locks);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
Index: linux-2.6/mm/mprotect.c
===================================================================
--- linux-2.6.orig/mm/mprotect.c	2008-04-21 22:42:31.000000000 -0700
+++ linux-2.6/mm/mprotect.c	2008-04-21 22:45:36.000000000 -0700
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
 #include
@@ -198,10 +199,12 @@ success:
 		dirty_accountable = 1;
 	}
 
+	mmu_notifier_start(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
 	else
 		change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable);
+	mmu_notifier_end(mm, start, end);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
Index: linux-2.6/mm/mremap.c
===================================================================
--- linux-2.6.orig/mm/mremap.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/mremap.c	2008-04-21 22:45:36.000000000 -0700
@@ -18,6 +18,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 #include
 #include
@@ -74,7 +75,10 @@ static void move_ptes(struct vm_area_str
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
+	unsigned long old_start;
 
+	old_start = old_addr;
+	mmu_notifier_start(vma->vm_mm, old_start, old_end);
 	if (vma->vm_file) {
 		/*
 		 * Subtle point from Rajesh Venkatasubramanian: before
@@ -114,6 +118,7 @@ static void move_ptes(struct vm_area_str
 	spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
+	mmu_notifier_end(vma->vm_mm, old_start, old_end);
 	if (mapping)
 		up_write(&mapping->i_mmap_sem);
 }
Index: linux-2.6/mm/rmap.c
===================================================================
--- linux-2.6.orig/mm/rmap.c	2008-04-21 22:45:36.000000000 -0700
+++ linux-2.6/mm/rmap.c	2008-04-21 22:45:36.000000000 -0700
@@ -49,6 +49,7 @@
 #include
 #include
 #include
+#include <linux/mmu_notifier.h>
 
 #include
@@ -284,6 +285,7 @@ static int page_referenced_one(struct pa
 	pte_t *pte;
 	spinlock_t *ptl;
 	int referenced = 0;
+	int clear_flush_young = 0;
 
 	address = vma_address(page, vma);
 	if (address == -EFAULT)
@@ -296,8 +298,11 @@ static int page_referenced_one(struct pa
 	if (vma->vm_flags & VM_LOCKED) {
 		referenced++;
 		*mapcount = 1;	/* break early from loop */
-	} else if (ptep_clear_flush_young(vma, address, pte))
-		referenced++;
+	} else {
+		clear_flush_young = 1;
+		if (ptep_clear_flush_young(vma, address, pte))
+			referenced++;
+	}
 
 	/* Pretend the page is referenced if the task has the
 	   swap token and is in the middle of a page fault. */
@@ -307,6 +312,10 @@ static int page_referenced_one(struct pa
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
+
+	if (clear_flush_young)
+		referenced += mmu_notifier_clear_flush_young(mm, address);
+
 out:
 	return referenced;
 }
@@ -457,9 +466,10 @@ static int page_mkclean_one(struct page
 	if (address == -EFAULT)
 		goto out;
 
+	mmu_notifier_start(mm, address, address + PAGE_SIZE);
 	pte = page_check_address(page, mm, address, &ptl);
 	if (!pte)
-		goto out;
+		goto out_notifier;
 
 	if (pte_dirty(*pte) || pte_write(*pte)) {
 		pte_t entry;
@@ -473,6 +483,10 @@ static int page_mkclean_one(struct page
 	}
 
 	pte_unmap_unlock(pte, ptl);
+
+out_notifier:
+	mmu_notifier_end(mm, address, address + PAGE_SIZE);
+
 out:
 	return ret;
 }
@@ -716,17 +730,17 @@ static int try_to_unmap_one(struct page
 	if (address == -EFAULT)
 		goto out;
 
+	mmu_notifier_start(mm, address, address + PAGE_SIZE);
 	pte = page_check_address(page, mm, address, &ptl);
 	if (!pte)
-		goto out;
+		goto out_notifier;
 
 	/*
 	 * If the page is mlock()d, we cannot swap it out.
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
+	if (!migration && (vma->vm_flags & VM_LOCKED)) {
 		ret = SWAP_FAIL;
 		goto out_unmap;
 	}
@@ -788,6 +802,8 @@ static int try_to_unmap_one(struct page
 
 out_unmap:
 	pte_unmap_unlock(pte, ptl);
+out_notifier:
+	mmu_notifier_end(mm, address, address + PAGE_SIZE);
 out:
 	return ret;
 }
@@ -826,7 +842,7 @@ static void try_to_unmap_cluster(unsigne
 	spinlock_t *ptl;
 	struct page *page;
 	unsigned long address;
-	unsigned long end;
+	unsigned long start, end;
 
 	address = (vma->vm_start + cursor) & CLUSTER_MASK;
 	end = address + CLUSTER_SIZE;
@@ -847,6 +863,8 @@ static void try_to_unmap_cluster(unsigne
 	if (!pmd_present(*pmd))
 		return;
 
+	start = address;
+	mmu_notifier_start(mm, start, end);
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/* Update high watermark before we lower rss */
@@ -879,6 +897,7 @@ static void try_to_unmap_cluster(unsigne
 		(*mapcount)--;
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	mmu_notifier_end(mm, start, end);
 }
 
 static int try_to_unmap_anon(struct page *page, int migration)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-04-23 11:52:02.000000000 -0700
+++ linux-2.6/mm/slub.c	2008-04-23 11:52:13.000000000 -0700
@@ -2407,7 +2407,7 @@ static inline int kmem_cache_close(struc
 	for_each_node_state(node, N_NORMAL_MEMORY) {
 		struct kmem_cache_node *n = get_node(s, node);
 
-		n->nr_partial -= free_list(s, n, &n->partial);
+		n->nr_partial = free_list(s, n, &n->partial);
 		if (slabs_node(s, node))
 			return 1;
 	}
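
As with the example in the cover text, the following is an illustrative sketch
only, not part of the patch: it shows the teardown side for the hypothetical
my_dev driver. Per the comment above mmu_notifier_unregister(), mm_users must
not reach zero while unregistration runs, so the caller pins it first;
atomic_inc_not_zero()/mmput() are used for that, and my_dev_mmu/my_dev_detach
are made-up names. Note that mmu_notifier_unregister() can return -ENOMEM if
the vmalloc'd lock array cannot be allocated; a real driver would need a
fallback for that case.

/* Hedged sketch: detach the hypothetical driver from an mm. */
static void my_dev_detach(struct my_dev_mmu *d, struct mm_struct *mm)
{
	/* Pin mm_users so unregistration cannot race with ->release(). */
	if (!atomic_inc_not_zero(&mm->mm_users))
		return;	/* mm is already being torn down; ->release() cleans up */

	mmu_notifier_unregister(&d->mn, mm);
	mmput(mm);
}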