From: Peter Zijlstra

Tracking of dirty pages in shared writeable mmap()s.

The idea is simple: write protect clean shared writeable pages, catch the
write fault, make the page writeable and set it dirty.  On page write-back,
clean all the PTE dirty bits and write protect the pages once again.

The implementation is a tad harder, mainly because the default
backing_dev_info capabilities were too loosely maintained.  Hence it is not
enough to test the backing_dev_info for cap_account_dirty.

The current heuristic is as follows; a VMA is eligible when:

 - it is shared writeable:
     (vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)
 - it is not a 'special' mapping:
     (vm_flags & (VM_PFNMAP|VM_INSERTPAGE)) == 0
 - the backing_dev_info is cap_account_dirty:
     mapping_cap_account_dirty(vma->vm_file->f_mapping)
 - f_op->mmap() didn't change the default page protection

Pages from remap_pfn_range() are explicitly excluded because their COW
semantics are already horrid enough (see vm_normal_page() in do_wp_page())
and because they don't have a backing store anyway.

mprotect() is taught about the new behaviour as well.  However, it fudges
the last condition.

Cleaning the pages on write-back is done with page_mkclean(), a new rmap
call.  It cleans and wrprotects all PTEs of dirty accountable pages.

Finally, in fs/buffer.c:try_to_free_buffers(), remove clear_page_dirty()
from under ->private_lock.  This seems to be safe, since ->private_lock is
used to serialize access to the buffers, not the page itself.  This is
needed because clear_page_dirty() will call into page_mkclean() and would
thereby violate locking order.

Signed-off-by: Peter Zijlstra
Cc: Hugh Dickins
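
As an aside (editorial illustration, not part of the patch; the file name and
mapping length are arbitrary), the kind of mapping this series starts to track
can be produced from userspace as shown below.  While the page is clean the
kernel now maps it read-only; the first store takes a minor fault, which makes
the PTE writeable and dirty; msync()/write-back cleans the page and write
protects it again.

/* cc dirty-demo.c -o dirty-demo */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const size_t len = 4096;
	int fd = open("/tmp/dirty-demo", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, len) < 0) {
		perror("setup");
		return 1;
	}

	/* shared writeable mapping: eligible for dirty tracking */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(p, 'x', len);	/* first store write-faults, page goes dirty */
	msync(p, len, MS_SYNC);	/* write-back cleans and re-write-protects */

	munmap(p, len);
	close(fd);
	return 0;
}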
---

 include/linux/mm.h |   15 +++++----------
 mm/mmap.c          |    2 +-
 mm/mprotect.c      |    9 +++++----
 mm/rmap.c          |   13 ++++++-------
 4 files changed, 17 insertions(+), 22 deletions(-)

diff -puN include/linux/mm.h~mm-tracking-shared-dirty-pages-update include/linux/mm.h
--- a/include/linux/mm.h~mm-tracking-shared-dirty-pages-update
+++ a/include/linux/mm.h
@@ -809,16 +809,13 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-#define VM_NOTIFY_NO_PROT	0x01
-#define VM_NOTIFY_NO_MKWRITE	0x02
-
 /*
  * Some shared mappigns will want the pages marked read-only
  * to track write events. If so, we'll downgrade vm_page_prot
  * to the private version (using protection_map[] without the
  * VM_SHARED bit).
  */
-static inline int vma_wants_writenotify(struct vm_area_struct *vma, int flags)
+static inline int vma_wants_writenotify(struct vm_area_struct *vma)
 {
 	unsigned int vm_flags = vma->vm_flags;
 
@@ -827,15 +824,13 @@ static inline int vma_wants_writenotify(
 		return 0;
 
 	/* The backer wishes to know when pages are first written to? */
-	if (!(flags & VM_NOTIFY_NO_MKWRITE) &&
-			vma->vm_ops && vma->vm_ops->page_mkwrite)
+	if (vma->vm_ops && vma->vm_ops->page_mkwrite)
 		return 1;
 
 	/* The open routine did something to the protections already? */
-	if (!(flags & VM_NOTIFY_NO_PROT) &&
-			pgprot_val(vma->vm_page_prot) !=
-			pgprot_val(protection_map[vm_flags &
-			(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
+	if (pgprot_val(vma->vm_page_prot) !=
+	    pgprot_val(protection_map[vm_flags &
+	    (VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]))
 		return 0;
 
 	/* Specialty mapping? */
diff -puN mm/mmap.c~mm-tracking-shared-dirty-pages-update mm/mmap.c
--- a/mm/mmap.c~mm-tracking-shared-dirty-pages-update
+++ a/mm/mmap.c
@@ -1107,7 +1107,7 @@ munmap_back:
 	pgoff = vma->vm_pgoff;
 	vm_flags = vma->vm_flags;
 
-	if (vma_wants_writenotify(vma, 0))
+	if (vma_wants_writenotify(vma))
 		vma->vm_page_prot =
 			protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];
 
diff -puN mm/mprotect.c~mm-tracking-shared-dirty-pages-update mm/mprotect.c
--- a/mm/mprotect.c~mm-tracking-shared-dirty-pages-update
+++ a/mm/mprotect.c
@@ -123,7 +123,6 @@ mprotect_fixup(struct vm_area_struct *vm
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
-	unsigned long mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
 	pgoff_t pgoff;
 	int error;
 
@@ -180,9 +179,11 @@ success:
 	 * held in write mode.
	 */
 	vma->vm_flags = newflags;
-	if (vma_wants_writenotify(vma, VM_NOTIFY_NO_PROT))
-		mask &= ~VM_SHARED;
-	vma->vm_page_prot = protection_map[newflags & mask];
+	vma->vm_page_prot = protection_map[newflags &
+		(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
+	if (vma_wants_writenotify(vma))
+		vma->vm_page_prot = protection_map[newflags &
+			(VM_READ|VM_WRITE|VM_EXEC)];
 
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
diff -puN mm/rmap.c~mm-tracking-shared-dirty-pages-update mm/rmap.c
--- a/mm/rmap.c~mm-tracking-shared-dirty-pages-update
+++ a/mm/rmap.c
@@ -440,8 +440,6 @@ static int page_mkclean_one(struct page
 	unsigned long address;
 	pte_t *pte, entry;
 	spinlock_t *ptl;
-	int writefault = vma_wants_writenotify(vma,
-			VM_NOTIFY_NO_PROT|VM_NOTIFY_NO_MKWRITE);
 	int ret = 0;
 
 	address = vma_address(page, vma);
@@ -452,13 +450,12 @@ static int page_mkclean_one(struct page
 	if (!pte)
 		goto out;
 
-	if (!(pte_dirty(*pte) || (writefault && pte_write(*pte))))
+	if (!pte_dirty(*pte) && !pte_write(*pte))
 		goto unlock;
 
 	entry = ptep_get_and_clear(mm, address, pte);
 	entry = pte_mkclean(entry);
-	if (writefault)
-		entry = pte_wrprotect(entry);
+	entry = pte_wrprotect(entry);
 	ptep_establish(vma, address, pte, entry);
 	lazy_mmu_prot_update(entry);
 	ret = 1;
@@ -479,8 +476,10 @@ static int page_mkclean_file(struct addr
 	BUG_ON(PageAnon(page));
 
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
-		ret += page_mkclean_one(page, vma);
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (vma->vm_flags & VM_SHARED)
+			ret += page_mkclean_one(page, vma);
+	}
 	spin_unlock(&mapping->i_mmap_lock);
 	return ret;
 }
_
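
[Editor's aside, not part of the patch above.]  The "backer wishes to know
when pages are first written to" test in vma_wants_writenotify() refers to the
->page_mkwrite() callback in struct vm_operations_struct.  A minimal sketch of
a mapping that opts in follows; example_mmap(), example_vm_ops and
example_page_mkwrite() are hypothetical names, and using filemap_nopage() for
read faults assumes the generic page cache path of this kernel generation.

#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Called on the write fault, before the PTE is made writeable.  A real
 * implementation would reserve backing blocks or dirty the page's buffers
 * here; returning 0 lets the fault proceed.
 */
static int example_page_mkwrite(struct vm_area_struct *vma, struct page *page)
{
	return 0;
}

static struct vm_operations_struct example_vm_ops = {
	.nopage		= filemap_nopage,	/* generic read-fault handler */
	.page_mkwrite	= example_page_mkwrite,
};

/*
 * An f_op->mmap() implementation installing these ops: because
 * ->page_mkwrite is set, vma_wants_writenotify() returns 1 and shared
 * writeable pages of this mapping start out write protected.
 */
static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_ops = &example_vm_ops;
	return 0;
}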