From: Peter Zijlstra

People expressed the need to track dirty pages in shared mappings.

Linus outlined the general idea of doing that through making clean
writable pages write-protected and taking the write fault.

This patch does exactly that: it makes pages in a shared writable
mapping write-protected.  On write fault the pages are marked dirty and
made writable.  When the pages get synced with their backing store, the
write-protection is re-instated.

It survives a simple test and shows the dirty pages in /proc/vmstat.

Signed-off-by: Peter Zijlstra
Cc: Nick Piggin
Cc: Hugh Dickins
Cc: Christoph Lameter
Signed-off-by: Andrew Morton
---
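
For reference, below is a minimal userspace sketch of the kind of check
described above.  It is not the actual test mentioned in the changelog;
the file name and mapping size are arbitrary.  It dirties every page of
a shared writable file mapping and prints nr_dirty from /proc/vmstat
before the writes, after the writes, and after msync(); unrelated
writeback activity on the system will of course perturb the numbers.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define LEN	(4UL << 20)	/* 4MB shared mapping */

/* Return the nr_dirty counter from /proc/vmstat, or -1 on error. */
static long nr_dirty(void)
{
	char line[128];
	long val = -1;
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f)
		return -1;
	while (fgets(line, sizeof(line), f))
		if (sscanf(line, "nr_dirty %ld", &val) == 1)
			break;
	fclose(f);
	return val;
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = open("testfile", O_RDWR | O_CREAT | O_TRUNC, 0644);
	unsigned long i;
	char *map;

	if (fd < 0 || ftruncate(fd, LEN) < 0) {
		perror("testfile");
		return 1;
	}

	map = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("nr_dirty before writes: %ld\n", nr_dirty());

	/*
	 * Each store faults on a clean, write-protected pte; the fault
	 * handler marks the page dirty and makes the pte writable.
	 */
	for (i = 0; i < LEN; i += page)
		map[i] = 1;

	printf("nr_dirty after writes:  %ld\n", nr_dirty());

	/* Writeback cleans the pages and re-instates write protection. */
	msync(map, LEN, MS_SYNC);

	printf("nr_dirty after msync:   %ld\n", nr_dirty());

	munmap(map, LEN);
	close(fd);
	return 0;
}
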
 include/linux/mm.h   |    2 +
 include/linux/rmap.h |    6 +++
 mm/fremap.c          |   10 ++++--
 mm/memory.c          |   34 +++++++++++++++++++--
 mm/page-writeback.c  |    9 ++++-
 mm/rmap.c            |   66 +++++++++++++++++++++++++++++++++++++++++
 6 files changed, 120 insertions(+), 7 deletions(-)

diff -puN include/linux/mm.h~tracking-dirty-pages-in-shared-mappings-v4 include/linux/mm.h
--- devel/include/linux/mm.h~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/include/linux/mm.h	2006-05-17 13:09:44.000000000 -0700
@@ -182,6 +182,8 @@ extern unsigned int kobjsize(const void
 #define VM_SequentialReadHint(v)	((v)->vm_flags & VM_SEQ_READ)
 #define VM_RandomReadHint(v)		((v)->vm_flags & VM_RAND_READ)
 
+#define VM_SharedWritable(v) (((v)->vm_flags & (VM_SHARED|VM_WRITE)) == (VM_SHARED|VM_WRITE))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
diff -puN include/linux/rmap.h~tracking-dirty-pages-in-shared-mappings-v4 include/linux/rmap.h
--- devel/include/linux/rmap.h~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/include/linux/rmap.h	2006-05-17 13:09:44.000000000 -0700
@@ -103,6 +103,12 @@ pte_t *page_check_address(struct page *,
  */
 unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);
 
+/*
+ * Used to writeprotect clean pages, in order to count nr_dirty for shared
+ * mappings
+ */
+int page_wrprotect(struct page *);
+
 #else	/* !CONFIG_MMU */
 
 #define anon_vma_init()		do {} while (0)
diff -puN mm/fremap.c~tracking-dirty-pages-in-shared-mappings-v4 mm/fremap.c
--- devel/mm/fremap.c~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/mm/fremap.c	2006-05-17 13:09:44.000000000 -0700
@@ -15,6 +15,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/syscalls.h>
+#include <linux/backing-dev.h>
 
 #include <asm/mmu_context.h>
 #include <asm/cacheflush.h>
@@ -79,9 +80,14 @@ int install_page(struct mm_struct *mm, s
 		inc_mm_counter(mm, file_rss);
 
 	flush_icache_page(vma, page);
-	set_pte_at(mm, addr, pte, mk_pte(page, prot));
+	pte_val = mk_pte(page, prot);
+	if (VM_SharedWritable(vma)) {
+		struct address_space *mapping = page_mapping(page);
+		if (mapping && mapping_cap_account_dirty(mapping))
+			pte_val = pte_wrprotect(pte_val);
+	}
+	set_pte_at(mm, addr, pte, pte_val);
 	page_add_file_rmap(page);
-	pte_val = *pte;
 	update_mmu_cache(vma, addr, pte_val);
 	err = 0;
 unlock:
diff -puN mm/memory.c~tracking-dirty-pages-in-shared-mappings-v4 mm/memory.c
--- devel/mm/memory.c~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/mm/memory.c	2006-05-17 13:09:44.000000000 -0700
@@ -48,6 +48,7 @@
 #include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -2058,6 +2059,7 @@ static int do_no_page(struct mm_struct *
 	unsigned int sequence = 0;
 	int ret = VM_FAULT_MINOR;
 	int anon = 0;
+	struct page *dirty_page = NULL;
 
 	pte_unmap(page_table);
 	BUG_ON(vma->vm_flags & VM_PFNMAP);
@@ -2131,6 +2133,11 @@ retry:
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		else if (VM_SharedWritable(vma)) {
+			struct address_space *mapping = page_mapping(new_page);
+			if (mapping && mapping_cap_account_dirty(mapping))
+				entry = pte_wrprotect(entry);
+		}
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
 			inc_mm_counter(mm, anon_rss);
@@ -2139,6 +2146,10 @@ retry:
 		} else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (write_access) {
+				dirty_page = new_page;
+				get_page(dirty_page);
+			}
 		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
@@ -2151,6 +2162,10 @@ retry:
 	lazy_mmu_prot_update(entry);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return ret;
 oom:
 	page_cache_release(new_page);
@@ -2215,6 +2230,7 @@ static inline int handle_pte_fault(struc
 	pte_t entry;
 	pte_t old_entry;
 	spinlock_t *ptl;
+	struct page *dirty_page = NULL;
 
 	old_entry = entry = *pte;
 	if (!pte_present(entry)) {
@@ -2237,12 +2253,20 @@ static inline int handle_pte_fault(struc
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
 	if (write_access) {
-		if (!pte_write(entry))
-			return do_wp_page(mm, vma, address,
-					pte, pmd, ptl, entry);
+		if (!pte_write(entry)) {
+			if (!VM_SharedWritable(vma)) {
+				return do_wp_page(mm, vma, address,
+						pte, pmd, ptl, entry);
+			} else {
+				entry = pte_mkwrite(entry);
+				dirty_page = vm_normal_page(vma, address, entry);
+				get_page(dirty_page);
+			}
+		}
 		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
+
 	if (!pte_same(old_entry, entry)) {
 		ptep_set_access_flags(vma, address, pte, entry, write_access);
 		update_mmu_cache(vma, address, entry);
@@ -2259,6 +2283,10 @@ static inline int handle_pte_fault(struc
 	}
 unlock:
 	pte_unmap_unlock(pte, ptl);
+	if (dirty_page) {
+		set_page_dirty(dirty_page);
+		put_page(dirty_page);
+	}
 	return VM_FAULT_MINOR;
 }
 
diff -puN mm/page-writeback.c~tracking-dirty-pages-in-shared-mappings-v4 mm/page-writeback.c
--- devel/mm/page-writeback.c~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/mm/page-writeback.c	2006-05-17 13:09:44.000000000 -0700
@@ -29,6 +29,7 @@
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -725,8 +726,10 @@ int test_clear_page_dirty(struct page *p
 					page_index(page),
 					PAGECACHE_TAG_DIRTY);
 			write_unlock_irqrestore(&mapping->tree_lock, flags);
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		write_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -756,8 +759,10 @@ int clear_page_dirty_for_io(struct page
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
-			if (mapping_cap_account_dirty(mapping))
+			if (mapping_cap_account_dirty(mapping)) {
+				page_wrprotect(page);
 				dec_page_state(nr_dirty);
+			}
 			return 1;
 		}
 		return 0;
diff -puN mm/rmap.c~tracking-dirty-pages-in-shared-mappings-v4 mm/rmap.c
--- devel/mm/rmap.c~tracking-dirty-pages-in-shared-mappings-v4	2006-05-17 13:09:44.000000000 -0700
+++ devel-akpm/mm/rmap.c	2006-05-17 13:09:44.000000000 -0700
@@ -434,6 +434,72 @@ int page_referenced(struct page *page, i
 	return referenced;
 }
 
+static int page_wrprotect_one(struct page *page, struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long address;
+	pte_t *pte, entry;
+	spinlock_t *ptl;
+	int ret = 0;
+
+	address = vma_address(page, vma);
+	if (address == -EFAULT)
+		goto out;
+
+	pte = page_check_address(page, mm, address, &ptl);
+	if (!pte)
+		goto out;
+
+	if (!pte_write(*pte))
+		goto unlock;
+
+	entry = pte_mkclean(pte_wrprotect(*pte));
+	ptep_establish(vma, address, pte, entry);
+	update_mmu_cache(vma, address, entry);
+	lazy_mmu_prot_update(entry);
+	ret = 1;
+
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
+	return ret;
+}
+
+static int page_wrprotect_file(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int ret = 0;
+
+	BUG_ON(PageAnon(page));
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+		if (VM_SharedWritable(vma))
+			ret += page_wrprotect_one(page, vma);
+	}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return ret;
+}
+
+int page_wrprotect(struct page *page)
+{
+	int ret = 0;
+
+	BUG_ON(!PageLocked(page));
+
+	if (page_mapped(page) && page->mapping) {
+		if (!PageAnon(page))
+			ret = page_wrprotect_file(page);
+	}
+
+	return ret;
+}
+
 /**
  * page_set_anon_rmap - setup new anonymous rmap
  * @page:	the page to add the mapping to
_