[RFC] Notifier for Externally Mapped Memory (EMM) This patch implements a simple callback for device drivers that establish their own references to pages (KVM, GRU, XPmem, RDMA/Infiniband, DMA engines etc.). These references are unknown to the VM (and are therefore external). With these callbacks it is possible for the device driver to release external references when the VM requests it. This enables swapping and page migration, and allows support of remapping, permission changes etc. for externally mapped memory. With this functionality it becomes possible to avoid pinning or mlocking pages (commonly done to stop the VM from unmapping pages). A device driver must subscribe to a process using emm_notifier_register(). The VM will then perform callbacks for operations that unmap or change permissions of pages in that address space. When the process terminates, the callback function is called with emm_release. Callbacks are performed before and after the unmapping action of the VM: emm_invalidate_start is issued before the action and emm_invalidate_end after it. Callbacks are mostly performed in a non-atomic context. However, in various places spinlocks are held to traverse rmaps. So this patch is only useful for those devices that can remove mappings in an atomic context (e.g. KVM/GRU). If the rmap spinlocks are converted to semaphores then all callbacks will be performed in a non-atomic context. 
Signed-off-by: Christoph Lameter --- include/linux/mm_types.h | 3 + include/linux/rmap.h | 51 +++++++++++++++++++++++++++++++++ kernel/fork.c | 1 mm/Kconfig | 5 +++ mm/filemap_xip.c | 5 +++ mm/fremap.c | 2 + mm/hugetlb.c | 3 + mm/memory.c | 32 ++++++++++++++++++-- mm/mmap.c | 3 + mm/mprotect.c | 3 + mm/mremap.c | 5 +++ mm/rmap.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++- 12 files changed, 180 insertions(+), 5 deletions(-) Index: linux-2.6/include/linux/mm_types.h =================================================================== --- linux-2.6.orig/include/linux/mm_types.h 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/include/linux/mm_types.h 2008-03-03 21:33:22.000000000 -0800 @@ -225,6 +225,9 @@ struct mm_struct { /* aio bits */ rwlock_t ioctx_list_lock; struct kioctx *ioctx_list; +#ifdef CONFIG_EMM_NOTIFIER + struct emm_notifier *emm_notifier; +#endif #ifdef CONFIG_CGROUP_MEM_CONT struct mem_cgroup *mem_cgroup; #endif Index: linux-2.6/mm/Kconfig =================================================================== --- linux-2.6.orig/mm/Kconfig 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/Kconfig 2008-03-03 22:13:44.000000000 -0800 @@ -193,3 +193,8 @@ config NR_QUICK config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS + +config EMM_NOTIFIER + def_bool n + bool "External Mapped Memory Notifier for drivers directly mapping memory" + Index: linux-2.6/mm/mmap.c =================================================================== --- linux-2.6.orig/mm/mmap.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/mmap.c 2008-03-03 22:41:12.000000000 -0800 @@ -1747,11 +1747,13 @@ static void unmap_region(struct mm_struc lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + emm_notify(mm, emm_invalidate_start, start, end); unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? 
next->vm_start: 0); tlb_finish_mmu(tlb, start, end); + emm_notify(mm, emm_invalidate_end, start, end); } /* @@ -2038,6 +2040,7 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ arch_exit_mmap(mm); + emm_notify(mm, emm_release, 0, TASK_SIZE); lru_add_drain(); flush_cache_mm(mm); Index: linux-2.6/mm/mprotect.c =================================================================== --- linux-2.6.orig/mm/mprotect.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/mprotect.c 2008-03-03 21:33:22.000000000 -0800 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -198,10 +199,12 @@ success: dirty_accountable = 1; } + emm_notify(mm, emm_invalidate_start, start, end); if (is_vm_hugetlb_page(vma)) hugetlb_change_protection(vma, start, end, vma->vm_page_prot); else change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable); + emm_notify(mm, emm_invalidate_end, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); return 0; Index: linux-2.6/mm/mremap.c =================================================================== --- linux-2.6.orig/mm/mremap.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/mremap.c 2008-03-03 22:41:12.000000000 -0800 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,9 @@ static void move_ptes(struct vm_area_str struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; + unsigned long old_start = old_addr; + emm_notify(mm, emm_invalidate_start, old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -98,6 +101,7 @@ static void move_ptes(struct vm_area_str new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, @@ -116,6 
+120,7 @@ static void move_ptes(struct vm_area_str pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); + emm_notify(mm, emm_invalidate_end, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) Index: linux-2.6/mm/rmap.c =================================================================== --- linux-2.6.orig/mm/rmap.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/rmap.c 2008-03-03 22:41:50.000000000 -0800 @@ -298,6 +298,10 @@ static int page_referenced_one(struct pa (*mapcount)--; pte_unmap_unlock(pte, ptl); + if (!referenced) + /* rmap lock held */ + referenced = emm_notify(mm, emm_referenced, + address, address + PAGE_SIZE); out: return referenced; } @@ -446,6 +450,8 @@ static int page_mkclean_one(struct page if (address == -EFAULT) goto out; + /* rmap lock held */ + emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE); pte = page_check_address(page, mm, address, &ptl); if (!pte) goto out; @@ -462,6 +468,7 @@ static int page_mkclean_one(struct page } pte_unmap_unlock(pte, ptl); + emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE); out: return ret; } @@ -702,9 +709,11 @@ static int try_to_unmap_one(struct page if (address == -EFAULT) goto out; + /* rmap lock held */ + emm_notify(mm, emm_invalidate_start, address, address + PAGE_SIZE); pte = page_check_address(page, mm, address, &ptl); if (!pte) - goto out; + goto out_notify; /* * If the page is mlock()d, we cannot swap it out. 
@@ -774,6 +783,8 @@ static int try_to_unmap_one(struct page out_unmap: pte_unmap_unlock(pte, ptl); +out_notify: + emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE); out: return ret; } @@ -812,6 +823,7 @@ static void try_to_unmap_cluster(unsigne spinlock_t *ptl; struct page *page; unsigned long address; + unsigned long start; unsigned long end; address = (vma->vm_start + cursor) & CLUSTER_MASK; @@ -833,6 +845,8 @@ static void try_to_unmap_cluster(unsigne if (!pmd_present(*pmd)) return; + start = address; + emm_notify(mm, emm_invalidate_start, start, end); pte = pte_offset_map_lock(mm, pmd, address, &ptl); /* Update high watermark before we lower rss */ @@ -865,6 +879,7 @@ static void try_to_unmap_cluster(unsigne (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + emm_notify(mm, emm_invalidate_end, start, end); } static int try_to_unmap_anon(struct page *page, int migration) @@ -1011,3 +1026,58 @@ int try_to_unmap(struct page *page, int return ret; } +/* + * Notifier for devices establishing their own references to Linux + * kernel pages in addition to the regular mapping via page + * table and rmap. The notifier allows the device to drop the mapping + * when the VM removes references to pages. + * + * Copyright (C) 2008 SGI + * Christoph Lameter + */ + +#ifdef CONFIG_EMM_NOTIFIER +/* + * No synchronization. This function can only be called when only a single + * process remains that performs teardown. 
+ */ +void emm_notifier_release(struct mm_struct *mm) +{ + struct emm_notifier *e; + + while (mm->emm_notifier) { + e = mm->emm_notifier; + mm->emm_notifier = e->next; + e->func(e, mm, emm_release, 0, 0); + } +} +EXPORT_SYMBOL_GPL(emm_notifier_release); + +/* Register a notifier */ +void emm_notifier_register(struct emm_notifier *e, struct mm_struct *mm) +{ + e->next = mm->emm_notifier; + mm->emm_notifier = e; +} +EXPORT_SYMBOL_GPL(emm_notifier_register); + +/* Perform a callback */ +int __emm_notify(struct mm_struct *mm, enum emm_operations op, + unsigned long start, unsigned long end) +{ + struct emm_notifier *e = mm->emm_notifier; + int x; + + while (e) { + if (e->func) { + x = e->func(e, mm, op, start, end); + if (x) + return x; + } + e = e->next; + } + return 0; +} +EXPORT_SYMBOL_GPL(__emm_notify); +#endif + Index: linux-2.6/mm/memory.c =================================================================== --- linux-2.6.orig/mm/memory.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/memory.c 2008-03-03 22:41:12.000000000 -0800 @@ -611,6 +611,9 @@ int copy_page_range(struct mm_struct *ds if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst_mm, src_mm, vma); + if (is_cow_mapping(vma->vm_flags)) + emm_notify(src_mm, emm_invalidate_start, addr, end); + dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { @@ -621,6 +624,10 @@ int copy_page_range(struct mm_struct *ds vma, addr, next)) return -ENOMEM; } while (dst_pgd++, src_pgd++, addr = next, addr != end); + + if (is_cow_mapping(vma->vm_flags)) + emm_notify(src_mm, emm_invalidate_end, addr, end); + return 0; } @@ -897,7 +904,11 @@ unsigned long zap_page_range(struct vm_a lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); + + /* i_mmap_lock may be held */ + emm_notify(mm, emm_invalidate_start, address, end); end = unmap_vmas(&tlb, vma, address, end, &nr_accounted, details); + emm_notify(mm, emm_invalidate_end, address, end); if (tlb) tlb_finish_mmu(tlb, 
address, end); return end; @@ -1340,6 +1351,7 @@ int remap_pfn_range(struct vm_area_struc pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); + unsigned long start = addr; struct mm_struct *mm = vma->vm_mm; int err; @@ -1372,6 +1384,7 @@ int remap_pfn_range(struct vm_area_struc BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); + emm_notify(mm, emm_invalidate_start, start, end); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); @@ -1380,6 +1393,7 @@ int remap_pfn_range(struct vm_area_struc if (err) break; } while (pgd++, addr = next, addr != end); + emm_notify(mm, emm_invalidate_end, start, end); return err; } EXPORT_SYMBOL(remap_pfn_range); @@ -1463,10 +1477,12 @@ int apply_to_page_range(struct mm_struct { pgd_t *pgd; unsigned long next; + unsigned long start = addr; unsigned long end = addr + size; int err; BUG_ON(addr >= end); + emm_notify(mm, emm_invalidate_start, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1474,6 +1490,7 @@ int apply_to_page_range(struct mm_struct if (err) break; } while (pgd++, addr = next, addr != end); + emm_notify(mm, emm_invalidate_end, start, end); return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1614,8 +1631,10 @@ static int do_wp_page(struct mm_struct * page_table = pte_offset_map_lock(mm, pmd, address, &ptl); page_cache_release(old_page); - if (!pte_same(*page_table, orig_pte)) - goto unlock; + if (!pte_same(*page_table, orig_pte)) { + pte_unmap_unlock(page_table, ptl); + goto check_dirty; + } page_mkwrite = 1; } @@ -1631,7 +1650,8 @@ static int do_wp_page(struct mm_struct * if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; - goto unlock; + pte_unmap_unlock(page_table, ptl); + goto check_dirty; } /* @@ -1653,6 +1673,7 @@ gotten: if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; + emm_notify(mm, emm_invalidate_start, address, 
address + PAGE_SIZE); /* * Re-check the pte - we dropped the lock */ @@ -1691,8 +1712,11 @@ gotten: page_cache_release(new_page); if (old_page) page_cache_release(old_page); -unlock: + pte_unmap_unlock(page_table, ptl); + emm_notify(mm, emm_invalidate_end, address, address + PAGE_SIZE); + +check_dirty: if (dirty_page) { if (vma->vm_file) file_update_time(vma->vm_file); Index: linux-2.6/include/linux/rmap.h =================================================================== --- linux-2.6.orig/include/linux/rmap.h 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/include/linux/rmap.h 2008-03-03 21:52:38.000000000 -0800 @@ -133,4 +133,55 @@ static inline int page_mkclean(struct pa #define SWAP_AGAIN 1 #define SWAP_FAIL 2 +/* + * Notifier for devices establishing their own references to Linux + * kernel pages in addition to the regular mapping via page + * table and rmap. The notifier allows the device to drop the mapping + * when the VM removes references to pages. + */ +enum emm_operations { + emm_release, /* Process existing, */ + emm_invalidate_start, /* Before the VM unmaps pages */ + emm_invalidate_end, /* After the VM unmapped pages */ + emm_referenced /* Check if a range was referenced */ +}; + +struct emm_notifier { + int (*func)(struct emm_notifier *e, struct mm_struct *mm, + enum emm_operations op, + unsigned long start, unsigned long end); + struct emm_notifier *next; +}; + +extern int __emm_notify(struct mm_struct *mm, enum emm_operations op, + unsigned long start, unsigned long end); + +static inline int mm_has_emm_notifier(struct mm_struct *mm) +{ +#ifdef CONFIG_EMM_NOTIFIER + return unlikely(mm->emm_notifier); +#else + return 0; +#endif +} + +static inline int emm_notify(struct mm_struct *mm, enum emm_operations op, + unsigned long start, unsigned long end) +{ +#ifdef CONFIG_EMM_NOTIFIER + if (mm_has_emm_notifier(mm)) + return __emm_notify(mm, op, start, end); +#endif + return 0; +} + +/* + * Register a notifier with an mm struct. 
Release occurs when the process + * terminates by calling the notifier function with emm_release. + * + * Must hold the mmap_sem for write. + */ +extern void emm_notifier_register(struct emm_notifier *e, + struct mm_struct *mm); + #endif /* _LINUX_RMAP_H */ Index: linux-2.6/kernel/fork.c =================================================================== --- linux-2.6.orig/kernel/fork.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/kernel/fork.c 2008-03-03 22:41:12.000000000 -0800 @@ -362,6 +362,7 @@ static struct mm_struct * mm_init(struct if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; + mm->emm_notifier = NULL; return mm; } Index: linux-2.6/mm/filemap_xip.c =================================================================== --- linux-2.6.orig/mm/filemap_xip.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/filemap_xip.c 2008-03-03 22:41:12.000000000 -0800 @@ -190,6 +190,9 @@ __xip_unmap (struct address_space * mapp address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); BUG_ON(address < vma->vm_start || address >= vma->vm_end); + /* i_mmap_lock held */ + emm_notify(mm, emm_invalidate_start, + address, address + PAGE_SIZE); pte = page_check_address(page, mm, address, &ptl); if (pte) { /* Nuke the page table entry. 
*/ @@ -201,6 +204,8 @@ __xip_unmap (struct address_space * mapp pte_unmap_unlock(pte, ptl); page_cache_release(page); } + emm_notify(mm, emm_invalidate_end, + address, address + PAGE_SIZE); } spin_unlock(&mapping->i_mmap_lock); } Index: linux-2.6/mm/fremap.c =================================================================== --- linux-2.6.orig/mm/fremap.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/fremap.c 2008-03-03 22:41:12.000000000 -0800 @@ -214,7 +214,9 @@ asmlinkage long sys_remap_file_pages(uns spin_unlock(&mapping->i_mmap_lock); } + emm_notify(mm, emm_invalidate_start, start, end); err = populate_range(mm, vma, start, size, pgoff); + emm_notify(mm, emm_invalidate_end, start, end); if (!err && !(flags & MAP_NONBLOCK)) { if (unlikely(has_write_lock)) { downgrade_write(&mm->mmap_sem); Index: linux-2.6/mm/hugetlb.c =================================================================== --- linux-2.6.orig/mm/hugetlb.c 2008-03-03 21:33:20.000000000 -0800 +++ linux-2.6/mm/hugetlb.c 2008-03-03 22:41:12.000000000 -0800 @@ -755,6 +755,8 @@ void __unmap_hugepage_range(struct vm_ar BUG_ON(start & ~HPAGE_MASK); BUG_ON(end & ~HPAGE_MASK); + /* i_mmap_lock held */ + emm_notify(mm, emm_invalidate_start, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += HPAGE_SIZE) { ptep = huge_pte_offset(mm, address); @@ -775,6 +777,7 @@ void __unmap_hugepage_range(struct vm_ar } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + emm_notify(mm, emm_invalidate_end, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page);