From holt@sgi.com Wed Jan 23 20:05:32 2008 Date: Wed, 23 Jan 2008 22:05:25 -0600 From: Robin Holt To: Andrea Arcangeli Cc: Avi Kivity , Izik Eidus , Andrew Morton , Nick Piggin , kvm-devel@lists.sourceforge.net, Benjamin Herrenschmidt , steiner@sgi.com, linux-kernel@vger.kernel.org, linux-mm@kvack.org, daniel.blueman@quadrics.com, holt@sgi.com, Hugh Dickins , clameter@sgi.com, Peter Zijlstra Subject: Re: Enhance mmu notifiers to accomplish a lockless implementation (incomplete). Expand the mmu_notifiers to allow for lockless callers. To accomplish this, the function receiving notifications needs to implement an rmap equivalent. The notification function is also responsible for tracking page dirty state. Version 2 brings with it __xip_unmap and do_wp_page so this is getting to the point where we can start testing. It does compile now. I am traveling tomorrow but should be able to get back to this tomorrow evening or early Friday. Thank you for your attention, Robin Holt Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h 2008-01-24 18:26:43.000000000 -0800 +++ linux-2.6/include/linux/page-flags.h 2008-01-24 18:28:47.000000000 -0800 @@ -105,6 +105,7 @@ * 64 bit | FIELDS | ?????? 
FLAGS | * 63 32 0 */ +#define PG_external_rmap 30 /* Page has external rmap */ #define PG_uncached 31 /* Page has been mapped as uncached */ #endif @@ -260,6 +261,14 @@ static inline void __ClearPageTail(struc #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#if defined(CONFIG_MMU_NOTIFIER) && defined(CONFIG_64BIT) +#define PageExternalRmap(page) test_bit(PG_external_rmap, &(page)->flags) +#define SetPageExternalRmap(page) set_bit(PG_external_rmap, &(page)->flags) +#define ClearPageExternalRmap(page) clear_bit(PG_external_rmap, &(page)->flags) +#else +#define PageExternalRmap(page) 0 +#endif + struct page; /* forward declaration */ extern void cancel_dirty_page(struct page *page, unsigned int account_size); Index: linux-2.6/mm/rmap.c =================================================================== --- linux-2.6.orig/mm/rmap.c 2008-01-24 18:26:43.000000000 -0800 +++ linux-2.6/mm/rmap.c 2008-01-24 18:26:48.000000000 -0800 @@ -49,6 +49,7 @@ #include #include #include +#include #include @@ -473,6 +474,8 @@ int page_mkclean(struct page *page) struct address_space *mapping = page_mapping(page); if (mapping) { ret = page_mkclean_file(mapping, page); + if (unlikely(PageExternalRmap(page))) + mmu_rmap_notifier(invalidate_page, page); if (page_test_dirty(page)) { page_clear_dirty(page); ret = 1; @@ -971,6 +974,9 @@ int try_to_unmap(struct page *page, int else ret = try_to_unmap_file(page, migration); + if (unlikely(PageExternalRmap(page))) + mmu_rmap_notifier(invalidate_page, page); + if (!page_mapped(page)) ret = SWAP_SUCCESS; return ret; Index: linux-2.6/mm/fremap.c =================================================================== --- linux-2.6.orig/mm/fremap.c 2008-01-24 18:26:43.000000000 -0800 +++ linux-2.6/mm/fremap.c 2008-01-24 18:26:48.000000000 -0800 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -211,6 +212,7 @@ asmlinkage long 
sys_remap_file_pages(uns spin_unlock(&mapping->i_mmap_lock); } + mmu_notifier(invalidate_range, mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); if (!err && !(flags & MAP_NONBLOCK)) { if (unlikely(has_write_lock)) { Index: linux-2.6/mm/memory.c =================================================================== --- linux-2.6.orig/mm/memory.c 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/mm/memory.c 2008-01-24 18:26:48.000000000 -0800 @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1637,6 +1638,7 @@ gotten: /* * Re-check the pte - we dropped the lock */ + mmu_notifier(invalidate_range, mm, address, address + PAGE_SIZE - 1); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { Index: linux-2.6/mm/filemap_xip.c =================================================================== --- linux-2.6.orig/mm/filemap_xip.c 2008-01-24 18:26:43.000000000 -0800 +++ linux-2.6/mm/filemap_xip.c 2008-01-24 18:26:48.000000000 -0800 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -183,6 +184,9 @@ __xip_unmap (struct address_space * mapp if (!page) return; + if (PageExternalRmap(page)) + mmu_rmap_notifier(invalidate_page, page); + spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { mm = vma->vm_mm; Index: linux-2.6/include/linux/mmu_notifier.h =================================================================== --- linux-2.6.orig/include/linux/mmu_notifier.h 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/include/linux/mmu_notifier.h 2008-01-24 18:26:48.000000000 -0800 @@ -4,10 +4,14 @@ #include #include #include +#include -#ifdef CONFIG_MMU_NOTIFIER +struct mmu_notifier_ops; -struct mmu_notifier; +struct mmu_notifier { + struct hlist_node hlist; + const struct mmu_notifier_ops *ops; +}; struct mmu_notifier_ops { void (*release)(struct mmu_notifier *mn, @@ -23,16 +27,23 @@ struct 
mmu_notifier_ops { unsigned long start, unsigned long end); }; -struct mmu_notifier_head { - struct hlist_head head; -}; +struct mmu_rmap_notifier_ops; -struct mmu_notifier { +struct mmu_rmap_notifier { struct hlist_node hlist; - const struct mmu_notifier_ops *ops; + const struct mmu_rmap_notifier_ops *ops; }; -#include +struct mmu_rmap_notifier_ops { + /* + * Called with the page lock held after ptes are modified or removed. + * + * Must clear PageExternalRmap() + */ + void (*invalidate_page)(struct mmu_rmap_notifier *em, struct page *page); +}; + +#ifdef CONFIG_MMU_NOTIFIER extern void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm); @@ -65,6 +76,24 @@ static inline void mmu_notifier_head_ini } \ } while (0) +extern void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn); +extern void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn); + +extern struct hlist_head mmu_rmap_notifier_list; + +#define mmu_rmap_notifier(function, args...) \ + do { \ + struct mmu_rmap_notifier *__mrn; \ + struct hlist_node *__n; \ + \ + rcu_read_lock(); \ + hlist_for_each_entry_rcu(__mrn, __n, &mmu_rmap_notifier_list, \ + hlist) \ + if (__mrn->ops->function) \ + __mrn->ops->function(__mrn, args); \ + rcu_read_unlock(); \ + } while (0) + #else /* CONFIG_MMU_NOTIFIER */ #define mmu_notifier_register(mn, mm) do {} while(0) @@ -76,6 +105,12 @@ static inline void mmu_notifier_head_ini #define mmu_notifier(function, mm, args...) \ do { } while (0) + +#define mmu_rmap_notifier(function, args...) do { } while (0) 
+ +static inline void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn) {} +static inline void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn) {} + #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */ Index: linux-2.6/mm/mmu_notifier.c =================================================================== --- linux-2.6.orig/mm/mmu_notifier.c 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/mm/mmu_notifier.c 2008-01-24 18:26:48.000000000 -0800 @@ -51,14 +51,41 @@ int mmu_notifier_age_page(struct mm_stru return young; } +static DEFINE_SPINLOCK(mmu_notifier_list_lock); + void mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm) { + spin_lock(&mmu_notifier_list_lock); hlist_add_head(&mn->hlist, &mm->mmu_notifier.head); + spin_unlock(&mmu_notifier_list_lock); } EXPORT_SYMBOL_GPL(mmu_notifier_register); void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { + spin_lock(&mmu_notifier_list_lock); hlist_del(&mn->hlist); + spin_unlock(&mmu_notifier_list_lock); } EXPORT_SYMBOL_GPL(mmu_notifier_unregister); + +HLIST_HEAD(mmu_rmap_notifier_list); + +void mmu_rmap_notifier_register(struct mmu_rmap_notifier *mrn) +{ + spin_lock(&mmu_notifier_list_lock); + hlist_add_head_rcu(&mrn->hlist, &mmu_rmap_notifier_list); + spin_unlock(&mmu_notifier_list_lock); +} +EXPORT_SYMBOL(mmu_rmap_notifier_register); + +void mmu_rmap_notifier_unregister(struct mmu_rmap_notifier *mrn) +{ + spin_lock(&mmu_notifier_list_lock); + hlist_del_rcu(&mrn->hlist); + spin_unlock(&mmu_notifier_list_lock); +} +EXPORT_SYMBOL(mmu_rmap_notifier_unregister); + + + Index: linux-2.6/include/linux/mm_types.h =================================================================== --- linux-2.6.orig/include/linux/mm_types.h 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/include/linux/mm_types.h 2008-01-24 18:26:48.000000000 -0800 @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -154,6 +153,10 @@ struct vm_area_struct { #endif }; 
+struct mmu_notifier_head { + struct hlist_head head; +}; + struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; Index: linux-2.6/kernel/fork.c =================================================================== --- linux-2.6.orig/kernel/fork.c 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/kernel/fork.c 2008-01-24 18:26:48.000000000 -0800 @@ -51,6 +51,7 @@ #include #include #include +#include #include #include Index: linux-2.6/mm/mmap.c =================================================================== --- linux-2.6.orig/mm/mmap.c 2008-01-24 18:26:48.000000000 -0800 +++ linux-2.6/mm/mmap.c 2008-01-24 18:26:48.000000000 -0800 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include Index: linux-2.6/mm/mremap.c =================================================================== --- linux-2.6.orig/mm/mremap.c 2008-01-24 18:26:43.000000000 -0800 +++ linux-2.6/mm/mremap.c 2008-01-24 18:26:48.000000000 -0800 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include