From: Christoph Lameter Implement read/write migration ptes We take the upper two swapfiles for the two types of migration ptes and define a series of macros in swapops.h. The VM is modified to handle the migration entries. migration entries can only be encountered when the page they are pointing to is locked. This limits the number of places one has to fix. We also check in copy_pte_range and in mprotect_pte_range() for migration ptes. We check for migration ptes in do_swap_cache and call a function that will then wait on the page lock. This allows us to effectively stop all accesses to apge. Migration entries are created by try_to_unmap if called for migration and removed by local functions in migrate.c Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/swap.h | 7 ++ include/linux/swapops.h | 53 +++++++++++++++ mm/memory.c | 21 +++++- mm/migrate.c | 132 +++++++++++++++++++++++++++++++++++++- mm/mprotect.c | 21 +++++- mm/rmap.c | 32 ++++++--- mm/swapfile.c | 6 + 7 files changed, 256 insertions(+), 16 deletions(-) diff -puN include/linux/swap.h~swapless-pm-add-r-w-migration-entries include/linux/swap.h --- devel/include/linux/swap.h~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/include/linux/swap.h 2006-05-17 13:09:43.000000000 -0700 @@ -28,7 +28,14 @@ static inline int current_is_kswapd(void * the type/offset into the pte as 5/27 as well. */ #define MAX_SWAPFILES_SHIFT 5 +#ifndef CONFIG_MIGRATION #define MAX_SWAPFILES (1 << MAX_SWAPFILES_SHIFT) +#else +/* Use last entry for page migration swap entries */ +#define MAX_SWAPFILES ((1 << MAX_SWAPFILES_SHIFT)-2) +#define SWP_MIGRATION_READ MAX_SWAPFILES +#define SWP_MIGRATION_WRITE (MAX_SWAPFILES + 1) +#endif /* * Magic header for a swap area. The first part of the union is diff -puN include/linux/swapops.h~swapless-pm-add-r-w-migration-entries include/linux/swapops.h --- devel/include/linux/swapops.h~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/include/linux/swapops.h 2006-05-17 13:09:43.000000000 -0700 @@ -67,3 +67,56 @@ static inline pte_t swp_entry_to_pte(swp BUG_ON(pte_file(__swp_entry_to_pte(arch_entry))); return __swp_entry_to_pte(arch_entry); } + +#ifdef CONFIG_MIGRATION +static inline swp_entry_t make_migration_entry(struct page *page, int write) +{ + BUG_ON(!PageLocked(page)); + return swp_entry(write ? SWP_MIGRATION_WRITE : SWP_MIGRATION_READ, + page_to_pfn(page)); +} + +static inline int is_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_READ || + swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_MIGRATION_WRITE); +} + +static inline struct page *migration_entry_to_page(swp_entry_t entry) +{ + struct page *p = pfn_to_page(swp_offset(entry)); + /* + * Any use of migration entries may only occur while the + * corresponding page is locked + */ + BUG_ON(!PageLocked(p)); + return p; +} + +static inline void make_migration_entry_read(swp_entry_t *entry) +{ + *entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry)); +} + +extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address); +#else + +#define make_migration_entry(page, write) swp_entry(0, 0) +#define is_migration_entry(swp) 0 +#define migration_entry_to_page(swp) NULL +static inline void make_migration_entry_read(swp_entry_t *entryp) { } +static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { } +static inline int is_write_migration_entry(swp_entry_t entry) +{ + return 0; +} + +#endif + diff -puN mm/memory.c~swapless-pm-add-r-w-migration-entries mm/memory.c --- devel/mm/memory.c~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/mm/memory.c 2006-05-17 13:09:43.000000000 -0700 @@ -434,7 +434,9 @@ copy_one_pte(struct mm_struct *dst_mm, s /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { if (!pte_file(pte)) { - swap_duplicate(pte_to_swp_entry(pte)); + swp_entry_t entry = pte_to_swp_entry(pte); + + swap_duplicate(entry); /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -443,6 +445,19 @@ copy_one_pte(struct mm_struct *dst_mm, s &src_mm->mmlist); spin_unlock(&mmlist_lock); } + if (is_migration_entry(entry) && + is_cow_mapping(vm_flags)) { + page = migration_entry_to_page(entry); + + /* + * COW mappings require pages in both parent + * and child to be set to read. + */ + entry = make_migration_entry(page, + SWP_MIGRATION_READ); + pte = swp_entry_to_pte(entry); + set_pte_at(src_mm, addr, src_pte, pte); + } } goto out_set_pte; } @@ -1879,6 +1894,10 @@ static int do_swap_page(struct mm_struct goto out; entry = pte_to_swp_entry(orig_pte); + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + goto out; + } page = lookup_swap_cache(entry); if (!page) { swapin_readahead(entry, address, vma); diff -puN mm/migrate.c~swapless-pm-add-r-w-migration-entries mm/migrate.c --- devel/mm/migrate.c~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/mm/migrate.c 2006-05-17 13:09:43.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include "internal.h" @@ -119,6 +119,136 @@ int putback_lru_pages(struct list_head * return count; } +static inline int is_swap_pte(pte_t pte) +{ + return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); +} + +/* + * Restore a potential migration pte to a working pte entry for + * anonymous pages. + */ +static void remove_migration_pte(struct vm_area_struct *vma, unsigned long addr, + struct page *old, struct page *new) +{ + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + return; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) + goto out; + + inc_mm_counter(mm, anon_rss); + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + set_pte_at(mm, addr, ptep, pte); + page_add_anon_rmap(new, vma, addr); +out: + pte_unmap_unlock(pte, ptl); +} + +/* + * Get rid of all migration entries and replace them by + * references to the indicated page. + * + * Must hold mmap_sem lock on at least one of the vmas containing + * the page so that the anon_vma cannot vanish. + */ +static void remove_migration_ptes(struct page *old, struct page *new) +{ + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + unsigned long mapping; + + mapping = (unsigned long)new->mapping; + + if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) + return; + + /* + * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. + */ + anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); + spin_lock(&anon_vma->lock); + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) + remove_migration_pte(vma, page_address_in_vma(new, vma), + old, new); + + spin_unlock(&anon_vma->lock); +} + +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; + + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; + + page = migration_entry_to_page(entry); + + /* Pages with migration entries are always locked */ + BUG_ON(!PageLocked(page)); + + get_page(page); + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); +} + /* * swapout a single page * page is locked upon entry, unlocked on exit diff -puN mm/mprotect.c~swapless-pm-add-r-w-migration-entries mm/mprotect.c --- devel/mm/mprotect.c~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/mm/mprotect.c 2006-05-17 13:09:43.000000000 -0700 @@ -19,7 +19,8 @@ #include #include #include - +#include +#include #include #include #include @@ -28,12 +29,13 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot) { - pte_t *pte; + pte_t *pte, oldpte; spinlock_t *ptl; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); do { - if (pte_present(*pte)) { + oldpte = *pte; + if (pte_present(oldpte)) { pte_t ptent; /* Avoid an SMP race with hardware updated dirty/clean @@ -43,7 +45,20 @@ static void change_pte_range(struct mm_s ptent = pte_modify(ptep_get_and_clear(mm, addr, pte), newprot); set_pte_at(mm, addr, pte, ptent); lazy_mmu_prot_update(ptent); + } else if (!pte_file(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + set_pte_at(mm, addr, pte, + swp_entry_to_pte(entry)); + } } + } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(pte - 1, ptl); } diff -puN mm/rmap.c~swapless-pm-add-r-w-migration-entries mm/rmap.c --- devel/mm/rmap.c~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/mm/rmap.c 2006-05-17 13:09:43.000000000 -0700 @@ -620,17 +620,27 @@ static int try_to_unmap_one(struct page if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; - /* - * Store the swap location in the pte. - * See handle_pte_fault() ... - */ - BUG_ON(!PageSwapCache(page)); - swap_duplicate(entry); - if (list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - if (list_empty(&mm->mmlist)) - list_add(&mm->mmlist, &init_mm.mmlist); - spin_unlock(&mmlist_lock); + + if (PageSwapCache(page)) { + /* + * Store the swap location in the pte. + * See handle_pte_fault() ... + */ + swap_duplicate(entry); + if (list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + if (list_empty(&mm->mmlist)) + list_add(&mm->mmlist, &init_mm.mmlist); + spin_unlock(&mmlist_lock); + } + } else { + /* + * Store the pfn of the page in a special migration + * pte. do_swap_page() will wait until the migration + * pte is removed and then restart fault handling. + */ + BUG_ON(!migration); + entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); diff -puN mm/swapfile.c~swapless-pm-add-r-w-migration-entries mm/swapfile.c --- devel/mm/swapfile.c~swapless-pm-add-r-w-migration-entries 2006-05-17 13:09:43.000000000 -0700 +++ devel-akpm/mm/swapfile.c 2006-05-17 13:09:43.000000000 -0700 @@ -395,6 +395,9 @@ void free_swap_and_cache(swp_entry_t ent struct swap_info_struct * p; struct page *page = NULL; + if (is_migration_entry(entry)) + return; + p = swap_info_get(entry); if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { @@ -1702,6 +1705,9 @@ int swap_duplicate(swp_entry_t entry) unsigned long offset, type; int result = 0; + if (is_migration_entry(entry)) + return 1; + type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; _