Add simple direct migration support with fallback to swap.

Simple direct migration support on top of the swap based page migration
facility. Fall back to swap out if necessary.

Limitations:
- No filesystem support. Dirty pages backed by a file will be written
  back before migration occurs.

Signed-off-by: Mike Kravetz
Signed-off-by: Christoph Lameter

Index: linux-2.6.14-rc5-mm1/include/linux/radix-tree.h
===================================================================
--- linux-2.6.14-rc5-mm1.orig/include/linux/radix-tree.h	2005-10-24 10:27:30.000000000 -0700
+++ linux-2.6.14-rc5-mm1/include/linux/radix-tree.h	2005-10-26 09:51:13.000000000 -0700
@@ -48,6 +48,7 @@ int radix_tree_insert(struct radix_tree_
 void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
 void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+void *radix_tree_replace(struct radix_tree_root *, unsigned long, void *);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
Index: linux-2.6.14-rc5-mm1/lib/radix-tree.c
===================================================================
--- linux-2.6.14-rc5-mm1.orig/lib/radix-tree.c	2005-10-24 10:27:30.000000000 -0700
+++ linux-2.6.14-rc5-mm1/lib/radix-tree.c	2005-10-26 09:51:13.000000000 -0700
@@ -765,6 +765,50 @@ out:
 EXPORT_SYMBOL(radix_tree_delete);
 
 /**
+ * radix_tree_replace - replace an item in a radix tree
+ * @root:	radix tree root
+ * @index:	index key
+ * @item:	item to insert
+ *
+ * Replace the item at @index with @item.
+ * Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_replace(struct radix_tree_root *root,
+			unsigned long index, void *item)
+{
+	struct radix_tree_node *slot, *node;
+	unsigned int height, shift, offset;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto not_present;
+
+	node = NULL;
+	slot = root->rnode;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+
+	offset = 0;
+	while (height > 0) {
+		if (slot == NULL)
+			goto not_present;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		node = slot;
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	if (node) {
+		node->slots[offset] = item;
+		return slot;
+	}
+not_present:
+	return NULL;
+}
+EXPORT_SYMBOL(radix_tree_replace);
+
+/**
  * radix_tree_tagged - test whether any items in the tree are tagged
  * @root:		radix tree root
  * @tag:		tag to test
Index: linux-2.6.14-rc5-mm1/mm/vmscan.c
===================================================================
--- linux-2.6.14-rc5-mm1.orig/mm/vmscan.c	2005-10-26 09:50:02.000000000 -0700
+++ linux-2.6.14-rc5-mm1/mm/vmscan.c	2005-10-26 13:51:27.000000000 -0700
@@ -574,6 +574,21 @@ keep:
 }
 
 /*
+ * Determines the zone for a page and takes the necessary lru lock to
+ * return the page to the proper lru
+ */
+static void putback_lru_page(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	__putback_lru_page(zone, page);
+	spin_unlock_irq(&zone->lru_lock);
+	/* Undo the get from isolate_lru_page */
+	put_page(page);
+}
+
+/*
  * swapout a single page
  * page is locked upon entry, unlocked on exit
  *
@@ -631,6 +646,142 @@ unlock_retry:
 retry:
 	return 1;
 }
+
+/*
+ * common logic to directly migrate a single page
+ * page is locked upon entry, always unlocked before return
+ *
+ * It has been verified that the page is not
+ * 1. Undergoing writeback.
+ * 2. A page in the swapcache.
+ * 3. Dirty and backed by a file with other subsystems
+ *    (such as a filesystem) tracking the page.
+ * 4. Part of a vma that is not swappable
+ *
+ * return codes:
+ *	0 = complete
+ *	1 = retry
+ */
+static int migrate_page(struct page *page, struct page *newpage)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page *oldpage;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PageWriteback(page));
+
+	/* Bail out if there are other users of the page */
+	if (page_mapcount(page) + 2 != page_count(page)) {
+		unlock_page(page);
+		return 1;
+	}
+
+	/*
+	 * Lock the new page to ensure that accesses wait until we
+	 * are finished migrating the page
+	 */
+	lock_page(newpage);
+
+	/* Change the mapping in the radix tree */
+	get_page(newpage);
+	read_lock_irq(&mapping->tree_lock);
+	newpage->index = page->index;
+	oldpage = radix_tree_replace(&mapping->page_tree, page_index(page), newpage);
+	read_unlock_irq(&mapping->tree_lock);
+
+	if (!oldpage) {
+		/*
+		 * replace failed because truncation is in progress
+		 *
+		 * Keep the newpage on the list for other pages to use
+		 * but remove the original page from the list of pages
+		 * to be migrated.
+		 */
+		unlock_page(newpage);
+		put_page(newpage);
+		unlock_page(page);
+		list_del(&page->lru);
+		putback_lru_page(page);
+		return 0;
+	}
+
+	__put_page(page);	/* mapping */
+
+	/*
+	 * We are now in the critical section where there is no easy way
+	 * out. We need to succeed!
+	 */
+	if (page_mapped(page) && mapping) {
+		int rc;
+
+		/*
+		 * Unmap the page. Repeat while transient conditions
+		 * prevent the unmapping to occur.
+		 */
+		while (1) {
+			rc = try_to_unmap(page);
+
+			if (rc != SWAP_AGAIN && rc != SWAP_REFERENCE)
+				break;
+
+			/*
+			 * If we wait on a lock then do something else
+			 * first.
+			 */
+			if (rc == SWAP_AGAIN)
+				schedule();
+		};
+
+		/*
+		 * A really unswappable page should not occur here
+		 * since we should have checked for the page
+		 * not being in a vma that is unswappable
+		 * before entering this function
+		 */
+		BUG_ON(rc == SWAP_FAIL);
+	}
+
+	copy_highpage(newpage, page);
+
+	/*
+	 * Copy page flags. This includes all the potentially useful flags
+	 * that we may want to support in the future.
+	 */
+	newpage->flags |= page->flags &
+		(PG_error | PG_referenced | PG_uptodate | PG_swapcache |
+		 PG_active | PG_checked | PG_mappedtodisk);
+
+	/* The dirty flag needs special handling */
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+	}
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+
+	unlock_page(newpage);
+
+	/*
+	 * We may free pages that were taken off the active list
+	 * by isolate_lru_page. However, free_hot_cold_page will check
+	 * if the active bit is set. So clear it.
+	 */
+	ClearPageActive(page);
+
+	page->mapping = NULL;
+	list_del(&page->lru);
+	unlock_page(page);
+
+	list_del(&newpage->lru);
+	putback_lru_page(newpage);
+	return 0;
+}
+
 /*
  * migrate_pages
  *
@@ -648,11 +799,6 @@ retry:
  *		-1 list of new pages has become exhausted.
  *		 0 All page migrated
  *		 n Number of pages not migrated
- *
- * SIMPLIFIED VERSION: This implementation of migrate_pages
- * is only swapping out pages and never touches the second
- * list. The direct migration patchset
- * extends this function to avoid the use of swap.
  */
 int migrate_pages(struct list_head *l, struct list_head *t)
 {
@@ -699,6 +845,7 @@ redo:
 #ifdef CONFIG_SWAP
 		if (PageAnon(page) && !PageSwapCache(page)) {
 			if (!add_to_swap(page)) {
+failed:
 				unlock_page(page);
 				failed++;
 				continue;
@@ -710,9 +857,44 @@ redo:
 		 * Page is properly locked and writeback is complete.
 		 * Try to migrate the page.
 		 */
-		if (swap_page(page)) {
+		if (t && !PageDirty(page) && !PagePrivate(page) && !PageSwapCache(page)) {
+			/*
+			 * Direct page migration
+			 */
+			struct page *newpage;
+
+			if (list_empty(t))
+				goto failed;
+			newpage = lru_to_page(t);
+
+			if (migrate_page(page, newpage))
+				goto retry_later;
+
+		} else {
+			struct address_space *mapping = page_mapping(page);
+
+			if (PageDirty(page) &&
+				PagePrivate(page) &&
+				!PageWriteback(page) &&
+				mapping) {
+				/*
+				 * Write out page if possible to enable direct
+				 * migration in later passes
+				 */
+				int rc = pageout(page, mapping);
+
+				if (rc != PAGE_KEEP) {
+					unlock_page(page);
+					goto retry_later;
+				}
+			}
+			/*
+			 * Use swap because direct migration is not possible.
+			 */
+			if (swap_page(page)) {
 retry_later:
-			retry++;
+				retry++;
+			}
 		}
 	}
 	if (retry && pass++ < 10)
@@ -861,11 +1043,8 @@ static void shrink_cache(struct zone *zo
 done:
 	pagevec_release(&pvec);
 }
-
 /*
  * Add isolated pages on the list back to the LRU
- * Determines the zone for each pages and takes
- * the necessary lru lock for each page.
  *
  * returns the number of pages put back.
  */
@@ -876,15 +1055,10 @@ int putback_lru_pages(struct list_head *
 	int count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
-		struct zone *zone = page_zone(page);
 
 		list_del(&page->lru);
-		spin_lock_irq(&zone->lru_lock);
-		__putback_lru_page(zone, page);
-		spin_unlock_irq(&zone->lru_lock);
+		putback_lru_page(page);
 		count++;
-		/* Undo the get from isolate_lru_page */
-		put_page(page);
 	}
 	return count;
 }
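
For reference, here is a minimal caller sketch (not part of this patch). It assumes the migrate_pages() and putback_lru_pages() interfaces above plus a list of pages already isolated with isolate_lru_page(); the helper name, the GFP_HIGHUSER allocation and the per-node placement policy are illustrative only. The flow is: allocate one target page per isolated page, hand both lists to migrate_pages() so it can migrate directly and fall back to swap, then put back whatever could not be moved and free the unused target pages.

/* Caller sketch only -- illustrative, not part of this patch. */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/list.h>
#include <linux/swap.h>

static int sketch_migrate_list(struct list_head *pagelist, int target_node)
{
	LIST_HEAD(newlist);
	struct page *page, *page2;
	int not_migrated;

	/* Allocate one target page on @target_node per page to be moved */
	list_for_each_entry(page, pagelist, lru) {
		struct page *new = alloc_pages_node(target_node,
						    GFP_HIGHUSER, 0);
		if (!new)
			break;
		list_add(&new->lru, &newlist);
	}

	/*
	 * A target list enables direct migration with swap as the
	 * fallback; passing NULL instead forces the swap-only path.
	 */
	not_migrated = migrate_pages(pagelist, &newlist);

	/* Pages that could not be migrated go back to the LRU */
	putback_lru_pages(pagelist);

	/* Free target pages that were not consumed by migrate_pages() */
	list_for_each_entry_safe(page, page2, &newlist, lru) {
		list_del(&page->lru);
		__free_page(page);
	}

	return not_migrated;
}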