Modify policy layer to support direct page migration

- Add migrate_pages_to() allowing the migration of a list of pages to
  a specified node or to vma with a specific allocation policy.

- Modify do_migrate_pages() to do a staged move of pages from the
  source nodes to the target nodes.

Signed-off-by: Christoph Lameter

Index: linux-2.6.14-rc5-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.14-rc5-mm1.orig/mm/mempolicy.c	2005-10-26 09:50:01.000000000 -0700
+++ linux-2.6.14-rc5-mm1/mm/mempolicy.c	2005-10-26 10:48:57.000000000 -0700
@@ -234,6 +234,40 @@ static void migrate_page_add(struct vm_a
 	}
 }
 
+/*
+ * Migrate a list of pages to a certain destination.
+ *
+ * return the number of pages not migrated or error code
+ */
+static int migrate_pages_to(struct list_head *l, struct vm_area_struct *vma, int node)
+{
+	LIST_HEAD(newlist);
+	int err = 0;
+	struct page *page;
+	struct list_head *p;
+
+	list_for_each(p, l) {
+		/* Allocate one destination page per source page, either
+		 * following the vma's policy or on the requested node. */
+		if (vma)
+			page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start);
+		else
+			page = alloc_pages_node(node, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add(&page->lru, &newlist);
+	}
+	err = migrate_pages(l, &newlist);
+out:
+	/* Free the unused destination pages left on newlist. Unmigrated
+	 * source pages remain on l for the caller to put back on the LRU.
+	 * Unlink before freeing: page->lru is invalid after __free_page(). */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	return err;
+}
+
 /* Ensure all existing pages follow the policy. */
 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end,
@@ -500,7 +534,7 @@ long do_mbind(unsigned long start, unsig
 	if (!IS_ERR(vma)) {
 		err = mbind_range(vma, start, end, new);
 		if (!list_empty(&pagelist))
-			migrate_pages(&pagelist, NULL);
+			migrate_pages_to(&pagelist, vma, -1);
 		if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
@@ -631,6 +665,29 @@ long do_get_mempolicy(int *policy, nodem
 }
 
 /*
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+	nodemask_t nodes;
+	LIST_HEAD(pagelist);
+	int err = 0;
+
+	nodes_setall(nodes);
+	node_clear(source, nodes);
+
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	if (!list_empty(&pagelist)) {
+		err = migrate_pages_to(&pagelist, NULL, dest);
+		if (!list_empty(&pagelist))
+			putback_lru_pages(&pagelist);
+	}
+	return err;
+}
+
+/*
  * For now migrate_pages simply swaps out the pages from nodes that are in
  * the source set but not in the target set. In the future, we would
  * want a function that moves pages between the two nodesets in such
@@ -642,22 +699,52 @@ int do_migrate_pages(struct mm_struct *m
	nodemask_t *from_nodes, nodemask_t *to_nodes, int flags)
 {
 	LIST_HEAD(pagelist);
-	int count = 0;
-	nodemask_t nodes;
-
-	nodes_andnot(nodes, *from_nodes, *to_nodes);
-	nodes_complement(nodes, nodes);
+	int err = 0;
+	int count;
+	int node;
+	int tnodes = nodes_weight(*to_nodes);
+	int targets[tnodes];
+
+	count = 0;
+	for_each_node_mask(node, *to_nodes)
+		targets[count++] = node;
 
 	down_read(&mm->mmap_sem);
-	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
-	if (!list_empty(&pagelist)) {
-		migrate_pages(&pagelist, NULL);
-		if (!list_empty(&pagelist))
-			count = putback_lru_pages(&pagelist);
+
+	/*
+	 * Migration needs to happen in such a way that we
+	 * do not migrate too many pages intermittently on one
+	 * node.
+	 */
+	if (first_node(*from_nodes) < first_node(*to_nodes)) {
+		/* Walk backward through the source nodelist */
+		count = tnodes - 1;
+
+		for (node = MAX_NUMNODES-1; node >= 0; node--)
+			if (node_isset(node, *from_nodes)) {
+				err = migrate_node(mm, node, targets[count], flags);
+				if (err)
+					goto out;
+
+				if (count > 0)
+					count--;
+				else
+					count = tnodes-1;
+			}
+	} else {
+		/* Walk forward through the source nodelist */
+		count = 0;
+		for_each_node_mask(node, *from_nodes) {
+			err = migrate_node(mm, node, targets[count % tnodes], flags);
+			if (err)
+				goto out;
+
+			count++;
+		}
 	}
+out:
 	up_read(&mm->mmap_sem);
-	return count;
+	return err;
 }
 
 /*