Control manual page migration via /proc/<pid>/numa_maps

/proc/<pid>/numa_maps shows how the pages are allocated for each virtual
memory area.  This patch makes /proc/<pid>/numa_maps writable so that pages
can be migrated between nodes.  The syntax written to the file is:

	<address> N<node>[(<pages>)][><target-node>]

Example: Lets say we have the following numa_maps information:

2000000000000000 prefer=1 MaxRef=42 Pages=11 Mapped=11 N0=3 N1=2 N2=2 N3=4
2000000000038000 prefer=1 MaxRef=1 Pages=2 Mapped=2 Anon=2 N1=2
2000000000040000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1
2000000000058000 prefer=1 MaxRef=42 Pages=59 Mapped=59 N0=14 N1=16 N2=15 N3=14
2000000000260000 prefer=1 MaxRef=0 Pages=0 Mapped=0

In order to move the 2 pages on Node 1 for the vma at 2000000000038000 to
node 2 we could do the following:

	echo "2000000000038000 N1(2)>2" >/proc/1234/numa_maps

The number of pages is optional.  If not specified then all pages of a node
are moved.

	echo "2000000000038000 N1>2" >/proc/1234/numa_maps

would have the same effect since there are 2 pages on node 1.

The target is also optional.  If not specified then the pages are moved to
some location in the currently allowed set of nodes for that task.

	echo "2000000000038000 N1" >/proc/1234/numa_maps

would remove all pages from node 1 for this vma.
Signed-off-by: Christoph Lameter

Index: linux-2.6.14-rc5-mm1/mm/mempolicy.c
===================================================================
--- linux-2.6.14-rc5-mm1.orig/mm/mempolicy.c	2005-11-03 11:52:55.000000000 -0800
+++ linux-2.6.14-rc5-mm1/mm/mempolicy.c	2005-11-03 12:02:49.000000000 -0800
@@ -1536,6 +1536,9 @@ void numa_default_policy(void)
  * numa proc interface to allow a display of the allocation patterns
  * in a vma
  */
+
+#define MPOL_BUFFER_SIZE 50
+
 struct numa_maps {
 	unsigned long pages;
 	unsigned long anon;
@@ -1643,7 +1646,7 @@ static int show_numa_map(struct seq_file
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md;
 	int n;
-	char buffer[50];
+	char buffer[MPOL_BUFFER_SIZE];
 
 	if (!vma->vm_mm)
 		return 0;
@@ -1652,7 +1655,7 @@ static int show_numa_map(struct seq_file
 	if (!md)
 		return 0;
 
-	mpol_to_str(buffer, sizeof(buffer), get_vma_policy(task, vma, vma->vm_start));
+	mpol_to_str(buffer, MPOL_BUFFER_SIZE, get_vma_policy(task, vma, vma->vm_start));
 
 	seq_printf(m, "%08lx %s MaxRef=%lu Pages=%lu Mapped=%lu",
 			vma->vm_start, buffer, md->mapcount_max,
@@ -1677,3 +1680,119 @@ struct seq_operations proc_pid_numa_maps
 	.show	= show_numa_map
 };
 
+ssize_t numa_maps_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *ppos)
+{
+	struct task_struct *task = proc_task(file->f_dentry->d_inode);
+	struct vm_area_struct *vma;
+	char *p, *q;
+	unsigned long addr;
+	nodemask_t nodes;
+	int target = -1;
+	int pages = -1;
+	int rc;
+	char buffer[MPOL_BUFFER_SIZE];
+
+	nodes_clear(nodes);
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (!task->mm || count >= MPOL_BUFFER_SIZE)
+		return -EINVAL;
+
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+	/* Terminate string: simple_strtoul must not run off the buffer */
+	buffer[count] = '\0';
+
+	addr = simple_strtoul(buffer, &p, 16);
+	if (*p++ != ' ')
+		return -EINVAL;
+
+	vma = find_vma(task->mm, addr);
+	/*
+	 * find_vma() only guarantees addr < vm_end; reject addresses
+	 * that fall into a hole below the returned vma.
+	 */
+	if (!vma || vma->vm_start > addr)
+		return -EINVAL;
+
+	if (toupper(*p) == 'N') {
+		p++;
+
+		/* Node number must follow */
+		node_set(simple_strtoul(p, &q, 10), nodes);
+
+		if (q == p)
+			return -EINVAL;
+
+		/* Check for optional number of pages */
+		if (*q == '(') {
+			q++;
+			pages = simple_strtoul(q, &p, 10);
+			if (*p != ')')
+				return -EINVAL;
+			p++;
+		} else
+			p = q;
+
+		if (*p == '>') {
+			p++;
+			target = simple_strtoul(p, &q, 10);
+			if (q == p)
+				return -EINVAL;
+			p = q;
+		}
+
+		down_read(&vma->vm_mm->mmap_sem);
+		rc = try_to_migrate_vma_pages(vma, nodes, target, pages);
+		up_read(&vma->vm_mm->mmap_sem);
+		if (rc)
+			printk(KERN_ERR "migrate_vma(%p,%d,%d)=%d\n", vma, target, pages, rc);
+
+		return p - buffer;
+	} else {
+		struct mempolicy *pol, *old_policy;
+
+		/*
+		 * Note that the policy may contain nodes not allowed
+		 * in the context of the current cpuset
+		 */
+		pol = str_to_mpol(p);
+		if (!pol)
+			return -EINVAL;
+
+		down_write(&vma->vm_mm->mmap_sem);
+		old_policy = vma->vm_policy;
+
+		if (!mpol_equal(pol, old_policy)) {
+			if (pol->policy == MPOL_DEFAULT) {
+				/* Default policy is stored as NULL vm_policy;
+				 * free the freshly allocated one. */
+				mpol_free(pol);
+				pol = NULL;
+			}
+
+			vma->vm_policy = pol;
+		} else
+			old_policy = pol;
+
+		up_write(&vma->vm_mm->mmap_sem);
+		mpol_free(old_policy);
+		return count;
+	}
+}
+
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+	struct task_struct *task = proc_task(inode);
+	int ret = seq_open(file, &proc_pid_numa_maps_op);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+struct file_operations proc_numa_maps_operations = {
+	.open		= numa_maps_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.write		= numa_maps_write
+};
Index: linux-2.6.14-rc5-mm1/fs/proc/base.c
===================================================================
--- linux-2.6.14-rc5-mm1.orig/fs/proc/base.c	2005-10-19 23:23:05.000000000 -0700
+++ linux-2.6.14-rc5-mm1/fs/proc/base.c	2005-11-03 11:55:55.000000000 -0800
@@ -618,24 +618,8 @@ static struct file_operations proc_maps_
 };
 
 #ifdef CONFIG_NUMA
-extern struct seq_operations proc_pid_numa_maps_op;
-
-static int numa_maps_open(struct inode *inode, struct file *file)
-{
-	struct task_struct *task = proc_task(inode);
-	int ret = seq_open(file, &proc_pid_numa_maps_op);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = task;
-	}
-	return ret;
-}
-
-static struct file_operations proc_numa_maps_operations = {
-	.open		= numa_maps_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_numa_policy_operations;
 #endif
 
 #ifdef CONFIG_MMU