Display and modify the memory policy of a process through /proc/<pid>/numa_policy This patch adds a new proc entry for each process called "numa_policy". When read, this file outputs a text string describing the memory policy of the process. A new policy may be written to "numa_policy" in order to change the memory policy of the process. The following strings may be written to /proc/<pid>/numa_policy: default -> Reset allocation policy to default prefer=<node> -> Prefer allocation on the specified node interleave={nodelist} -> Interleaved allocation on the given nodes bind={zonelist} -> Restrict allocation to the specified zones. Zones are specified either by providing only the node number or by using the notation node/zonename, e.g. 3/normal 1/high 0/dma etc. Additionally, the patch also adds write capability to "numa_maps". One can write a VMA address followed by the policy to that file to change the mempolicy of an individual virtual memory area, e.g. echo "2aaaaaaab000 bind={0/Normal}" >numa_maps This is compatible with the output format of numa_maps. These functions are a core requirement for the ability to manage the memory allocation of processes dynamically. This may be done manually by the administrator as described here, or one may write a batch process manager that manages the memory on a NUMA system. The patch requires my numa_maps patch from Andrew Morton's tree. Here is an example. We want to reorganize how process 12024 is allocating memory. We would like to allocate most pages on node 1. However, we would like the heap pages to be allocated interleaved on nodes 2 and 3 to allow better throughput. 
cd /proc/12024/ echo "prefer=1" >numa_policy margin:/proc/12024 # cat numa_maps 00000000 prefer=1 MaxRef=0 Pages=0 Mapped=0 2000000000000000 prefer=1 MaxRef=42 Pages=11 Mapped=11 N0=3 N1=2 N2=2 N3=4 2000000000038000 prefer=1 MaxRef=1 Pages=2 Mapped=2 Anon=2 N1=2 2000000000040000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 2000000000058000 prefer=1 MaxRef=42 Pages=59 Mapped=59 N0=14 N1=16 N2=15 N3=14 2000000000260000 prefer=1 MaxRef=0 Pages=0 Mapped=0 2000000000268000 prefer=1 MaxRef=1 Pages=2 Mapped=2 Anon=2 N1=2 2000000000274000 prefer=1 MaxRef=1 Pages=3 Mapped=3 Anon=3 N1=3 2000000000280000 prefer=1 MaxRef=8 Pages=3 Mapped=3 N0=3 2000000000300000 prefer=1 MaxRef=8 Pages=2 Mapped=2 N0=2 2000000000318000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 4000000000000000 prefer=1 MaxRef=6 Pages=2 Mapped=2 N1=2 6000000000004000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 6000000000008000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 60000fff7fffc000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 60000ffffff3c000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 margin:/proc/12024 # cat maps 00000000-00004000 r--p 00000000 00:00 0 2000000000000000-200000000002c000 r-xp 00000000 08:04 516 /lib/ld-2.3.3.so 2000000000038000-2000000000040000 rw-p 00028000 08:04 516 /lib/ld-2.3.3.so 2000000000040000-2000000000044000 rw-p 2000000000040000 00:00 0 2000000000058000-2000000000260000 r-xp 00000000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000260000-2000000000268000 ---p 00208000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000268000-2000000000274000 rw-p 00200000 08:04 54707842 /lib/tls/libc.so.6.1 2000000000274000-2000000000280000 rw-p 2000000000274000 00:00 0 2000000000280000-20000000002b4000 r--p 00000000 08:04 9126923 /usr/lib/locale/en_US.utf8/LC_CTYPE 2000000000300000-2000000000308000 r--s 00000000 08:04 60071467 /usr/lib/gconv/gconv-modules.cache 2000000000318000-2000000000328000 rw-p 2000000000318000 00:00 0 4000000000000000-4000000000008000 r-xp 00000000 08:04 29576399 
/sbin/mingetty 6000000000004000-6000000000008000 rw-p 00004000 08:04 29576399 /sbin/mingetty 6000000000008000-600000000002c000 rw-p 6000000000008000 00:00 0 [heap] 60000fff7fffc000-60000fff80000000 rw-p 60000fff7fffc000 00:00 0 60000ffffff3c000-60000ffffff90000 rw-p 60000ffffff3c000 00:00 0 [stack] a000000000000000-a000000000020000 ---p 00000000 00:00 0 [vdso] echo "2xxxx interleave={2,3}" >numa_maps margin:/proc/12024 # cat numa_maps 00000000 prefer=1 MaxRef=0 Pages=0 Mapped=0 2000000000000000 prefer=1 MaxRef=42 Pages=11 Mapped=11 N0=3 N1=2 N2=2 N3=4 2000000000038000 prefer=1 MaxRef=1 Pages=2 Mapped=2 Anon=2 N1=2 2000000000040000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 2000000000058000 prefer=1 MaxRef=42 Pages=59 Mapped=59 N0=14 N1=16 N2=15 N3=14 2000000000260000 prefer=1 MaxRef=0 Pages=0 Mapped=0 2000000000268000 prefer=1 MaxRef=1 Pages=2 Mapped=2 Anon=2 N1=2 2000000000274000 prefer=1 MaxRef=1 Pages=3 Mapped=3 Anon=3 N1=3 2000000000280000 prefer=1 MaxRef=8 Pages=3 Mapped=3 N0=3 2000000000300000 prefer=1 MaxRef=8 Pages=2 Mapped=2 N0=2 2000000000318000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 4000000000000000 prefer=1 MaxRef=6 Pages=2 Mapped=2 N1=2 6000000000004000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 6000000000008000 interleave={2,3} MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 60000fff7fffc000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 60000ffffff3c000 prefer=1 MaxRef=1 Pages=1 Mapped=1 Anon=1 N1=1 Signed-off-by: Christoph Lameter Index: linux-2.6.13-rc4-mm1/fs/proc/base.c =================================================================== --- linux-2.6.13-rc4-mm1.orig/fs/proc/base.c 2005-08-01 13:00:01.000000000 -0700 +++ linux-2.6.13-rc4-mm1/fs/proc/base.c 2005-08-01 13:47:06.000000000 -0700 @@ -102,7 +102,9 @@ PROC_TGID_STAT, PROC_TGID_STATM, PROC_TGID_MAPS, - PROC_TGID_NUMA_MAPS, +#ifdef CONFIG_NUMA + PROC_TGID_NUMA_POLICY, +#endif PROC_TGID_MOUNTS, PROC_TGID_WCHAN, PROC_TGID_EMAPS, @@ -141,7 +143,9 @@ PROC_TID_STAT, PROC_TID_STATM, 
PROC_TID_MAPS, - PROC_TID_NUMA_MAPS, +#ifdef CONFIG_NUMA + PROC_TID_NUMA_POLICY, +#endif PROC_TID_MOUNTS, PROC_TID_WCHAN, PROC_TID_EMAPS, @@ -186,7 +190,7 @@ E(PROC_TGID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TGID_MAPS, "maps", S_IFREG|S_IRUGO), #ifdef CONFIG_NUMA - E(PROC_TGID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), + E(PROC_TGID_NUMA_POLICY, "numa_policy", S_IFREG|S_IRUGO|S_IWUSR), #endif E(PROC_TGID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), #ifdef CONFIG_SECCOMP @@ -226,7 +230,7 @@ E(PROC_TID_STATM, "statm", S_IFREG|S_IRUGO), E(PROC_TID_MAPS, "maps", S_IFREG|S_IRUGO), #ifdef CONFIG_NUMA - E(PROC_TID_NUMA_MAPS, "numa_maps", S_IFREG|S_IRUGO), + E(PROC_TID_NUMA_POLICY, "numa_policy", S_IFREG|S_IRUGO|S_IWUSR), #endif E(PROC_TID_MEM, "mem", S_IFREG|S_IRUSR|S_IWUSR), #ifdef CONFIG_SECCOMP @@ -565,24 +569,7 @@ }; #ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +extern struct file_operations proc_numa_policy_operations; #endif extern struct seq_operations proc_pid_emaps_op; @@ -1614,9 +1601,9 @@ inode->i_fop = &proc_maps_operations; break; #ifdef CONFIG_NUMA - case PROC_TID_NUMA_MAPS: - case PROC_TGID_NUMA_MAPS: - inode->i_fop = &proc_numa_maps_operations; + case PROC_TID_NUMA_POLICY: + case PROC_TGID_NUMA_POLICY: + inode->i_fop = &proc_numa_policy_operations; break; #endif case PROC_TID_MEM: Index: linux-2.6.13-rc4-mm1/fs/proc/task_mmu.c =================================================================== --- linux-2.6.13-rc4-mm1.orig/fs/proc/task_mmu.c 2005-08-01 13:00:01.000000000 -0700 +++ 
linux-2.6.13-rc4-mm1/fs/proc/task_mmu.c 2005-08-01 16:11:00.000000000 -0700 @@ -407,131 +407,74 @@ }; #ifdef CONFIG_NUMA +/* + * Retrieval and setting of the memory policy for a task + */ +static ssize_t numa_policy_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = proc_task(file->f_dentry->d_inode); + char buffer[50]; /* Should this really be on the stack ?? */ + size_t len; + loff_t __ppos = *ppos; -struct numa_maps { - unsigned long pages; - unsigned long anon; - unsigned long mapped; - unsigned long mapcount_max; - unsigned long node[MAX_NUMNODES]; -}; + len = mpol_to_str(buffer, sizeof(buffer), task->mempolicy); + if (__ppos >= len) + return 0; + if (count > len-__ppos) + count = len-__ppos; + if (copy_to_user(buf, buffer + __ppos, count)) + return -EFAULT; + *ppos = __ppos + count; + return count; +} /* - * Calculate numa node maps for a vma + * This piece of code is run in the context of the task where we want to change + * the memory policy. 
*/ -static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma) +static int policy_change_notifier(struct notifier_block *n, unsigned long x, void*v) { - struct page *page; - unsigned long vaddr; - struct mm_struct *mm = vma->vm_mm; - int i; - struct numa_maps *md = kmalloc(sizeof(struct numa_maps), GFP_KERNEL); - - if (!md) - return NULL; - md->pages = 0; - md->anon = 0; - md->mapped = 0; - md->mapcount_max = 0; - for_each_node(i) - md->node[i] =0; - - spin_lock(&mm->page_table_lock); - for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) { - page = follow_page(mm, vaddr, 0); - if (page) { - int count = page_mapcount(page); - - if (count) - md->mapped++; - if (count > md->mapcount_max) - md->mapcount_max = count; - md->pages++; - if (PageAnon(page)) - md->anon++; - md->node[page_to_nid(page)]++; - } - } - spin_unlock(&mm->page_table_lock); - return md; + struct mempolicy *new = str_to_mpol((char *)(n+1)); + + if (!new) + return -EINVAL; + + /* Maybe the next segment could become do_set_mempolicy ? 
*/ + mpol_free(xchg(&current->mempolicy, new)); + if (new && new->policy == MPOL_INTERLEAVE) + current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES); + + kfree(n); + return 0; } -static int show_numa_map(struct seq_file *m, void *v) +static ssize_t numa_policy_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - struct mempolicy *pol; - struct numa_maps *md; - struct zone **z; - int n; - int first; + struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct notifier_block *n; - if (!vma->vm_mm) - return 0; + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; - md = get_numa_maps(vma); - if (!md) - return 0; + if (!task->mm) + return -EINVAL; - seq_printf(m, "%08lx", vma->vm_start); - pol = get_vma_policy(task, vma, vma->vm_start); - /* Print policy */ - switch (pol->policy) { - case MPOL_PREFERRED: - seq_printf(m, " prefer=%d", pol->v.preferred_node); - break; - case MPOL_BIND: - seq_printf(m, " bind={"); - first = 1; - for (z = pol->v.zonelist->zones; *z; z++) { - - if (!first) - seq_putc(m, ','); - else - first = 0; - seq_printf(m, "%d/%s", (*z)->zone_pgdat->node_id, - (*z)->name); - } - seq_putc(m, '}'); - break; - case MPOL_INTERLEAVE: - seq_printf(m, " interleave={"); - first = 1; - for_each_node(n) { - if (test_bit(n, pol->v.nodes)) { - if (!first) - seq_putc(m,','); - else - first = 0; - seq_printf(m, "%d",n); - } - } - seq_putc(m, '}'); - break; - default: - seq_printf(m," default"); - break; - } - seq_printf(m, " MaxRef=%lu Pages=%lu Mapped=%lu", - md->mapcount_max, md->pages, md->mapped); - if (md->anon) - seq_printf(m," Anon=%lu",md->anon); - - for_each_online_node(n) { - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + n = kmalloc(sizeof(struct notifier_block) + count, GFP_KERNEL); + memset(n, 0, sizeof(struct notifier_block) + count); + n->notifier_call = policy_change_notifier; + if (copy_from_user(n+1, buf, count)) { + 
kfree(n); + return -EFAULT; } - seq_putc(m, '\n'); - kfree(md); - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; - return 0; + notifier_chain_register(&task->todo, n); + return count; } -struct seq_operations proc_pid_numa_maps_op = { - .start = m_start, - .next = m_next, - .stop = m_stop, - .show = show_numa_map + +struct file_operations proc_numa_policy_operations = { + .read = numa_policy_read, + .write = numa_policy_write }; #endif