Avoid allocating interleave from almost full nodes

Interleave allocation often goes over large sets of nodes. Some of those
nodes may have tasks on them that heavily use memory. Overallocating such
nodes may reduce the performance of those tasks, so it is better to avoid
nodes that already have most of their memory in use.

This patch checks the number of free pages on a node. If it is lower than
a configurable limit (set via /proc/sys/vm/min_interleave_ratio) then we
restart interleave at a random node in the allowed set of nodes. We do not
check again whether the random node is also overallocated, because that
would introduce a loop into a performance critical section of the kernel.
This means that we may still allocate on overallocated nodes if multiple
nodes in the interleave set are overallocated; the likelihood increases
with the number of overallocated nodes in the set.

This is only effective for interleave pages that are placed without regard
to an address in a process (anonymous pages are typically placed depending
on an interleave node generated from the address). It therefore applies
mainly to slab interleave and page cache interleave.

Signed-off-by: Christoph Lameter

Index: linux-2.6.19-rc2-mm2/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.19-rc2-mm2.orig/Documentation/sysctl/vm.txt	2006-10-23 17:09:51.796097864 -0500
+++ linux-2.6.19-rc2-mm2/Documentation/sysctl/vm.txt	2006-10-23 17:57:53.739950472 -0500
@@ -198,6 +198,28 @@ and may not be fast.
 
 =============================================================
 
+min_interleave_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the pages in each zone. If less than this percentage
+of a zone's pages are free then interleave will attempt to leave the
+zone alone and allocate from other zones. This has a balancing effect
+on the system if interleave and node local allocations are mixed
+throughout the system. Interleave pages will not cause zone reclaim
+and will leave some memory on each node so that node local allocations
+can still occur. Interleave allocations spread all over the system
+until global reclaim kicks in.
+
+The minimum does not apply to pages that are placed using interleave
+based on an address, such as implemented for anonymous pages. It is
+effective for slab allocations, huge page allocations and page cache
+allocations.
+
+The default ratio is 10 percent.
+
+=============================================================
+
 panic_on_oom
 
 This enables or disables panic on out-of-memory feature.
 If this is set to 1,

Index: linux-2.6.19-rc2-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc2-mm2.orig/include/linux/mmzone.h	2006-10-23 17:52:35.882059867 -0500
+++ linux-2.6.19-rc2-mm2/include/linux/mmzone.h	2006-10-23 17:57:53.811245558 -0500
@@ -192,6 +192,12 @@ struct zone {
 	 */
 	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
+	/*
+	 * If a zone has fewer free pages than this then
+	 * interleave will attempt to bypass the zone.
+	 */
+	unsigned long		min_interleave_pages;
+
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -566,6 +572,8 @@ int sysctl_min_unmapped_ratio_sysctl_han
 			struct file *, void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int sysctl_min_interleave_ratio_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */

Index: linux-2.6.19-rc2-mm2/include/linux/swap.h
===================================================================
--- linux-2.6.19-rc2-mm2.orig/include/linux/swap.h	2006-10-23 17:09:51.812700823 -0500
+++ linux-2.6.19-rc2-mm2/include/linux/swap.h	2006-10-23 17:57:53.833708393 -0500
@@ -197,6 +197,7 @@ extern long vm_total_pages;
 extern int zone_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
+extern int sysctl_min_interleave_ratio;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0

Index: linux-2.6.19-rc2-mm2/include/linux/sysctl.h
===================================================================
--- linux-2.6.19-rc2-mm2.orig/include/linux/sysctl.h	2006-10-23 17:09:51.820513980 -0500
+++ linux-2.6.19-rc2-mm2/include/linux/sysctl.h	2006-10-23 17:57:53.853241294 -0500
@@ -199,6 +199,7 @@ enum
 	VM_SWAP_PREFETCH=36,	/* swap prefetch */
 	VM_READAHEAD_RATIO=37,	/* percent of read-ahead size to thrashing-threshold */
 	VM_READAHEAD_HIT_RATE=38, /* one accessed page legitimizes so many read-ahead pages */
+	VM_MIN_INTERLEAVE=39,	/* Limit for interleave */
 };

Index: linux-2.6.19-rc2-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/kernel/sysctl.c	2006-10-23 17:52:35.863503614 -0500
+++ linux-2.6.19-rc2-mm2/kernel/sysctl.c	2006-10-23 17:57:53.877657419 -0500
@@ -1028,6 +1028,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_MIN_INTERLEAVE,
+		.procname	= "min_interleave_ratio",
+		.data		= &sysctl_min_interleave_ratio,
+		.maxlen		= sizeof(sysctl_min_interleave_ratio),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_min_interleave_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_X86_32
 	{

Index: linux-2.6.19-rc2-mm2/mm/mempolicy.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/mempolicy.c	2006-10-23 17:09:51.856649831 -0500
+++ linux-2.6.19-rc2-mm2/mm/mempolicy.c	2006-10-23 18:14:14.022022872 -0500
@@ -1118,16 +1118,56 @@ static struct zonelist *zonelist_policy(
 	return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 }
 
+/*
+ * Generic interleave function to be used by cpusets and memory policies.
+ */
+int __interleave(int current_node, nodemask_t *nodes)
+{
+	unsigned next;
+	struct zone *z;
+	int w, n;
+
+	next = next_node(current_node, *nodes);
+	if (next >= MAX_NUMNODES)
+		next = first_node(*nodes);
+
+	/*
+	 * If we are overallocating a zone then restart interleave at some
+	 * other node. We should not loop here since this is a performance
+	 * critical path. Just get another random zone and allocate from it.
+	 */
+	z = &NODE_DATA(next)->node_zones[policy_zone];
+	if (likely(z->free_pages >= z->min_interleave_pages))
+		return next;
+
+	w = nodes_weight(*nodes) - 1;
+
+	if (w <= 0)
+		return next;
+
+	/*
+	 * We want a random node in the allowed set while skipping the
+	 * overallocated node. get_cycles() gives us a rapidly increasing
+	 * cycle counter that we can use to get a pseudo random result
+	 * without too much effort.
+	 */
+	n = get_cycles() % w;
+
+	current_node = next;
+	next = first_node(*nodes);
+	while (next == current_node || --n >= 0)
+		next = next_node(next, *nodes);
+	return next;
+}
+
 /* Do dynamic interleaving for a process */
-static unsigned interleave_nodes(struct mempolicy *policy)
+static int interleave_nodes(struct mempolicy *policy)
 {
 	unsigned nid, next;
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	next = __interleave(nid, &policy->v.nodes);
 	me->il_next = next;
 	return nid;
 }

Index: linux-2.6.19-rc2-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/mm/page_alloc.c	2006-10-23 17:52:35.819554594 -0500
+++ linux-2.6.19-rc2-mm2/mm/page_alloc.c	2006-10-23 17:57:53.939186055 -0500
@@ -2058,6 +2058,9 @@ static void setup_pagelist_highmark(stru
 
 #ifdef CONFIG_NUMA
+
+int sysctl_min_interleave_ratio = 10;
+
 /*
  * Boot pageset table. One per cpu which is going to be used for all
  * zones and all nodes. The parameters will be set in such a way
@@ -2653,6 +2656,7 @@ static void __meminit free_area_init_cor
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
 		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->min_interleave_pages = (realsize * sysctl_min_interleave_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -3243,6 +3247,21 @@ int sysctl_min_slab_ratio_sysctl_handler
 			sysctl_min_slab_ratio) / 100;
 	return 0;
 }
+int sysctl_min_interleave_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_interleave_pages = (zone->present_pages *
+				sysctl_min_interleave_ratio) / 100;
+	return 0;
+}
 #endif
 
 #if CONFIG_MULTI_ZONE

Index: linux-2.6.19-rc2-mm2/include/linux/mempolicy.h
===================================================================
--- linux-2.6.19-rc2-mm2.orig/include/linux/mempolicy.h	2006-10-23 17:09:51.831257071 -0500
+++ linux-2.6.19-rc2-mm2/include/linux/mempolicy.h	2006-10-23 17:57:53.952859085 -0500
@@ -156,6 +156,7 @@ extern void mpol_fix_fork_child_flag(str
 #else
 #define current_cpuset_is_being_rebound() 0
 #endif
+extern int __interleave(int node, nodemask_t *nodes);
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,

Index: linux-2.6.19-rc2-mm2/kernel/cpuset.c
===================================================================
--- linux-2.6.19-rc2-mm2.orig/kernel/cpuset.c	2006-10-23 17:09:51.847860030 -0500
+++ linux-2.6.19-rc2-mm2/kernel/cpuset.c	2006-10-23 17:57:53.969462051 -0500
@@ -2475,9 +2475,8 @@ int cpuset_mem_spread_node(void)
 {
 	int node;
 
-	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
-	if (node == MAX_NUMNODES)
-		node = first_node(current->mems_allowed);
+	node = __interleave(current->cpuset_mem_spread_rotor,
+				&current->mems_allowed);
 	current->cpuset_mem_spread_rotor = node;
 	return node;
 }
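
Not part of the patch: a userspace sketch of the node selection done by
__interleave() above, for illustration only. The plain bitmask stands in
for nodemask_t, rand() stands in for get_cycles(), and node_is_full()
stands in for the free_pages < min_interleave_pages test; all of these
names are assumptions made for the example, not kernel interfaces.

	#include <stdio.h>
	#include <stdlib.h>

	#define MAX_NODES 8

	/* Assumption for the demo: pretend node 2 is overallocated. */
	static int node_is_full(int node)
	{
		return node == 2;
	}

	/* Next node set in mask after the given one, or MAX_NODES. */
	static int next_node(int node, unsigned mask)
	{
		for (node++; node < MAX_NODES; node++)
			if (mask & (1u << node))
				return node;
		return MAX_NODES;
	}

	static int first_node(unsigned mask)
	{
		return next_node(-1, mask);
	}

	static int interleave(int current_node, unsigned mask)
	{
		int next, w, n;

		next = next_node(current_node, mask);
		if (next >= MAX_NODES)
			next = first_node(mask);

		if (!node_is_full(next))
			return next;

		/*
		 * Overallocated: restart at a pseudo-random node while
		 * skipping the full one, without looping on the check.
		 * (__builtin_popcount is the gcc equivalent of
		 * nodes_weight().)
		 */
		w = __builtin_popcount(mask) - 1;
		if (w <= 0)
			return next;

		n = rand() % w;
		current_node = next;
		next = first_node(mask);
		while (next == current_node || --n >= 0)
			next = next_node(next, mask);
		return next;
	}

	int main(void)
	{
		unsigned mask = 0x1e;	/* nodes 1-4 allowed */
		int i, node = 1;

		for (i = 0; i < 8; i++) {
			node = interleave(node, mask);
			printf("allocating on node %d\n", node);
		}
		return 0;
	}

Running this shows node 2 being skipped: whenever the rotor would land on
it, one of the other allowed nodes is picked pseudo-randomly instead,
which is exactly the single-shot restart described in the changelog. On a
kernel with this patch applied the threshold itself can be tuned at
runtime, e.g. "echo 20 > /proc/sys/vm/min_interleave_ratio".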