Index: linux-2.6.19-rc2-mm1/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.19-rc2-mm1.orig/Documentation/sysctl/vm.txt 2006-10-23 14:45:03.341155754 -0500 +++ linux-2.6.19-rc2-mm1/Documentation/sysctl/vm.txt 2006-10-23 15:22:01.482622585 -0500 @@ -198,6 +198,29 @@ and may not be fast. ============================================================= +min_interleave_ratio: + +This is available only on NUMA kernels. + +A percentage of the total pages in each zone. If less than this +percentage of pages are free then interleave will attempt to +leave this zone alone and allocate from other zones. This results +in a balancing effect on the system if interleave and node local allocations +are mixed throughout the system. Interleave pages will not cause zone +reclaim and leave some memory on node to allow node local allocation to +occur. Interleave allocations will allocate all over the system until global +reclaim kicks in. + +The minimum does not apply to pages that are placed using interleave +based on an address such as implemented for anonymous pages. It is +effective for slab allocations, huge page allocations and page cache +allocations. A set of nodes to be interleaved must at least +contain 3 nodes in order for this mechanism to be activated. + +The default ratio is 10 percent. + +============================================================= + panic_on_oom This enables or disables panic on out-of-memory feature. 
If this is set to 1, Index: linux-2.6.19-rc2-mm1/include/linux/mmzone.h =================================================================== --- linux-2.6.19-rc2-mm1.orig/include/linux/mmzone.h 2006-10-23 14:45:03.348968865 -0500 +++ linux-2.6.19-rc2-mm1/include/linux/mmzone.h 2006-10-23 14:49:11.452433653 -0500 @@ -192,6 +192,12 @@ struct zone { */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; + /* + * If a zone has fewer free pages than this, interleave will + * attempt to bypass the node + */ + unsigned long min_interleave_pages; + struct per_cpu_pageset *pageset[NR_CPUS]; #else struct per_cpu_pageset pageset[NR_CPUS]; @@ -568,6 +574,8 @@ int sysctl_min_unmapped_ratio_sysctl_han struct file *, void __user *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int sysctl_min_interleave_ratio_sysctl_handler(struct ctl_table *, int, + struct file *, void __user *, size_t *, loff_t *); #include /* Returns the number of the current Node. 
*/ Index: linux-2.6.19-rc2-mm1/include/linux/swap.h =================================================================== --- linux-2.6.19-rc2-mm1.orig/include/linux/swap.h 2006-10-23 14:45:03.355805338 -0500 +++ linux-2.6.19-rc2-mm1/include/linux/swap.h 2006-10-23 14:49:11.463176686 -0500 @@ -197,6 +197,7 @@ extern long vm_total_pages; extern int zone_reclaim_mode; extern int sysctl_min_unmapped_ratio; extern int sysctl_min_slab_ratio; +extern int sysctl_min_interleave_ratio; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 Index: linux-2.6.19-rc2-mm1/include/linux/sysctl.h =================================================================== --- linux-2.6.19-rc2-mm1.orig/include/linux/sysctl.h 2006-10-23 14:45:03.365571727 -0500 +++ linux-2.6.19-rc2-mm1/include/linux/sysctl.h 2006-10-23 14:49:11.473919720 -0500 @@ -199,6 +199,7 @@ enum VM_SWAP_PREFETCH=36, /* swap prefetch */ VM_READAHEAD_RATIO=37, /* percent of read-ahead size to thrashing-threshold */ VM_READAHEAD_HIT_RATE=38, /* one accessed page legitimizes so many read-ahead pages */ + VM_MIN_INTERLEAVE=39, /* Limit for interleave */ }; Index: linux-2.6.19-rc2-mm1/kernel/sysctl.c =================================================================== --- linux-2.6.19-rc2-mm1.orig/kernel/sysctl.c 2006-10-23 14:45:03.374361478 -0500 +++ linux-2.6.19-rc2-mm1/kernel/sysctl.c 2006-10-23 14:49:11.493452509 -0500 @@ -1022,6 +1022,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one_hundred, }, + { + .ctl_name = VM_MIN_INTERLEAVE, + .procname = "min_interleave_ratio", + .data = &sysctl_min_interleave_ratio, + .maxlen = sizeof(sysctl_min_interleave_ratio), + .mode = 0644, + .proc_handler = &sysctl_min_interleave_ratio_sysctl_handler, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one_hundred, + }, #endif #ifdef CONFIG_X86_32 { Index: linux-2.6.19-rc2-mm1/mm/mempolicy.c =================================================================== --- 
linux-2.6.19-rc2-mm1.orig/mm/mempolicy.c 2006-10-23 14:45:03.382174589 -0500 +++ linux-2.6.19-rc2-mm1/mm/mempolicy.c 2006-10-23 15:32:03.971304687 -0500 @@ -1123,11 +1123,41 @@ static unsigned interleave_nodes(struct { unsigned nid, next; struct task_struct *me = current; + struct zone *z; nid = me->il_next; next = next_node(nid, policy->v.nodes); if (next >= MAX_NUMNODES) next = first_node(policy->v.nodes); + + /* + * If we are overallocating a zone then restart interleave at some + * other node. We should not loop here since this is a performance + * critical path. Just get another random zone and allocate from it. + */ + z = &NODE_DATA(nid)->node_zones[policy_zone]; + if (unlikely(z->free_pages < z->min_interleave_pages)) { + int w = nodes_weight(policy->v.nodes) - 1; + + if (w > 0) { + /* + * We want a random node in the allowed set while + * skipping the overallocated node. + * get_cycles() gives us a rapidly increasing + * cycle counter that we can use to get a pseudo + * random result without too much effort. + */ + int n = get_cycles() % w; + + nid = next; + next = first_node(policy->v.nodes); + while (n-- > 0 || next == nid) { + next = next_node(next, policy->v.nodes); + if (next >= MAX_NUMNODES) + next = first_node(policy->v.nodes); + } + } + } me->il_next = next; return nid; } Index: linux-2.6.19-rc2-mm1/mm/page_alloc.c =================================================================== --- linux-2.6.19-rc2-mm1.orig/mm/page_alloc.c 2006-10-23 14:45:03.392917617 -0500 +++ linux-2.6.19-rc2-mm1/mm/page_alloc.c 2006-10-23 14:49:11.531541446 -0500 @@ -2053,6 +2053,9 @@ static void setup_pagelist_highmark(stru #ifdef CONFIG_NUMA + +int sysctl_min_interleave_ratio = 10; + /* * Boot pageset table. One per cpu which is going to be used for all * zones and all nodes. 
The parameters will be set in such a way @@ -2648,6 +2651,7 @@ static void __meminit free_area_init_cor zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) / 100; zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; + zone->min_interleave_pages = (realsize * sysctl_min_interleave_ratio) / 100; #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock); @@ -3223,6 +3227,21 @@ int sysctl_min_slab_ratio_sysctl_handler sysctl_min_slab_ratio) / 100; return 0; } +int sysctl_min_interleave_ratio_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + int rc; + + rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (rc) + return rc; + + for_each_zone(zone) + zone->min_interleave_pages = (zone->present_pages * + sysctl_min_interleave_ratio) / 100; + return 0; +} #endif /*