Index: linux-2.6.16-rc1-mm3/include/linux/swap.h =================================================================== --- linux-2.6.16-rc1-mm3.orig/include/linux/swap.h 2006-01-26 22:14:12.000000000 -0800 +++ linux-2.6.16-rc1-mm3/include/linux/swap.h 2006-01-26 22:14:12.000000000 -0800 @@ -178,6 +178,7 @@ extern int vm_swappiness; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 Index: linux-2.6.16-rc1-mm3/mm/vmscan.c =================================================================== --- linux-2.6.16-rc1-mm3.orig/mm/vmscan.c 2006-01-26 22:14:12.000000000 -0800 +++ linux-2.6.16-rc1-mm3/mm/vmscan.c 2006-01-26 22:14:12.000000000 -0800 @@ -1831,10 +1831,15 @@ module_init(kswapd_init) */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; /* * Priority for ZONE_RECLAIM. 
This determines the fraction of pages @@ -1856,7 +1861,7 @@ int zone_reclaim(struct zone *zone, gfp_ int node_id; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || @@ -1869,8 +1874,8 @@ int zone_reclaim(struct zone *zone, gfp_ if (!cpus_empty(mask) && node_id != numa_node_id()) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = ZONE_RECLAIM_PRIORITY + 1; Index: linux-2.6.16-rc1-mm3/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.16-rc1-mm3.orig/Documentation/sysctl/vm.txt 2006-01-26 22:14:09.000000000 -0800 +++ linux-2.6.16-rc1-mm3/Documentation/sysctl/vm.txt 2006-01-30 11:08:42.000000000 -0800 @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/ - block_dump - drop-caches - zone_reclaim_mode +- zone_reclaim_interval ============================================================== @@ -126,8 +127,18 @@ the high water marks for each per cpu pa zone_reclaim_mode: -This is set during bootup to 1 if it is determined that pages from -remote zones will cause a significant performance reduction. The +This allows setting more or less aggressive forms of reclaiming memory +when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. + +The value is a bitwise OR of + +1 = Zone reclaim on +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages + +zone_reclaim_mode is set during bootup to 1 if it is determined that pages +from remote zones will cause a significant performance reduction. The page allocator will then reclaim easily reusable pages (those page cache pages that are currently not used) before going off node. 
@@ -135,6 +146,27 @@ The user can override this setting. It m off zone reclaim if the system is used for a file server and all of memory should be used for caching files from disk. -It may be beneficial to switch this on if one wants to do zone -reclaim regardless of the numa distances in the system. +Allowing zone reclaim to write out pages stops processes that are +writing large amounts of data from dirtying pages on other nodes. Zone +reclaim will write out dirty pages if a zone fills up and so effectively +throttle the process. This may decrease the performance of a single process +since it cannot use all of system memory to buffer the outgoing writes +anymore but it preserves the memory on other nodes so that the performance +of other processes running on other nodes will not be affected. + +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policy. + +================================================================ + +zone_reclaim_interval: + +The time allowed for off node allocations after zone reclaim +has failed to reclaim enough pages to allow a local allocation. + +This is 30 seconds by default. Time is set in seconds. +For example, 30 seconds would be specified as 30. + +If undesired off node allocations occur then this interval should be +reduced. However, too frequent scans will degrade performance. 
Index: linux-2.6.16-rc1-mm3/kernel/sysctl.c =================================================================== --- linux-2.6.16-rc1-mm3.orig/kernel/sysctl.c 2006-01-26 22:14:12.000000000 -0800 +++ linux-2.6.16-rc1-mm3/kernel/sysctl.c 2006-01-30 11:08:56.000000000 -0800 @@ -889,6 +889,15 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, #endif { .ctl_name = 0 } }; Index: linux-2.6.16-rc1-mm3/include/linux/sysctl.h =================================================================== --- linux-2.6.16-rc1-mm3.orig/include/linux/sysctl.h 2006-01-26 22:14:12.000000000 -0800 +++ linux-2.6.16-rc1-mm3/include/linux/sysctl.h 2006-01-26 22:14:12.000000000 -0800 @@ -184,6 +184,7 @@ enum VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32,/* time period to wait after reclaim failure */ };