Index: linux-2.6.16-rc1-mm2/include/linux/swap.h =================================================================== --- linux-2.6.16-rc1-mm2.orig/include/linux/swap.h 2006-01-20 08:35:10.000000000 -0800 +++ linux-2.6.16-rc1-mm2/include/linux/swap.h 2006-01-23 10:08:29.000000000 -0800 @@ -178,6 +178,7 @@ extern int vm_swappiness; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 Index: linux-2.6.16-rc1-mm2/mm/vmscan.c =================================================================== --- linux-2.6.16-rc1-mm2.orig/mm/vmscan.c 2006-01-23 10:02:23.000000000 -0800 +++ linux-2.6.16-rc1-mm2/mm/vmscan.c 2006-01-23 10:09:27.000000000 -0800 @@ -1827,10 +1827,15 @@ module_init(kswapd_init) */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ + /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; /* * Priority for ZONE_RECLAIM. 
This determines the fraction of pages @@ -1850,7 +1855,7 @@ int zone_reclaim(struct zone *zone, gfp_ struct scan_control sc; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || @@ -1860,8 +1865,8 @@ int zone_reclaim(struct zone *zone, gfp_ atomic_read(&zone->reclaim_in_progress) > 0) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = ZONE_RECLAIM_PRIORITY + 1; Index: linux-2.6.16-rc1-mm2/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.16-rc1-mm2.orig/Documentation/sysctl/vm.txt 2006-01-20 08:35:07.000000000 -0800 +++ linux-2.6.16-rc1-mm2/Documentation/sysctl/vm.txt 2006-01-23 10:08:29.000000000 -0800 @@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/ - block_dump - drop-caches - zone_reclaim_mode +- zone_reclaim_interval ============================================================== @@ -126,8 +127,18 @@ the high water marks for each per cpu pa zone_reclaim_mode: -This is set during bootup to 1 if it is determined that pages from -remote zones will cause a significant performance reduction. The +This allows setting more or less aggressive forms of reclaiming memory +when a zone runs out of memory. If it is set to zero then no +zone reclaim occurs. + +This is an ORed together value of + +1 = Zone reclaim on +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages + +zone_reclaim_mode is set during bootup to 1 if it is determined that pages +from remote zones will cause a significant performance reduction. The page allocator will then reclaim easily reusable pages (those page cache pages that are currently not used) before going off node. 
@@ -135,6 +146,27 @@ The user can override this setting. It m off zone reclaim if the system is used for a file server and all of memory should be used for caching files from disk. -It may be beneficial to switch this on if one wants to do zone -reclaim regardless of the numa distances in the system. +Allowing zone reclaim to write out pages stops a process doing huge +amounts of writes from dirtying pages on other nodes. Instead the +process will be throttled when zone reclaim begins to write out +the locally dirtied pages. This may decrease the performance +of a single process since it cannot use all of system memory to buffer +the outgoing writes anymore, but it will protect processes on other +nodes from the writeout-happy process. + +Allowing regular swap effectively restricts allocations to the local +node unless explicitly overridden by memory policy. + +================================================================ + +zone_reclaim_interval: + +The time allowed for off node allocations after zone reclaim +has failed to reclaim enough pages to allow a local allocation. + +This is 30 seconds by default. Time is set in ticks. +For example, if HZ = 250 then 30 seconds is 30*250 = 7500. + +If undesired off node allocations occur then this interval should be +reduced. However, too frequent scans will degrade performance. 
Index: linux-2.6.16-rc1-mm2/kernel/sysctl.c =================================================================== --- linux-2.6.16-rc1-mm2.orig/kernel/sysctl.c 2006-01-20 16:30:06.000000000 -0800 +++ linux-2.6.16-rc1-mm2/kernel/sysctl.c 2006-01-23 10:08:29.000000000 -0800 @@ -889,6 +889,16 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + .extra1 = &one_hundred, + }, #endif { .ctl_name = 0 } }; Index: linux-2.6.16-rc1-mm2/include/linux/sysctl.h =================================================================== --- linux-2.6.16-rc1-mm2.orig/include/linux/sysctl.h 2006-01-20 08:35:10.000000000 -0800 +++ linux-2.6.16-rc1-mm2/include/linux/sysctl.h 2006-01-23 10:08:29.000000000 -0800 @@ -184,6 +184,7 @@ enum VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31,/* reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32,/* time period to wait after reclaim failure */ };