Index: linux-2.6.16-rc1-mm1/include/linux/swap.h =================================================================== --- linux-2.6.16-rc1-mm1.orig/include/linux/swap.h 2006-01-18 13:41:51.000000000 -0800 +++ linux-2.6.16-rc1-mm1/include/linux/swap.h 2006-01-19 17:52:27.000000000 -0800 @@ -178,6 +178,7 @@ extern int vm_swappiness; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; +extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 Index: linux-2.6.16-rc1-mm1/mm/vmscan.c =================================================================== --- linux-2.6.16-rc1-mm1.orig/mm/vmscan.c 2006-01-19 17:52:20.000000000 -0800 +++ linux-2.6.16-rc1-mm1/mm/vmscan.c 2006-01-19 18:05:14.000000000 -0800 @@ -1816,17 +1816,20 @@ module_init(kswapd_init) * * If non-zero call zone_reclaim when the number of free pages falls below * the watermarks. - * - * In the future we may add flags to the mode. However, the page allocator - * should only have to check that zone_reclaim_mode != 0 before calling - * zone_reclaim(). */ int zone_reclaim_mode __read_mostly; +#define RECLAIM_OFF 0 +#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */ +#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ +#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ + /* * Mininum time between zone reclaim scans */ -#define ZONE_RECLAIM_INTERVAL 30*HZ +int zone_reclaim_interval __read_mostly = 30*HZ; + /* * Try to free up some pages from this zone through reclaim. 
*/ @@ -1838,7 +1841,7 @@ int zone_reclaim(struct zone *zone, gfp_ struct scan_control sc; if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + ZONE_RECLAIM_INTERVAL)) + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) return 0; if (!(gfp_mask & __GFP_WAIT) || @@ -1848,8 +1851,8 @@ int zone_reclaim(struct zone *zone, gfp_ atomic_read(&zone->reclaim_in_progress) > 0) return 0; - sc.may_writepage = 0; - sc.may_swap = 0; + sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); + sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); sc.nr_scanned = 0; sc.nr_reclaimed = 0; sc.priority = 0; @@ -1868,7 +1871,15 @@ int zone_reclaim(struct zone *zone, gfp_ p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - shrink_zone(zone, &sc); + + if (zone_reclaim_mode & (RECLAIM_ZONE|RECLAIM_SWAP|RECLAIM_WRITE)) + shrink_zone(zone, &sc); + + if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (shrink_slab(sc.nr_scanned, gfp_mask, order) && + zone_watermark_ok(z, order, mark,idx, flags)) + sc.nr_reclaimed = nr_pages + 1; + } p->reclaim_state = NULL; current->flags &= ~PF_MEMALLOC; Index: linux-2.6.16-rc1-mm1/Documentation/sysctl/vm.txt =================================================================== --- linux-2.6.16-rc1-mm1.orig/Documentation/sysctl/vm.txt 2006-01-18 12:05:38.000000000 -0800 +++ linux-2.6.16-rc1-mm1/Documentation/sysctl/vm.txt 2006-01-19 17:54:33.000000000 -0800 @@ -126,6 +126,16 @@ the high water marks for each per cpu pa zone_reclaim_mode: +This allows setting more or less aggressive forms of reclaiming memory +when a zone runs out of memory. + +This value is the bitwise OR of the following flags: + +1 = Zone reclaim without swapout +2 = Zone reclaim writes dirty pages out +4 = Zone reclaim swaps pages +8 = Slab reclaim when zone is out of memory + This is set during bootup to 1 if it is determined that pages from remote zones will cause a significant performance reduction. 
The page allocator will then reclaim easily reusable pages (those page @@ -135,6 +145,15 @@ The user can override this setting. It m off zone reclaim if the system is used for a file server and all of memory should be used for caching files from disk. -It may be beneficial to switch this on if one wants to do zone -reclaim regardless of the numa distances in the system. +It may be advisable to set Slab reclaim if the system makes heavy +use of files and builds up large slab caches and no longer has +sufficient local memory available. Note that the slab shrink is global +and may free slab entries on other nodes. + +================================================================ + +zone_reclaim_interval: +The time allowed for off node allocations after zone reclaim +has failed. This is 30 seconds by default. Time is set in ticks. +For example, with HZ=1024, 30 seconds are 30*1024 = 30720 ticks. Index: linux-2.6.16-rc1-mm1/kernel/sysctl.c =================================================================== --- linux-2.6.16-rc1-mm1.orig/kernel/sysctl.c 2006-01-18 13:42:00.000000000 -0800 +++ linux-2.6.16-rc1-mm1/kernel/sysctl.c 2006-01-19 17:52:27.000000000 -0800 @@ -888,6 +888,15 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, .strategy = &zero, }, + { + .ctl_name = VM_ZONE_RECLAIM_INTERVAL, + .procname = "zone_reclaim_interval", + .data = &zone_reclaim_interval, + .maxlen = sizeof(zone_reclaim_interval), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &zero, + }, #endif { .ctl_name = 0 } }; Index: linux-2.6.16-rc1-mm1/include/linux/sysctl.h =================================================================== --- linux-2.6.16-rc1-mm1.orig/include/linux/sysctl.h 2006-01-18 13:42:00.000000000 -0800 +++ linux-2.6.16-rc1-mm1/include/linux/sysctl.h 2006-01-19 17:52:27.000000000 -0800 @@ -184,6 +184,7 @@ enum VM_DROP_PAGECACHE=29, /* int: nuke lots of pagecache */ VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */ VM_ZONE_RECLAIM_MODE=31,/* 
reclaim local zone memory before going off node */ + VM_ZONE_RECLAIM_INTERVAL=32,/* time period to wait after reclaim failure */ };