Index: linux-2.6.19-rc5-mm2/mm/vmscan.c
===================================================================
--- linux-2.6.19-rc5-mm2.orig/mm/vmscan.c	2006-11-15 16:48:13.428048131 -0600
+++ linux-2.6.19-rc5-mm2/mm/vmscan.c	2006-11-16 13:53:12.995977130 -0600
@@ -474,7 +474,11 @@ static unsigned long shrink_page_list(st
 		sc->nr_scanned++;
 
-		if (!sc->may_swap && page_mapped(page))
+		/*
+		 * Avoid expensive reference checks if we can neither swap
+		 * nor write out a page.
+		 */
+		if (!sc->may_swap && !sc->may_writepage && page_mapped(page))
 			goto keep_locked;
 
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
@@ -1590,7 +1594,6 @@ int zone_reclaim_mode __read_mostly;
 
 #define RECLAIM_OFF 0
 #define RECLAIM_ZONE (1<<0)	/* Run shrink_cache on the zone */
-#define RECLAIM_WRITE (1<<1)	/* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)	/* Swap pages out during reclaim */
 
 /*
@@ -1613,6 +1616,12 @@ int sysctl_min_unmapped_ratio = 1;
 int sysctl_min_slab_ratio = 5;
 
 /*
+ * If the number of dirty pages in a zone grows beyond this percentage then
+ * reclaim needs to do synchronous writes.
+ */
+int sysctl_zone_dirty_ratio = 60;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1624,7 +1633,7 @@ static int __zone_reclaim(struct zone *z
 	int priority;
 	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
-		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_writepage = 0,
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.swap_cluster_max = max_t(unsigned long, nr_pages,
 					SWAP_CLUSTER_MAX),
@@ -1633,6 +1642,12 @@ static int __zone_reclaim(struct zone *z
 	};
 	unsigned long slab_reclaimable;
 
+	if (zone_page_state(zone, NR_FILE_DIRTY) +
+		zone_page_state(zone, NR_UNSTABLE_NFS) >=
+		zone->dirty_limit)
+		/* Too many dirty pages. We need to write out pages synchronously */
+		sc.may_writepage = 1;
+
 	disable_swap_token();
 	cond_resched();
 	/*
@@ -1644,9 +1659,10 @@ static int __zone_reclaim(struct zone *z
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	if (zone_page_state(zone, NR_FILE_PAGES) -
+	if (sc.may_writepage ||
+		zone_page_state(zone, NR_FILE_PAGES) -
 		zone_page_state(zone, NR_FILE_MAPPED) >
-		zone->min_unmapped_pages) {
+			zone->min_unmapped_pages) {
 		/*
 		 * Free memory by calling shrink zone with increasing
 		 * priorities until we have enough memory freed.
@@ -1705,10 +1721,13 @@ int zone_reclaim(struct zone *zone, gfp_
 	 * unmapped file backed pages.
 	 */
 	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
-	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
-			<= zone->min_slab_pages)
-		return 0;
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+	    && zone_page_state(zone, NR_SLAB_RECLAIMABLE) <=
+			zone->min_slab_pages
+	    && zone_page_state(zone, NR_FILE_DIRTY) +
+			zone_page_state(zone, NR_UNSTABLE_NFS) <
+			zone->dirty_limit)
+		return 0;
 
 	/*
 	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
Index: linux-2.6.19-rc5-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.19-rc5-mm2.orig/include/linux/mmzone.h	2006-11-15 18:48:00.586652379 -0600
+++ linux-2.6.19-rc5-mm2/include/linux/mmzone.h	2006-11-16 13:51:30.888693383 -0600
@@ -192,6 +192,7 @@ struct zone {
 	 */
 	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
+	unsigned long		dirty_limit;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -566,6 +567,8 @@ int sysctl_min_unmapped_ratio_sysctl_han
 			struct file *, void __user *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 			struct file *, void __user *, size_t *, loff_t *);
+int sysctl_zone_dirty_ratio_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6.19-rc5-mm2/include/linux/swap.h
===================================================================
--- linux-2.6.19-rc5-mm2.orig/include/linux/swap.h	2006-11-15 16:48:10.143607675 -0600
+++ linux-2.6.19-rc5-mm2/include/linux/swap.h	2006-11-16 13:49:58.880900780 -0600
@@ -197,6 +197,7 @@ extern long vm_total_pages;
 extern int zone_reclaim_mode;
 extern int sysctl_min_unmapped_ratio;
 extern int sysctl_min_slab_ratio;
+extern int sysctl_zone_dirty_ratio;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0
Index: linux-2.6.19-rc5-mm2/kernel/sysctl.c
===================================================================
--- linux-2.6.19-rc5-mm2.orig/kernel/sysctl.c	2006-11-15 18:48:00.566142209 -0600
+++ linux-2.6.19-rc5-mm2/kernel/sysctl.c	2006-11-16 13:49:59.005911366 -0600
@@ -1028,6 +1028,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_ZONE_DIRTY,
+		.procname	= "zone_dirty_ratio",
+		.data		= &sysctl_zone_dirty_ratio,
+		.maxlen		= sizeof(sysctl_zone_dirty_ratio),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_zone_dirty_ratio_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_X86_32
 	{
Index: linux-2.6.19-rc5-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.19-rc5-mm2.orig/mm/page_alloc.c	2006-11-15 18:48:00.545632040 -0600
+++ linux-2.6.19-rc5-mm2/mm/page_alloc.c	2006-11-16 13:51:53.177690908 -0600
@@ -2975,6 +2975,7 @@ static void __meminit free_area_init_cor
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
 						/ 100;
 		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->dirty_limit = (realsize * sysctl_zone_dirty_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -3565,6 +3566,21 @@ int sysctl_min_slab_ratio_sysctl_handler
 					sysctl_min_slab_ratio) / 100;
 	return 0;
 }
+int sysctl_zone_dirty_ratio_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->dirty_limit = (zone->present_pages *
+			sysctl_zone_dirty_ratio) / 100;
+	return 0;
+}
 #endif
 
 #if CONFIG_MULTI_ZONE
Index: linux-2.6.19-rc5-mm2/include/linux/sysctl.h
===================================================================
--- linux-2.6.19-rc5-mm2.orig/include/linux/sysctl.h	2006-11-15 16:48:10.000000000 -0600
+++ linux-2.6.19-rc5-mm2/include/linux/sysctl.h	2006-11-16 13:55:02.332392345 -0600
@@ -207,6 +207,7 @@ enum
 	VM_SWAP_PREFETCH=36,	/* swap prefetch */
 	VM_READAHEAD_RATIO=37,	/* percent of read-ahead size to thrashing-threshold */
 	VM_READAHEAD_HIT_RATE=38,	/* one accessed page legitimizes so many read-ahead pages */
+	VM_ZONE_DIRTY=39,	/* Dirty limit for a zone */
 };
 
Index: linux-2.6.19-rc5-mm2/Documentation/sysctl/vm.txt
===================================================================
--- linux-2.6.19-rc5-mm2.orig/Documentation/sysctl/vm.txt	2006-11-15 16:47:28.000000000 -0600
+++ linux-2.6.19-rc5-mm2/Documentation/sysctl/vm.txt	2006-11-16 13:57:48.664855651 -0600
@@ -140,7 +140,6 @@ in the system.
 This is value ORed together of
 
 1	= Zone reclaim on
-2	= Zone reclaim writes dirty pages out
 4	= Zone reclaim swaps pages
 
 zone_reclaim_mode is set during bootup to 1 if it is determined that pages
@@ -153,14 +152,6 @@ used for a file server and all of memory
 from disk. In that case the caching effect is more important than
 data locality.
 
-Allowing zone reclaim to write out pages stops processes that are
-writing large amounts of data from dirtying pages on other nodes. Zone
-reclaim will write out dirty pages if a zone fills up and so effectively
-throttle the process. This may decrease the performance of a single process
-since it cannot use all of system memory to buffer the outgoing writes
-anymore but it preserve the memory on other nodes so that the performance
-of other processes running on other nodes will not be affected.
-
 Allowing regular swap effectively restricts allocations to the local node
 unless explicitly overridden by memory policies or cpuset configurations.
 
@@ -198,6 +189,22 @@ and may not be fast.
 
 =============================================================
 
+zone_dirty_ratio:
+
+This is available only on NUMA kernels.
+
+A percentage of the total pages in each zone. When zone reclaim runs
+(i.e. an allocation has to fall back from the local zone) the number of
+dirty pages in the zone is checked. If it exceeds this percentage then
+writeout is performed while reclaiming pages.
+
+The default is 60 percent.
+
+Note that writeout can throttle any process allocating
+on the node.
+
+=============================================================
+
 panic_on_oom
 
 This enables or disables panic on out-of-memory feature.  If this is set to 1,
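
For reference while reviewing: the arithmetic behind the new knob is that
free_area_init_core() (and the sysctl handler, on writes) caches
zone->dirty_limit = present_pages * sysctl_zone_dirty_ratio / 100, and zone
reclaim then compares NR_FILE_DIRTY + NR_UNSTABLE_NFS against that cached
limit. A minimal standalone C sketch of the same calculation, using made-up
sample numbers; struct zone_sample and its fields are illustrative stand-ins
for the per-zone state, not the kernel's structures:

#include <stdio.h>

/* Illustrative stand-in for the per-zone state the patch consults. */
struct zone_sample {
	unsigned long present_pages;	/* pages managed by the zone */
	unsigned long nr_file_dirty;	/* NR_FILE_DIRTY counter */
	unsigned long nr_unstable_nfs;	/* NR_UNSTABLE_NFS counter */
};

/* Mirrors zone->dirty_limit = (realsize * sysctl_zone_dirty_ratio) / 100 */
static unsigned long dirty_limit(const struct zone_sample *z, int ratio)
{
	return (z->present_pages * ratio) / 100;
}

/* Mirrors the check that sets sc.may_writepage in __zone_reclaim() */
static int needs_sync_writeout(const struct zone_sample *z, int ratio)
{
	return z->nr_file_dirty + z->nr_unstable_nfs >= dirty_limit(z, ratio);
}

int main(void)
{
	/* A 1GB zone with 4KB pages and the default ratio of 60. */
	struct zone_sample z = { 262144, 170000, 0 };

	printf("dirty_limit = %lu pages\n", dirty_limit(&z, 60));
	printf("synchronous writeout: %s\n",
	       needs_sync_writeout(&z, 60) ? "yes" : "no");
	return 0;
}

With these numbers the limit works out to 157286 pages, so the 170000 dirty
pages would flip zone reclaim into synchronous writeout.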
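
Since the limit is runtime-tunable, the handler can be exercised from
userspace by writing a new ratio to /proc/sys/vm/zone_dirty_ratio (the path
follows from the procname registered above; this assumes a NUMA kernel with
the patch applied):

#include <stdio.h>

int main(void)
{
	/*
	 * Lower the per-zone dirty threshold to 40 percent; the sysctl
	 * handler then recomputes zone->dirty_limit for every zone.
	 */
	FILE *f = fopen("/proc/sys/vm/zone_dirty_ratio", "w");

	if (!f) {
		perror("zone_dirty_ratio");
		return 1;
	}
	fprintf(f, "40\n");
	return fclose(f) ? 1 : 0;
}

Values outside 0..100 are rejected by proc_dointvec_minmax() through the
zero/one_hundred bounds in the table entry.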