From: Christoph Lameter

zone_reclaim: proc limit for the minimal amount of unmapped pagecache pages

Add /proc/sys/vm/min_unmapped to control the minimum percentage of
unmapped pagecache pages.  Zone reclaim will only be triggered if more
than this percentage of a zone's pages are unmapped.

And remove some outdated comments.

Signed-off-by: Christoph Lameter
Signed-off-by: Andrew Morton
---

 Documentation/sysctl/vm.txt |   12 ++++++++++++
 include/linux/mmzone.h      |    6 ++++++
 include/linux/swap.h        |    1 +
 include/linux/sysctl.h      |    2 +-
 kernel/sysctl.c             |   11 +++++++++++
 mm/page_alloc.c             |   22 ++++++++++++++++++++++
 mm/vmscan.c                 |   21 +++++++++------------
 7 files changed, 62 insertions(+), 13 deletions(-)

diff -puN Documentation/sysctl/vm.txt~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable Documentation/sysctl/vm.txt
--- a/Documentation/sysctl/vm.txt~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/Documentation/sysctl/vm.txt
@@ -28,6 +28,7 @@ Currently, these files are in /proc/sys/
 - block_dump
 - drop-caches
 - zone_reclaim_mode
+- min_unmapped
 - panic_on_oom
 
 ==============================================================
@@ -168,6 +169,17 @@ in all nodes of the system.
 
 =============================================================
 
+min_unmapped:
+
+A percentage of the file backed pages in each zone.  Zone reclaim will only
+occur if more than this percentage of pages are file backed and unmapped.
+This is to ensure that a minimal amount of local pages is still available
+for file I/O even if the node is overallocated.
+
+The default is 1 percent.
+
+=============================================================
+
 panic_on_oom
 
 This enables or disables panic on out-of-memory feature.  If this is set to 1,
diff -puN include/linux/mmzone.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable include/linux/mmzone.h
--- a/include/linux/mmzone.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/include/linux/mmzone.h
@@ -150,6 +150,10 @@ struct zone {
 	unsigned long		lowmem_reserve[MAX_NR_ZONES];
 
 #ifdef CONFIG_NUMA
+	/*
+	 * zone reclaim becomes active if more unmapped pages exist.
+	 */
+	unsigned long		min_unmapped;
 	struct per_cpu_pageset	*pageset[NR_CPUS];
 #else
 	struct per_cpu_pageset	pageset[NR_CPUS];
@@ -414,6 +418,8 @@ int lowmem_reserve_ratio_sysctl_handler(
 			void __user *, size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
 			void __user *, size_t *, loff_t *);
+int sysctl_min_unmapped_sysctl_handler(struct ctl_table *, int,
+			struct file *, void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff -puN include/linux/swap.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable include/linux/swap.h
--- a/include/linux/swap.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/include/linux/swap.h
@@ -189,6 +189,7 @@ extern long vm_total_pages;
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
+extern int sysctl_min_unmapped;
 extern int zone_reclaim(struct zone *, gfp_t, unsigned int);
 #else
 #define zone_reclaim_mode 0
diff -puN include/linux/sysctl.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable include/linux/sysctl.h
--- a/include/linux/sysctl.h~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/include/linux/sysctl.h
@@ -188,7 +188,7 @@ enum
 	VM_DROP_PAGECACHE=29,	/* int: nuke lots of pagecache */
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31, /* reclaim local zone memory before going off node */
-	VM_ZONE_RECLAIM_INTERVAL=32, /* time period to wait after reclaim failure */
+	VM_MIN_UNMAPPED=32,	/* Set min percent of unmapped pages */
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 };
diff -puN kernel/sysctl.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable kernel/sysctl.c
--- a/kernel/sysctl.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/kernel/sysctl.c
@@ -932,6 +932,17 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
+	{
+		.ctl_name	= VM_MIN_UNMAPPED,
+		.procname	= "min_unmapped",
+		.data		= &sysctl_min_unmapped,
+		.maxlen		= sizeof(sysctl_min_unmapped),
+		.mode		= 0644,
+		.proc_handler	= &sysctl_min_unmapped_sysctl_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_X86_32
 	{
diff -puN mm/page_alloc.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable mm/page_alloc.c
--- a/mm/page_alloc.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/mm/page_alloc.c
@@ -2005,6 +2005,10 @@ static void __meminit free_area_init_cor
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#ifdef CONFIG_NUMA
+		zone->min_unmapped = (realsize * sysctl_min_unmapped)
+						/ 100;
+#endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
 		spin_lock_init(&zone->lru_lock);
@@ -2298,6 +2302,24 @@ int min_free_kbytes_sysctl_handler(ctl_t
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+int sysctl_min_unmapped_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	for_each_zone(zone)
+		zone->min_unmapped = (zone->present_pages *
+				sysctl_min_unmapped) / 100;
+	return 0;
+}
+#endif
+
 /*
  * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
  *	proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
diff -puN mm/vmscan.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable mm/vmscan.c
--- a/mm/vmscan.c~zvc-zone_reclaim-leave-1%-of-unmapped-pagecache-pages-for-file-i-o-tunable
+++ a/mm/vmscan.c
@@ -1503,10 +1503,6 @@ module_init(kswapd_init)
  *
  * If non-zero call zone_reclaim when the number of free pages falls below
  * the watermarks.
- *
- * In the future we may add flags to the mode.  However, the page allocator
- * should only have to check that zone_reclaim_mode != 0 before calling
- * zone_reclaim().
  */
int zone_reclaim_mode __read_mostly;
 
@@ -1524,6 +1520,12 @@ int zone_reclaim_mode __read_mostly;
 #define ZONE_RECLAIM_PRIORITY 4
 
 /*
+ * Percentage of pages in a zone that must be unmapped
+ * for zone_reclaim to occur.
+ */
+int sysctl_min_unmapped = 1;
+
+/*
  * Try to free up some pages from this zone through reclaim.
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -1595,16 +1597,11 @@ int zone_reclaim(struct zone *zone, gfp_
	 * A small portion of unmapped file backed pages is needed for
	 * file I/O otherwise pages read by file I/O will be immediately
	 * thrown out if the zone is overallocated.  So we do not reclaim
-	 * if less than 1% of the zone is used by unmapped file backed pages.
-	 *
-	 * The division by 128 approximates this and is here because a division
-	 * would be too expensive in this hot code path.
-	 *
-	 * Is it be useful to have a way to set the limit via /proc?
+	 * if less than a specified percentage of the zone is used by
+	 * unmapped file backed pages.
	 */
	if (zone_page_state(zone, NR_FILE_PAGES) -
-	    zone_page_state(zone, NR_FILE_MAPPED) <
-		zone->present_pages / 128)
+	    zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped)
		return 0;
 
	/*
_
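
For illustration, here is a minimal userspace sketch (not part of the patch)
that reads the tunable and shows how many pages it reserves in a zone of a
given size.  It assumes only the /proc/sys/vm/min_unmapped file added above;
the 1GB zone size is a made-up example value.

#include <stdio.h>

int main(void)
{
	/* Assumed example zone: 1GB of 4KB pages. */
	unsigned long present_pages = 262144;
	int percent;
	FILE *f;

	f = fopen("/proc/sys/vm/min_unmapped", "r");
	if (!f) {
		perror("/proc/sys/vm/min_unmapped");
		return 1;
	}
	if (fscanf(f, "%d", &percent) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	/*
	 * Mirrors the kernel's per-zone computation above:
	 * zone->min_unmapped = (zone->present_pages * sysctl_min_unmapped) / 100;
	 */
	printf("min_unmapped=%d%%: %lu of %lu pages must stay unmapped pagecache\n",
	       percent, present_pages * percent / 100, present_pages);
	return 0;
}

Writing a new percentage (e.g. echo 5 > /proc/sys/vm/min_unmapped) takes
effect immediately, since sysctl_min_unmapped_sysctl_handler() recomputes
zone->min_unmapped for every zone.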