From: Christoph Lameter The zone_reclaim_interval was necessary because we were not able to determine how many unmapped pages exist in a zone. Therefore we had to scan in intervals to figure out if any pages were unmapped. With the zoned counters and NR_ANON_PAGES we now know the number of pagecache pages and the number of mapped pages in a zone. So we can simply skip the reclaim if there is an insufficient number of unmapped pages. We use SWAP_CLUSTER_MAX as the boundary. Drop all support for /proc/sys/vm/zone_reclaim_interval. Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/sysctl/vm.txt | 13 ------------- include/linux/mmzone.h | 6 ------ include/linux/swap.h | 1 - kernel/sysctl.c | 9 --------- mm/vmscan.c | 31 ++++++++++--------------------- 5 files changed, 10 insertions(+), 50 deletions(-) diff -puN Documentation/sysctl/vm.txt~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval Documentation/sysctl/vm.txt --- a/Documentation/sysctl/vm.txt~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval +++ a/Documentation/sysctl/vm.txt @@ -28,7 +28,6 @@ Currently, these files are in /proc/sys/ - block_dump - drop-caches - zone_reclaim_mode -- zone_reclaim_interval - panic_on_oom ============================================================== @@ -167,18 +166,6 @@ use of files and builds up large slab ca shrink operation is global, may take a long time and free slabs in all nodes of the system. -================================================================ - -zone_reclaim_interval: - -The time allowed for off node allocations after zone reclaim -has failed to reclaim enough pages to allow a local allocation. - -Time is set in seconds and set by default to 30 seconds. - -Reduce the interval if undesired off node allocations occur. However, too -frequent scans will have a negative impact onoff node allocation performance. 
- ============================================================= panic_on_oom diff -puN include/linux/mmzone.h~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval include/linux/mmzone.h --- a/include/linux/mmzone.h~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval +++ a/include/linux/mmzone.h @@ -178,12 +178,6 @@ struct zone { /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - /* - * timestamp (in jiffies) of the last zone reclaim that did not - * result in freeing of pages. This is used to avoid repeated scans - * if all memory in the zone is in use. - */ - unsigned long last_unsuccessful_zone_reclaim; /* * prev_priority holds the scanning priority for this zone. It is diff -puN include/linux/swap.h~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval include/linux/swap.h --- a/include/linux/swap.h~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval +++ a/include/linux/swap.h @@ -189,7 +189,6 @@ extern long vm_total_pages; #ifdef CONFIG_NUMA extern int zone_reclaim_mode; -extern int zone_reclaim_interval; extern int zone_reclaim(struct zone *, gfp_t, unsigned int); #else #define zone_reclaim_mode 0 diff -puN kernel/sysctl.c~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval kernel/sysctl.c --- a/kernel/sysctl.c~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval +++ a/kernel/sysctl.c @@ -942,15 +942,6 @@ static ctl_table vm_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = VM_ZONE_RECLAIM_INTERVAL, - .procname = "zone_reclaim_interval", - .data = &zone_reclaim_interval, - .maxlen = sizeof(zone_reclaim_interval), - .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, - }, #endif { .ctl_name = 0 } }; diff -puN mm/vmscan.c~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval mm/vmscan.c --- 
a/mm/vmscan.c~zoned-vm-counters-zone_reclaim-remove-proc-sys-vm-zone_reclaim_interval +++ a/mm/vmscan.c @@ -1519,11 +1519,6 @@ int zone_reclaim_mode __read_mostly; #define RECLAIM_SLAB (1<<3) /* Do a global slab shrink if the zone is out of memory */ /* - * Mininum time between zone reclaim scans - */ -int zone_reclaim_interval __read_mostly = 30*HZ; - -/* * Priority for ZONE_RECLAIM. This determines the fraction of pages * of a node considered for each zone_reclaim. 4 scans 1/16th of * a zone. @@ -1588,16 +1583,6 @@ static int __zone_reclaim(struct zone *z p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - - if (nr_reclaimed == 0) { - /* - * We were unable to reclaim enough pages to stay on node. We - * now allow off node accesses for a certain time period before - * trying again to reclaim pages from the local zone. - */ - zone->last_unsuccessful_zone_reclaim = jiffies; - } - return nr_reclaimed >= nr_pages; } @@ -1607,13 +1592,17 @@ int zone_reclaim(struct zone *zone, gfp_ int node_id; /* - * Do not reclaim if there was a recent unsuccessful attempt at zone - * reclaim. In that case we let allocations go off node for the - * zone_reclaim_interval. Otherwise we would scan for each off-node - * page allocation. + * Do not reclaim if there are not enough reclaimable pages in this + * zone that would satisfy this allocation. + * + * All unmapped pagecache pages are reclaimable. + * + * Both counters may be temporarily off a bit so we use + * SWAP_CLUSTER_MAX as the boundary. It may also be good to + * leave a few frequently used unmapped pagecache pages around. */ - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + if (zone_page_state(zone, NR_FILE_PAGES) - + zone_page_state(zone, NR_FILE_MAPPED) < SWAP_CLUSTER_MAX) return 0; /* _