Subject: [PATCH] mm: vmstat: Actively update vmstat counters in low memory situations
From: Mel Gorman

VM stat counters are kept per-cpu and drained either when
sysctl_stat_interval expires (once per second by default) or when a
per-cpu threshold is exceeded. The threshold can be quite high on large
systems, e.g. 70 on a 16 CPU, 8GB system.

NR_FREE_PAGES is the vmstat counter the page allocator depends on for
its watermark checks. The problem is that the counter can be out of
sync with the actual buddy lists, allowing the watermarks to be
breached. This is particularly problematic on large systems, where
there have been reports of page allocation failures with the "free"
counter significantly higher than the pages actually present on the
per-cpu and buddy lists.

Keeping the counters fully in sync would be very expensive. Instead,
this patch has kswapd actively update the vmstat counters while it is
awake, so the counters can still drift but only within an acceptable
margin. The counters are also refreshed before the show_mem() dumps in
the OOM killer and page allocation failure paths so that the reported
values reflect reality.

Signed-off-by: Mel Gorman
Signed-off-by: Andrea Arcangeli
---
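[Reviewer note, not part of the patch: the drift described above comes from
per-cpu deltas only being folded into the global counter once they cross the
per-cpu stat_threshold; the real logic lives in __mod_zone_page_state() in
mm/vmstat.c. Below is a simplified, stand-alone user-space model of that
accumulation; the names are made up and the threshold/CPU figures are the
ones quoted in the changelog. With those figures, the allocator's view of
NR_FREE_PAGES can be roughly 70 * 16 = 1120 pages (~4.4MB with 4KiB pages)
higher than the real number of free pages, which is enough to breach the
min watermark on a loaded system.]

#include <stdio.h>

#define NR_CPUS		16	/* example CPU count from the changelog */
#define THRESHOLD	70	/* example stat_threshold from the changelog */

static long global_nr_free;		/* models zone->vm_stat[NR_FREE_PAGES] */
static int vm_stat_diff[NR_CPUS];	/* models the per-cpu deltas */

/*
 * Simplified model of __mod_zone_page_state(): the per-cpu delta is only
 * folded into the global counter once it crosses the threshold.
 */
static void mod_free_pages(int cpu, int delta)
{
	int x = vm_stat_diff[cpu] + delta;

	if (x > THRESHOLD || x < -THRESHOLD) {
		global_nr_free += x;
		x = 0;
	}
	vm_stat_diff[cpu] = x;
}

int main(void)
{
	int cpu;

	/*
	 * Every CPU allocates exactly the threshold's worth of pages: none
	 * of the deltas are folded, so the global counter still reports the
	 * old value even though NR_CPUS * THRESHOLD pages are gone.
	 */
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		mod_free_pages(cpu, -THRESHOLD);

	printf("drift hidden from the allocator: %d pages\n",
	       NR_CPUS * THRESHOLD + (int)global_nr_free);
	return 0;
}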
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -231,7 +231,8 @@ extern void __inc_zone_state(struct zone
 extern void dec_zone_state(struct zone *, enum zone_stat_item);
 extern void __dec_zone_state(struct zone *, enum zone_stat_item);
 
-void refresh_cpu_vm_stats(int);
+void refresh_cpu_vm_stats(int, bool);
+void refresh_all_vm_stats(void);
 #else /* CONFIG_SMP */
 
 /*
@@ -277,6 +278,7 @@ static inline void __dec_zone_page_state
 #define mod_zone_page_state __mod_zone_page_state
 
-static inline void refresh_cpu_vm_stats(int cpu) { }
+static inline void refresh_cpu_vm_stats(int cpu, bool can_resched) { }
+static inline void refresh_all_vm_stats(void) { }
 #endif
 
 #endif /* _LINUX_VMSTAT_H */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -368,6 +368,7 @@ static void dump_header(struct task_stru
 	task_unlock(current);
 	dump_stack();
 	mem_cgroup_print_oom_info(mem, p);
+	refresh_all_vm_stats();
 	show_mem();
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(mem);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2100,6 +2100,7 @@ nopage:
 			" order:%d, mode:0x%x\n",
 			p->comm, order, gfp_mask);
 		dump_stack();
+		refresh_all_vm_stats();
 		show_mem();
 	}
 	return page;
@@ -4768,7 +4769,7 @@ static int page_alloc_cpu_notify(struct
 		 * This is only okay since the processor is dead and cannot
 		 * race with what we are doing.
 		 */
-		refresh_cpu_vm_stats(cpu);
+		refresh_cpu_vm_stats(cpu, true);
 	}
 	return NOTIFY_OK;
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2117,6 +2117,16 @@ loop_again:
 		}
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
+
+		/*
+		 * While kswapd is awake, actively keep the NR_FREE_PAGES
+		 * counters in sync with the buddy lists. On large systems
+		 * under load, the once-per-second update can allow the min
+		 * watermark to be breached because NR_FREE_PAGES is higher
+		 * than reality.
+		 */
+		refresh_all_vm_stats();
+
 		/*
 		 * OK, kswapd is getting into trouble. Take a nap, then take
 		 * another pass across the zones.
diff --git a/mm/vmstat.c b/mm/vmstat.c
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -295,7 +295,7 @@ EXPORT_SYMBOL(dec_zone_page_state);
  * with the global counters. These could cause remote node cache line
  * bouncing and will have to be only done when necessary.
  */
-void refresh_cpu_vm_stats(int cpu)
+void refresh_cpu_vm_stats(int cpu, bool can_resched)
 {
 	struct zone *zone;
 	int i;
@@ -322,7 +322,8 @@ void refresh_cpu_vm_stats(int cpu)
 				p->expire = 3;
 #endif
 			}
-		cond_resched();
+		if (can_resched)
+			cond_resched();
 #ifdef CONFIG_NUMA
 		/*
 		 * Deal with draining the remote pageset of this
@@ -975,18 +976,35 @@ static const struct file_operations proc
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 
-static void vmstat_update(struct work_struct *w)
+static void vmstat_update_periodic(struct work_struct *w)
 {
-	refresh_cpu_vm_stats(smp_processor_id());
+	refresh_cpu_vm_stats(smp_processor_id(), true);
 	schedule_delayed_work(&__get_cpu_var(vmstat_work),
 		round_jiffies_relative(sysctl_stat_interval));
 }
 
+static void vmstat_update_immediate(void *arg)
+{
+	refresh_cpu_vm_stats(smp_processor_id(), false);
+}
+
+void refresh_all_vm_stats(void)
+{
+	/*
+	 * If called from the page allocator in IRQ context, we cannot send
+	 * an IPI to refresh the stats.
+	 */
+	if (irqs_disabled())
+		return;
+
+	on_each_cpu(vmstat_update_immediate, NULL, 1);
+}
+
 static void __cpuinit start_cpu_timer(int cpu)
 {
 	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
 
-	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update);
+	INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update_periodic);
 	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
 }
 