---

 include/linux/gfp.h |    6 ---
 mm/page_alloc.c     |   65 +++++++++++++++++++++++++++++++++++++++-
 mm/slab.c           |   42 -------------------------
 mm/slub.c           |   84 ----------------------------------------------------
 4 files changed, 63 insertions(+), 134 deletions(-)

Index: slub/mm/page_alloc.c
===================================================================
--- slub.orig/mm/page_alloc.c	2007-04-28 19:30:01.000000000 -0700
+++ slub/mm/page_alloc.c	2007-04-28 19:37:30.000000000 -0700
@@ -933,8 +933,6 @@ static void __init setup_nr_node_ids(voi
 
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
@@ -971,6 +969,69 @@ void drain_node_pages(int nodeid)
 		}
 	}
 }
+
+static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(struct delayed_work, node_reap_work);
+static unsigned long drain_interval;
+
+static void reap_node(struct work_struct *w)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Only drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= nr_node_ids))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+
+	schedule_delayed_work(&__get_cpu_var(node_reap_work),
+				drain_interval);
+}
+
+static void __devinit start_cpu_timer(int cpu)
+{
+	struct delayed_work *node_reap_work = &per_cpu(node_reap_work, cpu);
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node >= nr_node_ids)
+		node = first_node(node_online_map);
+
+	per_cpu(reap_node, cpu) = node;
+	INIT_DELAYED_WORK(node_reap_work, reap_node);
+	schedule_delayed_work_on(cpu, node_reap_work, HZ);
+}
+
+static int __init setup_node_draining(void)
+{
+	int cpu;
+
+	/*
+	 * Determine the interval for draining remote nodes.
+	 * Per cpu pages for remote nodes should be flushed at an interval
+	 * of once per minute or so.
+	 *
+	 * The worst case here is a system with 1024 nodes, which will have
+	 * a huge number of remote node queues on each node. Then we get to
+	 * a calling interval of 60 * HZ / 1024 jiffies, roughly 60ms, which
+	 * sadly gets into the range of the timer interrupt frequency.
+	 *
+	 * A more typical 2 node system only needs to do this every 30 seconds.
+	 *
+	 * All the reaping can run concurrently since it is very unlikely
+	 * that all processors will be draining memory to the same node.
+	 */
+	drain_interval = (60 * HZ) / num_online_nodes();
+	for_each_online_cpu(cpu)
+		start_cpu_timer(cpu);
+	return 0;
+}
+__initcall(setup_node_draining);
 #endif
 
 static void __drain_pages(unsigned int cpu)
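For reference, the rotation step that reap_node() performs can be modelled outside the kernel. In the minimal user-space sketch below, the `online` bitmask and the first_set_bit()/next_set_bit() helpers are hypothetical stand-ins for node_online_map and first_node()/next_node(), not kernel APIs; it only demonstrates how each pass advances the cursor to the next online node and wraps around:

#include <stdio.h>

#define MAX_NODES 8			/* stands in for MAX_NUMNODES */

/* Hypothetical stand-in for first_node(): lowest set bit, or MAX_NODES. */
static int first_set_bit(unsigned int mask)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return MAX_NODES;
}

/* Hypothetical stand-in for next_node(): next set bit above prev. */
static int next_set_bit(unsigned int mask, int prev)
{
	for (int i = prev + 1; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return MAX_NODES;		/* exhausted, like MAX_NUMNODES */
}

int main(void)
{
	unsigned int online = 0x2d;	/* nodes 0, 2, 3 and 5 online */
	int node = first_set_bit(online);

	/* Ten passes print 0, 2, 3, 5, 0, 2, 3, 5, 0, 2: round robin. */
	for (int i = 0; i < 10; i++) {
		printf("drain node %d\n", node);
		node = next_set_bit(online, node);
		if (node >= MAX_NODES)	/* wrap around, as reap_node() does */
			node = first_set_bit(online);
	}
	return 0;
}

Since every CPU keeps its own reap_node cursor, the rotations run independently and rarely hit the same remote node at the same time.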
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c	2007-04-28 19:35:26.000000000 -0700
+++ slub/mm/slab.c	2007-04-28 19:39:19.000000000 -0700
@@ -902,46 +902,5 @@ __setup("noaliencache", noaliencache_set
 
-#ifdef CONFIG_NUMA
 /*
- * Special reaping functions for NUMA systems called from cache_reap().
- * These take care of doing round robin flushing of alien caches (containing
- * objects freed on different nodes from which they were allocated) and the
- * flushing of remote pcps by calling drain_node_pages.
- */
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	per_cpu(reap_node, cpu) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-/*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
  * via the workqueue/eventd.
  * Add the CPU number into the expiration time to minimize the possibility of
@@ -958,7 +917,6 @@ static void __devinit start_cpu_timer(in
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
 		INIT_DELAYED_WORK(reap_work, cache_reap);
 		schedule_delayed_work_on(cpu, reap_work,
 			__round_jiffies_relative(HZ, cpu));
Index: slub/include/linux/gfp.h
===================================================================
--- slub.orig/include/linux/gfp.h	2007-04-28 19:30:01.000000000 -0700
+++ slub/include/linux/gfp.h	2007-04-28 19:37:30.000000000 -0700
@@ -198,10 +198,4 @@ extern void FASTCALL(free_cold_page(stru
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
-
 #endif /* __LINUX_GFP_H */
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-04-28 19:36:12.000000000 -0700
+++ slub/mm/slub.c	2007-04-28 19:40:06.000000000 -0700
@@ -2464,90 +2464,6 @@ static struct notifier_block __cpuinitda
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/ - -static DEFINE_PER_CPU(unsigned long, reap_node); - -static void init_reap_node(int cpu) -{ - int node; - - node = next_node(cpu_to_node(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - __get_cpu_var(reap_node) = node; -} - -static void next_reap_node(void) -{ - int node = __get_cpu_var(reap_node); - - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; -} -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif - -#define REAPTIMEOUT_CPUC (2*HZ) - -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct delayed_work, reap_work); - -static void cache_reap(struct work_struct *unused) -{ - next_reap_node(); - schedule_delayed_work(&__get_cpu_var(reap_work), - REAPTIMEOUT_CPUC); -} - -static void __devinit start_cpu_timer(int cpu) -{ - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); - - /* - * When this gets called from do_initcalls via cpucache_init(), - * init_workqueues() has already run, so keventd will be setup - * at that time. - */ - if (keventd_up() && reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); - } -} - -static int __init cpucache_init(void) -{ - int cpu; - - /* - * Register the timers that drain pcp pages and update vm statistics - */ - for_each_online_cpu(cpu) - start_cpu_timer(cpu); - return 0; -} -__initcall(cpucache_init); -#endif - #ifdef SLUB_RESILIENCY_TEST static unsigned long validate_slab_cache(struct kmem_cache *s);