Index: linux-2.6.16-rc1-mm4/init/Kconfig
===================================================================
--- linux-2.6.16-rc1-mm4.orig/init/Kconfig	2006-01-30 11:27:37.000000000 -0800
+++ linux-2.6.16-rc1-mm4/init/Kconfig	2006-01-30 17:27:11.000000000 -0800
@@ -274,6 +274,19 @@ config KALLSYMS_ALL
 
 	  Say N.
 
+config SCRUBD
+	bool "Scrub Daemon (prezeroing of pages)"
+	depends on EXPERIMENTAL
+	help
+	  The scrub daemon manages a pool of zeroed pages. Pages of higher
+	  order are zeroed when the system is idle (configurable via
+	  /proc/sys/vm/scrub_limit).
+	  If the kernel later needs a zeroed page then a page may be
+	  obtained from these pools instead of hot-zeroing a page.
+	  Prezeroing will in particular speed up applications allocating
+	  large amounts of memory and will be effective for sparse
+	  matrices (this includes multi-level page tables).
+
 config KALLSYMS_EXTRA_PASS
 	bool "Do an extra kallsyms pass"
 	depends on KALLSYMS

Index: linux-2.6.16-rc1-mm4/Documentation/vm/scrubd.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc1-mm4/Documentation/vm/scrubd.txt	2006-01-30 17:27:11.000000000 -0800
@@ -0,0 +1,38 @@
+The SCRUB Daemon
+----------------
+
+The scrub daemon zeroes memory so that later requests for zeroed memory can
+be satisfied without having to zero memory in a hot code path. The operation
+of scrubd may be controlled through /proc/sys/vm/scrub_limit:
+
+/proc/sys/vm/scrub_limit	default value 300
+
+	This is a percentage multiplied by 10 (i.e. a fraction of 1024). If
+	fewer than this fraction of a zone's free pages are zeroed then the
+	scrub daemon is invoked.
+
+The amount of available zeroed memory may be seen in /proc/meminfo or in
+/proc/buddyinfo.
+
+Zeroing Drivers:
+----------------
+
+If hardware is available that can zero memory without the use of the cpu
+then a driver may be written that registers itself with
+register_zero_driver(). See include/linux/scrubd.h for details and
+arch/ia64/sn/kernel/bte.c for an example of a zeroing driver.
+
+Performance considerations:
+---------------------------
+
+If no zeroing hardware is available then zeroing may invalidate the cpu
+cache and may therefore cause a small performance loss, especially since
+scrubd may zero more pages than are actually needed.
+
+Scrubd is most effective for memory that is only sparsely accessed. Getting a
+prezeroed page for an application that then immediately overwrites all bytes
+in the page does not lead to any performance improvement. However, if the
+application only uses certain cachelines of the page immediately after a page
+fault then scrubd can be of tremendous benefit.
+
+Christoph Lameter, SGI, February 2006.
+
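Note: include/linux/scrubd.h, which declares register_zero_driver(), is not
part of this excerpt, so the registration interface cannot be shown exactly.
The sketch below only illustrates the general shape of a zeroing driver; the
zero_driver structure, its zero_pages callback and the hw_* names are
assumptions made for illustration, and arch/ia64/sn/kernel/bte.c remains the
authoritative example.

/*
 * Hypothetical zeroing-driver skeleton (illustration only).
 * The structure layout and callback name are assumed here, not taken
 * from include/linux/scrubd.h.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/scrubd.h>	/* declares register_zero_driver() */

/* Assumed callback: clear 2^order pages starting at @page using the
 * zeroing hardware instead of the cpu; return 0 on success. */
static int hw_zero_pages(struct page *page, unsigned int order)
{
	/* ... program the DMA/BTE engine and wait for completion ... */
	return 0;
}

/* Assumed registration structure; the field name is made up. */
static struct zero_driver hw_zero_driver = {
	.zero_pages	= hw_zero_pages,
};

static int __init hw_zero_init(void)
{
	register_zero_driver(&hw_zero_driver);
	return 0;
}
module_init(hw_zero_init);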
Index: linux-2.6.16-rc1-mm4/kernel/sysctl.c
===================================================================
--- linux-2.6.16-rc1-mm4.orig/kernel/sysctl.c	2006-01-30 11:31:18.000000000 -0800
+++ linux-2.6.16-rc1-mm4/kernel/sysctl.c	2006-01-30 17:27:11.000000000 -0800
@@ -153,6 +153,10 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+#ifdef CONFIG_SCRUBD
+extern int sysctl_scrub_limit;
+#endif
+
 /* /proc declarations: */
 
 #ifdef CONFIG_PROC_FS
@@ -898,6 +902,18 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+#ifdef CONFIG_SCRUBD
+	{
+		.ctl_name	= VM_SCRUB_LIMIT,
+		.procname	= "scrub_limit",
+		.data		= &sysctl_scrub_limit,
+		.maxlen		= sizeof(sysctl_scrub_limit),
+		.mode		= 0644,
+		.proc_handler	= &scrub_limit_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.16-rc1-mm4/include/linux/sysctl.h
===================================================================
--- linux-2.6.16-rc1-mm4.orig/include/linux/sysctl.h	2006-01-30 11:35:20.000000000 -0800
+++ linux-2.6.16-rc1-mm4/include/linux/sysctl.h	2006-01-30 17:27:11.000000000 -0800
@@ -185,6 +185,7 @@ enum
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31,	/* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32,	/* time period to wait after reclaim failure */
+	VM_SCRUB_LIMIT=33,		/* if zeroed memory falls below this limit, run kscrubd */
 };
 
 
Index: linux-2.6.16-rc1-mm4/include/linux/mmzone.h
===================================================================
--- linux-2.6.16-rc1-mm4.orig/include/linux/mmzone.h	2006-01-30 17:27:07.000000000 -0800
+++ linux-2.6.16-rc1-mm4/include/linux/mmzone.h	2006-01-30 17:27:11.000000000 -0800
@@ -324,6 +324,9 @@ typedef struct pglist_data {
 	wait_queue_head_t kswapd_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
+
+	wait_queue_head_t kscrubd_wait;
+	struct task_struct *kscrubd;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
@@ -466,6 +469,8 @@ int lowmem_reserve_ratio_sysctl_handler(
 			void __user *, size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
 			void __user *, size_t *, loff_t *);
+int scrub_limit_handler(struct ctl_table *, int, struct file *,
+			void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6.16-rc1-mm4/mm/page_alloc.c
===================================================================
--- linux-2.6.16-rc1-mm4.orig/mm/page_alloc.c	2006-01-30 17:27:07.000000000 -0800
+++ linux-2.6.16-rc1-mm4/mm/page_alloc.c	2006-01-30 17:37:24.000000000 -0800
@@ -69,6 +69,23 @@ int percpu_pagelist_fraction;
  */
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
+#ifdef CONFIG_NUMA
+/*
+ * The scrub limit is the percentage of the free pages that must
+ * be zeroed. If the zeroed pages fall below that limit then all free pages
+ * of the zone will be zeroed.
+ */
+int sysctl_scrub_limit __read_mostly = 300;	/* 300/1024 */
+
+/*
+ * Check if a zone has enough zeroed pages. The limit is specified as a
+ * percentage multiplied by 10 in order to allow a division by 1024
+ * (almost 1000) by shifting right by 10.
+ */
+#define ZONE_NEEDS_MORE_ZEROED_PAGES(z) \
+	((z)->zeroed_pages < (z)->free_pages * sysctl_scrub_limit >> 10)
+
+#endif
+
 EXPORT_SYMBOL(totalram_pages);
 
 /*
@@ -583,7 +600,38 @@ static int prep_new_page(struct page *pa
 	return 0;
 }
 
-/*
+
+#ifdef CONFIG_SCRUBD
+/*
+ * Special access for scrubd:
+ * Remove the first unzeroed page from the indicated free area.
+ */
+struct page *scrubd_rmpage(struct zone *zone, int order)
+{
+	struct page *page;
+	struct list_head *l;
+	struct free_area *area = zone->free_area + order;
+
+	spin_lock(&zone->lock);
+	list_for_each(l, &area->free_list) {
+		page = list_entry(l, struct page, lru);
+		if (!PageZeroed(page)) {
+			list_del(&page->lru);
+			rmv_page_order(page);
+			area->nr_free--;
+			zone->free_pages -= 1 << order;
+			goto out;
+		}
+	}
+	page = NULL;
+out:
+	spin_unlock(&zone->lock);
+	return page;
+}
+
+#endif
+
+/*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
  */
@@ -623,7 +671,7 @@ static int rmqueue_bulk(struct zone *zon
 			unsigned long count, struct list_head *list, int last)
 {
 	int i;
-	
+
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, last);
@@ -636,6 +684,13 @@ static int rmqueue_bulk(struct zone *zon
 }
 
 #ifdef CONFIG_NUMA
+void wakeup_kscrubd(struct zone *zone)
+{
+	if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+		return;
+	wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
 /* Called from the slab reaper to drain remote pagesets */
 void drain_remote_pages(void)
 {
@@ -647,9 +702,15 @@ void drain_remote_pages(void)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
+		/*
+		 * Do not drain local pagesets. Check zeroed pages
+		 * instead.
+		 */
+		if (zone->zone_pgdat->node_id == numa_node_id()) {
+			if (ZONE_NEEDS_MORE_ZEROED_PAGES(zone))
+				wakeup_kscrubd(zone);
 			continue;
+		}
 
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < NR_PER_CPU_PAGES; i++) {
@@ -2130,6 +2191,71 @@ void __init setup_per_cpu_pageset(void)
 	register_cpu_notifier(&pageset_notifier);
 }
 
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+	int order;
+	struct page *page;
+	struct zone *zone;
+
+	for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+		if (!ZONE_NEEDS_MORE_ZEROED_PAGES(zone))
+			continue;
+
+		for (order = MAX_ORDER - 1; order > 0; order--) {
+			while ((page = scrubd_rmpage(zone, order))) {
+				prep_zero_page(page, order, GFP_KERNEL);
+				__SetPageZeroed(page);
+				zone->zeroed_pages += 1 << order;
+				__free_one_page(page, zone, order);
+			}
+		}
+	}
+}
+
+/*
+ * The background scrub daemon.
+ */
+static int kscrubd(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	cpumask_t cpumask;
+
+	daemonize("kscrubd%d", pgdat->node_id);
+	cpumask = node_to_cpumask(pgdat->node_id);
+	if (!cpus_empty(cpumask))
+		set_cpus_allowed(tsk, cpumask);
+
+	tsk->flags |= PF_MEMALLOC;
+
+	/* kscrubd should always run at lowest priority */
+	set_user_nice(current, 19);
+
+	for (;;) {
+		if (current->flags & PF_FREEZE)
+			refrigerator();
+		prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&pgdat->kscrubd_wait, &wait);
+
+		scrub_pgdat(pgdat);
+	}
+	return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+	pg_data_t *pgdat;
+
+	for_each_pgdat(pgdat)
+		pgdat->kscrubd = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+	return 0;
+}
+
+module_init(kscrubd_init)
 #endif
 
 static __meminit
@@ -2202,8 +2328,9 @@ static void __init free_area_init_core(s
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->kscrubd_wait);
 	pgdat->kswapd_max_order = 0;
-	
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
@@ -2793,6 +2920,29 @@ int percpu_pagelist_fraction_sysctl_hand
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * sysctl handler for /proc/sys/vm/scrub_limit to allow control of
+ * the ratio of zeroed pages to free pages.
+ */
+int scrub_limit_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	if (sysctl_scrub_limit <= 1024) {
+		struct zone *zone;
+
+		for_each_zone(zone)
+			wakeup_kscrubd(zone);
+	}
+	return 0;
+}
+#endif
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
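For reference, the check performed by ZONE_NEEDS_MORE_ZEROED_PAGES() in the
mm/page_alloc.c hunks above reduces to a simple comparison. The standalone
user-space snippet below (not part of the patch) shows the arithmetic with
example numbers: with the default scrub_limit of 300, kscrubd is woken while
fewer than roughly 29% (300/1024) of a zone's free pages are zeroed, and a
value of 1024 would require all free pages to be zeroed.

/* Illustration only: the scrub_limit arithmetic used by the patch. */
#include <stdio.h>

static int needs_more_zeroed_pages(unsigned long zeroed_pages,
				   unsigned long free_pages,
				   unsigned long scrub_limit)
{
	/* scrub_limit is a percentage * 10; ">> 10" divides by 1024 (~1000) */
	return zeroed_pages < (free_pages * scrub_limit >> 10);
}

int main(void)
{
	unsigned long free_pages = 10000, scrub_limit = 300;	/* default */

	/* Threshold: 10000 * 300 >> 10 = 2929 pages (about 29.3%) */
	printf("wake kscrubd while zeroed pages < %lu\n",
	       free_pages * scrub_limit >> 10);
	printf("2000 zeroed: %d, 5000 zeroed: %d\n",
	       needs_more_zeroed_pages(2000, free_pages, scrub_limit),
	       needs_more_zeroed_pages(5000, free_pages, scrub_limit));
	return 0;
}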