Index: linux-2.6.16-rc1-mm4/init/Kconfig
===================================================================
--- linux-2.6.16-rc1-mm4.orig/init/Kconfig	2006-01-30 11:27:37.000000000 -0800
+++ linux-2.6.16-rc1-mm4/init/Kconfig	2006-01-30 17:27:11.000000000 -0800
@@ -274,6 +274,19 @@ config KALLSYMS_ALL
 
 	  Say N.
 
+config SCRUBD
+	bool "Scrub Daemon (prezeroing of pages)"
+	depends on EXPERIMENTAL
+	help
+	  The scrub daemon manages a pool of zeroed pages. Pages of higher
+	  order are zeroed when the system is idle (configurable via
+	  /proc/sys/vm/scrub_limit).
+	  If the kernel later needs a zeroed page then a page may be
+	  obtained from these pools instead of hot-zeroing a page.
+	  Prezeroing will in particular speed up applications allocating
+	  large amounts of memory and will be effective for sparse
+	  matrices (this includes multi-level page tables).
+
 config KALLSYMS_EXTRA_PASS
 	bool "Do an extra kallsyms pass"
 	depends on KALLSYMS

Index: linux-2.6.16-rc1-mm4/Documentation/vm/scrubd.txt
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.16-rc1-mm4/Documentation/vm/scrubd.txt	2006-01-30 17:27:11.000000000 -0800
@@ -0,0 +1,38 @@
+The SCRUB Daemon
+----------------
+
+The scrub daemon zeroes memory so that later requests for zeroed memory can
+be satisfied without having to zero memory in a hot code path. The operation
+of scrubd may be controlled through /proc/sys/vm/scrub_limit:
+
+/proc/sys/vm/scrub_limit	default value 300
+
+	This is a percentage multiplied by 10 (i.e. a fraction of 1024). If
+	fewer than this fraction of a zone's free pages are zeroed then the
+	scrub daemon is invoked.
+
+The amount of available zeroed memory may be seen in /proc/meminfo or in
+/proc/buddyinfo.
+
+Zeroing Drivers:
+----------------
+
+If hardware is available that can zero memory without the use of the cpu
+then a driver may be written that registers itself with
+register_zero_driver(). See include/linux/scrubd.h for details and
+arch/ia64/sn/kernel/bte.c for an example of a zeroing driver.
+
+Performance considerations:
+---------------------------
+
+If no zeroing hardware is available then zeroing may invalidate the cpu
+cache and may therefore cause a small performance loss, especially since
+scrubd may zero more pages than are actually needed.
+
+Scrubd is most effective for memory that is only sparsely accessed. Getting a
+prezeroed page for an application that then immediately overwrites all bytes
+in the page does not lead to any performance improvement. However, if the
+application only uses certain cachelines of the page immediately after a page
+fault then scrubd can be of tremendous benefit.
+
+Christoph Lameter, SGI, February 2006.
+
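Note: include/linux/scrubd.h, which declares register_zero_driver(), is not
part of this excerpt, so the registration interface cannot be shown exactly.
The sketch below only illustrates the general shape of a zeroing driver; the
zero_driver structure, its zero_pages callback and the hw_* names are
assumptions made for illustration, and arch/ia64/sn/kernel/bte.c remains the
authoritative example.

/*
 * Hypothetical zeroing-driver skeleton (illustration only).
 * The structure layout and callback name are assumed here, not taken
 * from include/linux/scrubd.h.
 */
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/scrubd.h>	/* declares register_zero_driver() */

/* Assumed callback: clear 2^order pages starting at @page using the
 * zeroing hardware instead of the cpu; return 0 on success. */
static int hw_zero_pages(struct page *page, unsigned int order)
{
	/* ... program the DMA/BTE engine and wait for completion ... */
	return 0;
}

/* Assumed registration structure; the field name is made up. */
static struct zero_driver hw_zero_driver = {
	.zero_pages	= hw_zero_pages,
};

static int __init hw_zero_init(void)
{
	register_zero_driver(&hw_zero_driver);
	return 0;
}
module_init(hw_zero_init);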
Index: linux-2.6.16-rc1-mm4/kernel/sysctl.c
===================================================================
--- linux-2.6.16-rc1-mm4.orig/kernel/sysctl.c	2006-01-30 11:31:18.000000000 -0800
+++ linux-2.6.16-rc1-mm4/kernel/sysctl.c	2006-01-30 17:27:11.000000000 -0800
@@ -153,6 +153,10 @@ extern ctl_table inotify_table[];
 int sysctl_legacy_va_layout;
 #endif
 
+#ifdef CONFIG_SCRUBD
+extern int sysctl_scrub_limit;
+#endif
+
 /* /proc declarations: */
 
 #ifdef CONFIG_PROC_FS
@@ -898,6 +902,18 @@ static ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
+#ifdef CONFIG_SCRUBD
+	{
+		.ctl_name	= VM_SCRUB_LIMIT,
+		.procname	= "scrub_limit",
+		.data		= &sysctl_scrub_limit,
+		.maxlen		= sizeof(sysctl_scrub_limit),
+		.mode		= 0644,
+		.proc_handler	= &scrub_limit_handler,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+#endif
 	{ .ctl_name = 0 }
 };
 
Index: linux-2.6.16-rc1-mm4/include/linux/sysctl.h
===================================================================
--- linux-2.6.16-rc1-mm4.orig/include/linux/sysctl.h	2006-01-30 11:35:20.000000000 -0800
+++ linux-2.6.16-rc1-mm4/include/linux/sysctl.h	2006-01-30 17:27:11.000000000 -0800
@@ -185,6 +185,7 @@ enum
 	VM_PERCPU_PAGELIST_FRACTION=30,/* int: fraction of pages in each percpu_pagelist */
 	VM_ZONE_RECLAIM_MODE=31,	/* reclaim local zone memory before going off node */
 	VM_ZONE_RECLAIM_INTERVAL=32,	/* time period to wait after reclaim failure */
+	VM_SCRUB_LIMIT=33,		/* if zeroed memory falls below this limit, run kscrubd */
 };
 
 
Index: linux-2.6.16-rc1-mm4/include/linux/mmzone.h
===================================================================
--- linux-2.6.16-rc1-mm4.orig/include/linux/mmzone.h	2006-01-30 17:27:07.000000000 -0800
+++ linux-2.6.16-rc1-mm4/include/linux/mmzone.h	2006-01-30 17:27:11.000000000 -0800
@@ -324,6 +324,9 @@ typedef struct pglist_data {
 	wait_queue_head_t kswapd_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
+
+	wait_queue_head_t kscrubd_wait;
+	struct task_struct *kscrubd;
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
@@ -466,6 +469,8 @@ int lowmem_reserve_ratio_sysctl_handler(
 			void __user *, size_t *, loff_t *);
 int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *,
 			void __user *, size_t *, loff_t *);
+int scrub_limit_handler(struct ctl_table *, int, struct file *,
+			void __user *, size_t *, loff_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
Index: linux-2.6.16-rc1-mm4/mm/page_alloc.c
===================================================================
--- linux-2.6.16-rc1-mm4.orig/mm/page_alloc.c	2006-01-30 17:27:07.000000000 -0800
+++ linux-2.6.16-rc1-mm4/mm/page_alloc.c	2006-01-30 17:37:24.000000000 -0800
@@ -69,6 +69,23 @@ int percpu_pagelist_fraction;
  */
 int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
 
+#ifdef CONFIG_NUMA
+/*
+ * The scrub limit is the percentage of the free pages that must
+ * be zeroed. If the zeroed pages fall below that limit then all free pages
+ * of the zone will be zeroed.
+ */
+int sysctl_scrub_limit __read_mostly = 300;	/* 300/1024 */
+
+/*
+ * Check if a zone has enough zeroed pages. The limit is specified as a
+ * percentage multiplied by 10 in order to allow a division by 1024
+ * (almost 1000) by shifting right by 10.
+ */
+#define ZONE_NEEDS_MORE_ZEROED_PAGES(z) \
+	((z)->zeroed_pages < (z)->free_pages * sysctl_scrub_limit >> 10)
+
+#endif
+
 EXPORT_SYMBOL(totalram_pages);
 
 /*
@@ -583,7 +600,38 @@ static int prep_new_page(struct page *pa
 	return 0;
 }
 
-/*
+
+#ifdef CONFIG_SCRUBD
+/*
+ * Special access for scrubd:
+ * Remove the first unzeroed page from the indicated free area.
+ */
+struct page *scrubd_rmpage(struct zone *zone, int order)
+{
+	struct page *page;
+	struct list_head *l;
+	struct free_area *area = zone->free_area + order;
+
+	spin_lock(&zone->lock);
+	list_for_each(l, &area->free_list) {
+		page = list_entry(l, struct page, lru);
+		if (!PageZeroed(page)) {
+			list_del(&page->lru);
+			rmv_page_order(page);
+			area->nr_free--;
+			zone->free_pages -= 1 << order;
+			goto out;
+		}
+	}
+	page = NULL;
+out:
+	spin_unlock(&zone->lock);
+	return page;
+}
+
+#endif
+
+/*
  * Do the hard work of removing an element from the buddy allocator.
  * Call me with the zone->lock already held.
  */
@@ -623,7 +671,7 @@ static int rmqueue_bulk(struct zone *zon
 			unsigned long count, struct list_head *list, int last)
 {
 	int i;
-	
+
 	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
 		struct page *page = __rmqueue(zone, order, last);
@@ -636,6 +684,13 @@ static int rmqueue_bulk(struct zone *zon
 }
 
 #ifdef CONFIG_NUMA
+void wakeup_kscrubd(struct zone *zone)
+{
+	if (!waitqueue_active(&zone->zone_pgdat->kscrubd_wait))
+		return;
+	wake_up_interruptible(&zone->zone_pgdat->kscrubd_wait);
+}
+
 /* Called from the slab reaper to drain remote pagesets */
 void drain_remote_pages(void)
 {
@@ -647,9 +702,15 @@ void drain_remote_pages(void)
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
-		/* Do not drain local pagesets */
-		if (zone->zone_pgdat->node_id == numa_node_id())
+		/*
+		 * Do not drain local pagesets. Check zeroed pages
+		 * instead.
+		 */
+		if (zone->zone_pgdat->node_id == numa_node_id()) {
+			if (ZONE_NEEDS_MORE_ZEROED_PAGES(zone))
+				wakeup_kscrubd(zone);
 			continue;
+		}
 
 		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < NR_PER_CPU_PAGES; i++) {
@@ -2130,6 +2191,71 @@ void __init setup_per_cpu_pageset(void)
 	register_cpu_notifier(&pageset_notifier);
 }
 
+/*
+ * scrub_pgdat() will work across all this node's zones.
+ */
+static void scrub_pgdat(pg_data_t *pgdat)
+{
+	int order;
+	struct page *page;
+	struct zone *zone;
+
+	for (zone = pgdat->node_zones; zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+		if (!ZONE_NEEDS_MORE_ZEROED_PAGES(zone))
+			continue;
+
+		for (order = MAX_ORDER - 1; order > 0; order--) {
+			while ((page = scrubd_rmpage(zone, order))) {
+				prep_zero_page(page, order, GFP_KERNEL);
+				__SetPageZeroed(page);
+				zone->zeroed_pages += 1 << order;
+				__free_one_page(page, zone, order);
+			}
+		}
+	}
+}
+
+/*
+ * The background scrub daemon.
+ */
+static int kscrubd(void *p)
+{
+	pg_data_t *pgdat = (pg_data_t *)p;
+	struct task_struct *tsk = current;
+	DEFINE_WAIT(wait);
+	cpumask_t cpumask;
+
+	daemonize("kscrubd%d", pgdat->node_id);
+	cpumask = node_to_cpumask(pgdat->node_id);
+	if (!cpus_empty(cpumask))
+		set_cpus_allowed(tsk, cpumask);
+
+	tsk->flags |= PF_MEMALLOC;
+
+	/* kscrubd should always run at lowest priority */
+	set_user_nice(current, 19);
+
+	for (;;) {
+		if (current->flags & PF_FREEZE)
+			refrigerator();
+		prepare_to_wait(&pgdat->kscrubd_wait, &wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&pgdat->kscrubd_wait, &wait);
+
+		scrub_pgdat(pgdat);
+	}
+	return 0;
+}
+
+static int __init kscrubd_init(void)
+{
+	pg_data_t *pgdat;
+
+	for_each_pgdat(pgdat)
+		pgdat->kscrubd = find_task_by_pid(kernel_thread(kscrubd, pgdat, CLONE_KERNEL));
+	return 0;
+}
+
+module_init(kscrubd_init)
 #endif
 
 static __meminit
@@ -2202,8 +2328,9 @@ static void __init free_area_init_core(s
 	pgdat_resize_init(pgdat);
 	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
+	init_waitqueue_head(&pgdat->kscrubd_wait);
 	pgdat->kswapd_max_order = 0;
-	
+
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize;
@@ -2793,6 +2920,29 @@ int percpu_pagelist_fraction_sysctl_hand
 	return 0;
 }
 
+#ifdef CONFIG_NUMA
+/*
+ * sysctl handler for /proc/sys/vm/scrub_limit to allow control of
+ * the ratio of zeroed pages to free pages.
+ */
+int scrub_limit_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	if (sysctl_scrub_limit <= 1024) {
+		struct zone *zone;
+
+		for_each_zone(zone)
+			wakeup_kscrubd(zone);
+	}
+	return 0;
+}
+#endif
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
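For reference, the check performed by ZONE_NEEDS_MORE_ZEROED_PAGES() in the
mm/page_alloc.c hunks above reduces to a simple comparison. The standalone
user-space snippet below (not part of the patch) shows the arithmetic with
example numbers: with the default scrub_limit of 300, kscrubd is woken while
fewer than roughly 29% (300/1024) of a zone's free pages are zeroed, and a
value of 1024 would require all free pages to be zeroed.

/* Illustration only: the scrub_limit arithmetic used by the patch. */
#include <stdio.h>

static int needs_more_zeroed_pages(unsigned long zeroed_pages,
				   unsigned long free_pages,
				   unsigned long scrub_limit)
{
	/* scrub_limit is a percentage * 10; ">> 10" divides by 1024 (~1000) */
	return zeroed_pages < (free_pages * scrub_limit >> 10);
}

int main(void)
{
	unsigned long free_pages = 10000, scrub_limit = 300;	/* default */

	/* Threshold: 10000 * 300 >> 10 = 2929 pages (about 29.3%) */
	printf("wake kscrubd while zeroed pages < %lu\n",
	       free_pages * scrub_limit >> 10);
	printf("2000 zeroed: %d, 5000 zeroed: %d\n",
	       needs_more_zeroed_pages(2000, free_pages, scrub_limit),
	       needs_more_zeroed_pages(5000, free_pages, scrub_limit));
	return 0;
}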