---

 include/linux/gfp.h |    6 ---
 mm/page_alloc.c     |   65 +++++++++++++++++++++++++++++++++++++++-
 mm/slab.c           |   42 -------------------------
 mm/slub.c           |   84 ----------------------------------------------------
 4 files changed, 63 insertions(+), 134 deletions(-)

Index: slub/mm/page_alloc.c
===================================================================
--- slub.orig/mm/page_alloc.c	2007-04-28 19:30:01.000000000 -0700
+++ slub/mm/page_alloc.c	2007-04-28 19:37:30.000000000 -0700
@@ -933,8 +933,6 @@ static void __init setup_nr_node_ids(voi
 
 #ifdef CONFIG_NUMA
 /*
- * Called from the slab reaper to drain pagesets on a particular node that
- * belongs to the currently executing processor.
  * Note that this function must be called with the thread pinned to
  * a single processor.
  */
@@ -971,6 +969,69 @@ void drain_node_pages(int nodeid)
 		}
 	}
 }
+
+static DEFINE_PER_CPU(unsigned long, reap_node);
+static DEFINE_PER_CPU(struct delayed_work, node_reap_work);
+static unsigned long drain_interval;
+
+static void reap_node(struct work_struct *w)
+{
+	int node = __get_cpu_var(reap_node);
+
+	/*
+	 * Only drain per cpu pages on remote zones
+	 */
+	if (node != numa_node_id())
+		drain_node_pages(node);
+
+	node = next_node(node, node_online_map);
+	if (unlikely(node >= nr_node_ids))
+		node = first_node(node_online_map);
+	__get_cpu_var(reap_node) = node;
+
+	schedule_delayed_work(&__get_cpu_var(node_reap_work),
+				drain_interval);
+}
+
+static void __devinit start_cpu_timer(int cpu)
+{
+	struct delayed_work *node_reap_work = &per_cpu(node_reap_work, cpu);
+	int node;
+
+	node = next_node(cpu_to_node(cpu), node_online_map);
+	if (node >= nr_node_ids)
+		node = first_node(node_online_map);
+
+	per_cpu(reap_node, cpu) = node;
+	INIT_DELAYED_WORK(node_reap_work, reap_node);
+	schedule_delayed_work_on(cpu, node_reap_work, HZ);
+}
+
+static int __init setup_node_draining(void)
+{
+	int cpu;
+
+	/*
+	 * Determine the interval for draining remote nodes.
+	 * Per cpu pages for remote nodes should be flushed at an interval
+	 * of once per minute or so.
+	 *
+	 * The worst case here is a system with 1024 nodes, which will have
+	 * a huge number of remote node queues on each node. Then we get to
+	 * a calling interval of 60 * HZ / 1024 jiffies, roughly 60ms, which
+	 * sadly gets into the range of the timer interrupt frequency.
+	 *
+	 * A more typical 2 node system only needs to do this every 30 seconds.
+	 *
+	 * All the reaping can run concurrently since it is very unlikely
+	 * that all processors will be draining memory to the same node.
+	 */
+	drain_interval = (60 * HZ) / num_online_nodes();
+	for_each_online_cpu(cpu)
+		start_cpu_timer(cpu);
+	return 0;
+}
+__initcall(setup_node_draining);
 #endif
 
 static void __drain_pages(unsigned int cpu)
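For reference, the rotation step that reap_node() performs can be modelled outside the kernel. In the minimal user-space sketch below, the `online` bitmask and the first_set_bit()/next_set_bit() helpers are hypothetical stand-ins for node_online_map and first_node()/next_node(), not kernel APIs; it only demonstrates how each pass advances the cursor to the next online node and wraps around:

#include <stdio.h>

#define MAX_NODES 8			/* stands in for MAX_NUMNODES */

/* Hypothetical stand-in for first_node(): lowest set bit, or MAX_NODES. */
static int first_set_bit(unsigned int mask)
{
	for (int i = 0; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return MAX_NODES;
}

/* Hypothetical stand-in for next_node(): next set bit above prev. */
static int next_set_bit(unsigned int mask, int prev)
{
	for (int i = prev + 1; i < MAX_NODES; i++)
		if (mask & (1u << i))
			return i;
	return MAX_NODES;		/* exhausted, like MAX_NUMNODES */
}

int main(void)
{
	unsigned int online = 0x2d;	/* nodes 0, 2, 3 and 5 online */
	int node = first_set_bit(online);

	/* Ten passes print 0, 2, 3, 5, 0, 2, 3, 5, 0, 2: round robin. */
	for (int i = 0; i < 10; i++) {
		printf("drain node %d\n", node);
		node = next_set_bit(online, node);
		if (node >= MAX_NODES)	/* wrap around, as reap_node() does */
			node = first_set_bit(online);
	}
	return 0;
}

Since every CPU keeps its own reap_node cursor, the rotations run independently and rarely hit the same remote node at the same time.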
Index: slub/mm/slab.c
===================================================================
--- slub.orig/mm/slab.c	2007-04-28 19:35:26.000000000 -0700
+++ slub/mm/slab.c	2007-04-28 19:39:19.000000000 -0700
@@ -902,46 +902,5 @@ __setup("noaliencache", noaliencache_set
 
-#ifdef CONFIG_NUMA
 /*
- * Special reaping functions for NUMA systems called from cache_reap().
- * These take care of doing round robin flushing of alien caches (containing
- * objects freed on different nodes from which they were allocated) and the
- * flushing of remote pcps by calling drain_node_pages.
- */
-static DEFINE_PER_CPU(unsigned long, reap_node);
-
-static void init_reap_node(int cpu)
-{
-	int node;
-
-	node = next_node(cpu_to_node(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	per_cpu(reap_node, cpu) = node;
-}
-
-static void next_reap_node(void)
-{
-	int node = __get_cpu_var(reap_node);
-
-	/*
-	 * Also drain per cpu pages on remote zones
-	 */
-	if (node != numa_node_id())
-		drain_node_pages(node);
-
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
-	__get_cpu_var(reap_node) = node;
-}
-
-#else
-#define init_reap_node(cpu) do { } while (0)
-#define next_reap_node(void) do { } while (0)
-#endif
-
-/*
  * Initiate the reap timer running on the target CPU. We run at around 1 to 2Hz
  * via the workqueue/eventd.
  * Add the CPU number into the expiration time to minimize the possibility of
@@ -958,7 +917,6 @@ static void __devinit start_cpu_timer(in
 	 * at that time.
 	 */
 	if (keventd_up() && reap_work->work.func == NULL) {
-		init_reap_node(cpu);
 		INIT_DELAYED_WORK(reap_work, cache_reap);
 		schedule_delayed_work_on(cpu, reap_work,
 			__round_jiffies_relative(HZ, cpu));
Index: slub/include/linux/gfp.h
===================================================================
--- slub.orig/include/linux/gfp.h	2007-04-28 19:30:01.000000000 -0700
+++ slub/include/linux/gfp.h	2007-04-28 19:37:30.000000000 -0700
@@ -198,10 +198,4 @@ extern void FASTCALL(free_cold_page(stru
 #define free_page(addr) free_pages((addr),0)
 
 void page_alloc_init(void);
-#ifdef CONFIG_NUMA
-void drain_node_pages(int node);
-#else
-static inline void drain_node_pages(int node) { };
-#endif
-
 #endif /* __LINUX_GFP_H */
Index: slub/mm/slub.c
===================================================================
--- slub.orig/mm/slub.c	2007-04-28 19:36:12.000000000 -0700
+++ slub/mm/slub.c	2007-04-28 19:40:06.000000000 -0700
@@ -2464,90 +2464,6 @@ static struct notifier_block __cpuinitda
 
 #endif
 
-#ifdef CONFIG_NUMA
-
-/*****************************************************************
- * Generic reaper used to support the page allocator
- * (the cpu slabs are reaped by a per slab workqueue).
- *
- * Maybe move this to the page allocator?
- ****************************************************************/ - -static DEFINE_PER_CPU(unsigned long, reap_node); - -static void init_reap_node(int cpu) -{ - int node; - - node = next_node(cpu_to_node(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - __get_cpu_var(reap_node) = node; -} - -static void next_reap_node(void) -{ - int node = __get_cpu_var(reap_node); - - /* - * Also drain per cpu pages on remote zones - */ - if (node != numa_node_id()) - drain_node_pages(node); - - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; -} -#else -#define init_reap_node(cpu) do { } while (0) -#define next_reap_node(void) do { } while (0) -#endif - -#define REAPTIMEOUT_CPUC (2*HZ) - -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct delayed_work, reap_work); - -static void cache_reap(struct work_struct *unused) -{ - next_reap_node(); - schedule_delayed_work(&__get_cpu_var(reap_work), - REAPTIMEOUT_CPUC); -} - -static void __devinit start_cpu_timer(int cpu) -{ - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); - - /* - * When this gets called from do_initcalls via cpucache_init(), - * init_workqueues() has already run, so keventd will be setup - * at that time. - */ - if (keventd_up() && reap_work->work.func == NULL) { - init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); - schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); - } -} - -static int __init cpucache_init(void) -{ - int cpu; - - /* - * Register the timers that drain pcp pages and update vm statistics - */ - for_each_online_cpu(cpu) - start_cpu_timer(cpu); - return 0; -} -__initcall(cpucache_init); -#endif - #ifdef SLUB_RESILIENCY_TEST static unsigned long validate_slab_cache(struct kmem_cache *s);