Page Allocator: Use allocpercpu

Use the new allocpercpu functionality to avoid per cpu arrays in struct
zone. Surprisingly this clears up much of the painful NUMA bringup.
Bootstrap becomes very simple. No need for a bootstrap pageset anymore.

After this patch all per cpu pagesets are placed in the per cpu area.
This means that in the future they may be manipulated using special per
cpu operations that do not require disabling preemption, etc. VM
statistics can become much more efficient, and there is the potential to
optimize the list operations in the same way as in SLUB so that they
work without disabling interrupts.

Signed-off-by: Christoph Lameter

---
 include/linux/mmzone.h |   16 ++++---
 mm/page_alloc.c        |  103 +++++++------------------------------------------
 2 files changed, 25 insertions(+), 94 deletions(-)

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2007-11-01 11:47:43.665837139 -0700
+++ linux-2.6/include/linux/mmzone.h	2007-11-01 12:07:10.803743003 -0700
@@ -121,12 +121,14 @@ struct per_cpu_pageset {
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
 #endif
-} ____cacheline_aligned_in_smp;
+};
 
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
+#ifdef CONFIG_SMP
+#define zone_pcp(__z, __cpu) percpu_ptr((__z)->pageset, (__cpu))
+#define this_pcp(__z) this_cpu_ptr((__z)->pageset)
 #else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
+#define zone_pcp(__z, __cpu) (&(__z)->pageset)
+#define this_pcp(__z) (&(__z)->pageset)
 #endif
 
 enum zone_type {
@@ -231,9 +233,11 @@ struct zone {
 	 */
 	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
-	struct per_cpu_pageset	*pageset[NR_CPUS];
+#endif
+#ifdef CONFIG_SMP
+	struct per_cpu_pageset	*pageset;
 #else
-	struct per_cpu_pageset	pageset[NR_CPUS];
+	struct per_cpu_pageset	pageset;
 #endif
 	/*
 	 * free areas of different sizes
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2007-11-01 11:47:43.673837325 -0700
+++ linux-2.6/mm/page_alloc.c	2007-11-01 12:10:36.922743011 -0700
@@ -1011,8 +1011,8 @@ static void fastcall free_hot_cold_page(
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	pcp = &this_pcp(zone)->pcp[cold];
 	__count_vm_event(PGFREE);
 	list_add(&page->lru, &pcp->list);
 	set_page_private(page, get_pageblock_migratetype(page));
@@ -1022,7 +1022,6 @@ static void fastcall free_hot_cold_page(
 		pcp->count -= pcp->batch;
 	}
 	local_irq_restore(flags);
-	put_cpu();
 }
 
 void fastcall free_hot_page(struct page *page)
@@ -1064,16 +1063,14 @@ static struct page *buffered_rmqueue(str
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
 again:
-	cpu = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
+		pcp = &this_pcp(zone)->pcp[cold];
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
@@ -1106,7 +1103,6 @@ again:
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1115,7 +1111,6 @@ again:
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
@@ -2645,33 +2640,13 @@ static void setup_pagelist_highmark(stru
 }
 
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
 /*
  * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
-static int __cpuinit process_zones(int cpu)
+static void __cpuinit process_zones(int cpu)
 {
-	struct zone *zone, *dzone;
+	struct zone *zone;
 	int node = cpu_to_node(cpu);
 
 	node_set_state(node, N_CPU);	/* this node has a cpu */
@@ -2681,43 +2656,10 @@ static int __cpuinit process_zones(int c
 		if (!populated_zone(zone))
 			continue;
 
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, node);
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
				(zone->present_pages / percpu_pagelist_fraction));
 	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (!populated_zone(dzone))
-			continue;
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		/* Free per_cpu_pageset if it is slab allocated */
-		if (pset != &boot_pageset[cpu])
-			kfree(pset);
-		zone_pcp(zone, cpu) = NULL;
-	}
 }
 
 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -2730,14 +2672,7 @@ static int __cpuinit pageset_cpuup_callb
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (process_zones(cpu))
-			ret = NOTIFY_BAD;
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		free_zone_pagesets(cpu);
+		process_zones(cpu);
 		break;
 	default:
 		break;
@@ -2750,19 +2685,15 @@ static struct notifier_block __cpuinitda
 
 void __init setup_per_cpu_pageset(void)
 {
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
+	/*
+	 * Initialize per_cpu settings for the boot cpu.
 	 * A cpuup callback will do this for every cpu
-	 * as it comes online
+	 * as it comes online.
 	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
+	process_zones(smp_processor_id());
 	register_cpu_notifier(&pageset_notifier);
 }
 
-#endif
-
 static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
@@ -2811,15 +2742,11 @@ static __meminit void zone_pcp_init(stru
 	int cpu;
 	unsigned long batch = zone_batchsize(zone);
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
+#ifdef CONFIG_SMP
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
 #endif
-	}
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		setup_pageset(zone_pcp(zone,cpu), batch);
 	if (zone->present_pages)
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 			zone->name, zone->present_pages, batch);
@@ -4237,8 +4164,8 @@ int percpu_pagelist_fraction_sysctl_hand
 	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 	if (!write || (ret == -EINVAL))
 		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
+	for_each_online_cpu(cpu) {
+		for_each_zone(zone) {
 			unsigned long high;
 			high = zone->present_pages / percpu_pagelist_fraction;
 			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
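
A note for readers less familiar with the interface this patch builds on: the
sketch below shows the allocpercpu allocation and access pattern that
zone_pcp()/this_pcp() reduce to on SMP after this change. It is an illustration
only, not part of the patch. The struct foo_stats payload and the
foo_init()/foo_count() functions are made up for the example; alloc_percpu()
and percpu_ptr() are the interfaces the patch itself uses, and
for_each_possible_cpu() and the irq helpers are standard kernel facilities.

/*
 * Illustration only: dynamic per cpu data via allocpercpu.
 * foo_stats, foo_init() and foo_count() are hypothetical.
 */
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/irqflags.h>
#include <linux/errno.h>

struct foo_stats {
	unsigned long events;
};

/* Per cpu base pointer, analogous to zone->pageset on SMP. */
static struct foo_stats *foo_stats;

static int foo_init(void)
{
	int cpu;

	/* One instance per cpu, initialized up front, much as
	 * zone_pcp_init() now does for the pagesets. */
	foo_stats = alloc_percpu(struct foo_stats);
	if (!foo_stats)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		percpu_ptr(foo_stats, cpu)->events = 0;
	return 0;
}

static void foo_count(void)
{
	unsigned long flags;

	/*
	 * Touch this cpu's instance. Interrupts are disabled for the same
	 * reason as in the page allocator paths above: the data may also be
	 * accessed from interrupt context. No get_cpu()/put_cpu() pair is
	 * needed, since local_irq_save() already keeps us on this cpu.
	 */
	local_irq_save(flags);
	percpu_ptr(foo_stats, smp_processor_id())->events++;
	local_irq_restore(flags);
}

The changelog's forward looking remark maps onto foo_count(): once the data
lives in the per cpu area, such an access becomes a candidate for dedicated
per cpu operations that avoid the explicit preemption or interrupt disabling.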