Page Allocator: Use allocpercpu

Use the new allocpercpu functionality to avoid per cpu arrays in struct
zone. Surprisingly this clears up much of the painful NUMA bringup.
Bootstrap becomes very simple. No need for a bootstrap pageset anymore.

After this patch all per cpu pagesets are placed in the per cpu area.
This means that in the future they may be manipulated using special per
cpu operations that do not require disabling preemption, etc. VM
statistics can become much more efficient, and there is the potential to
optimize the list operations in the same way as in SLUB so that they
work without disabling interrupts.

Signed-off-by: Christoph Lameter

---
 include/linux/mmzone.h |   16 ++++---
 mm/page_alloc.c        |  103 +++++++------------------------------------------
 2 files changed, 25 insertions(+), 94 deletions(-)

Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h	2007-11-01 11:47:43.665837139 -0700
+++ linux-2.6/include/linux/mmzone.h	2007-11-01 12:07:10.803743003 -0700
@@ -121,12 +121,14 @@ struct per_cpu_pageset {
 	s8 stat_threshold;
 	s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS];
 #endif
-} ____cacheline_aligned_in_smp;
+};
 
-#ifdef CONFIG_NUMA
-#define zone_pcp(__z, __cpu) ((__z)->pageset[(__cpu)])
+#ifdef CONFIG_SMP
+#define zone_pcp(__z, __cpu) percpu_ptr((__z)->pageset, (__cpu))
+#define this_pcp(__z) this_cpu_ptr((__z)->pageset)
 #else
-#define zone_pcp(__z, __cpu) (&(__z)->pageset[(__cpu)])
+#define zone_pcp(__z, __cpu) (&(__z)->pageset)
+#define this_pcp(__z) (&(__z)->pageset)
 #endif
 
 enum zone_type {
@@ -231,9 +233,11 @@ struct zone {
 	 */
 	unsigned long		min_unmapped_pages;
 	unsigned long		min_slab_pages;
-	struct per_cpu_pageset	*pageset[NR_CPUS];
+#endif
+#ifdef CONFIG_SMP
+	struct per_cpu_pageset	*pageset;
 #else
-	struct per_cpu_pageset	pageset[NR_CPUS];
+	struct per_cpu_pageset	pageset;
 #endif
 	/*
 	 * free areas of different sizes
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2007-11-01 11:47:43.673837325 -0700
+++ linux-2.6/mm/page_alloc.c	2007-11-01 12:10:36.922743011 -0700
@@ -1011,8 +1011,8 @@ static void fastcall free_hot_cold_page(
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
-	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	pcp = &this_pcp(zone)->pcp[cold];
 	__count_vm_event(PGFREE);
 	list_add(&page->lru, &pcp->list);
 	set_page_private(page, get_pageblock_migratetype(page));
@@ -1022,7 +1022,6 @@ static void fastcall free_hot_cold_page(
 		pcp->count -= pcp->batch;
 	}
 	local_irq_restore(flags);
-	put_cpu();
 }
 
 void fastcall free_hot_page(struct page *page)
@@ -1064,16 +1063,14 @@ static struct page *buffered_rmqueue(str
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
-	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
 again:
-	cpu = get_cpu();
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
-		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
+		pcp = &this_pcp(zone)->pcp[cold];
 		if (!pcp->count) {
 			pcp->count = rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
@@ -1106,7 +1103,6 @@ again:
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
 	if (prep_new_page(page, order, gfp_flags))
@@ -1115,7 +1111,6 @@ again:
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
@@ -2645,33 +2640,13 @@ static void setup_pagelist_highmark(stru
 }
 
-#ifdef CONFIG_NUMA
-/*
- * Boot pageset table. One per cpu which is going to be used for all
- * zones and all nodes. The parameters will be set in such a way
- * that an item put on a list will immediately be handed over to
- * the buddy list. This is safe since pageset manipulation is done
- * with interrupts disabled.
- *
- * Some NUMA counter updates may also be caught by the boot pagesets.
- *
- * The boot_pagesets must be kept even after bootup is complete for
- * unused processors and/or zones. They do play a role for bootstrapping
- * hotplugged processors.
- *
- * zoneinfo_show() and maybe other functions do
- * not check if the processor is online before following the pageset pointer.
- * Other parts of the kernel may not check if the zone is available.
- */
-static struct per_cpu_pageset boot_pageset[NR_CPUS];
-
 /*
  * Dynamically allocate memory for the
  * per cpu pageset array in struct zone.
  */
-static int __cpuinit process_zones(int cpu)
+static void __cpuinit process_zones(int cpu)
 {
-	struct zone *zone, *dzone;
+	struct zone *zone;
 	int node = cpu_to_node(cpu);
 
 	node_set_state(node, N_CPU);	/* this node has a cpu */
@@ -2681,43 +2656,10 @@ static int __cpuinit process_zones(int c
 		if (!populated_zone(zone))
 			continue;
 
-		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
-					 GFP_KERNEL, node);
-		if (!zone_pcp(zone, cpu))
-			goto bad;
-
-		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
-
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
				(zone->present_pages / percpu_pagelist_fraction));
 	}
-
-	return 0;
-bad:
-	for_each_zone(dzone) {
-		if (!populated_zone(dzone))
-			continue;
-		if (dzone == zone)
-			break;
-		kfree(zone_pcp(dzone, cpu));
-		zone_pcp(dzone, cpu) = NULL;
-	}
-	return -ENOMEM;
-}
-
-static inline void free_zone_pagesets(int cpu)
-{
-	struct zone *zone;
-
-	for_each_zone(zone) {
-		struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
-
-		/* Free per_cpu_pageset if it is slab allocated */
-		if (pset != &boot_pageset[cpu])
-			kfree(pset);
-		zone_pcp(zone, cpu) = NULL;
-	}
 }
 
 static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -2730,14 +2672,7 @@ static int __cpuinit pageset_cpuup_callb
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		if (process_zones(cpu))
-			ret = NOTIFY_BAD;
-		break;
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		free_zone_pagesets(cpu);
+		process_zones(cpu);
 		break;
 	default:
 		break;
@@ -2750,19 +2685,15 @@ static struct notifier_block __cpuinitda
 
 void __init setup_per_cpu_pageset(void)
 {
-	int err;
-
-	/* Initialize per_cpu_pageset for cpu 0.
+	/*
+	 * Initialize per_cpu settings for the boot cpu.
 	 * A cpuup callback will do this for every cpu
-	 * as it comes online
+	 * as it comes online.
 	 */
-	err = process_zones(smp_processor_id());
-	BUG_ON(err);
+	process_zones(smp_processor_id());
 	register_cpu_notifier(&pageset_notifier);
 }
 
-#endif
-
 static noinline __init_refok
 int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 {
@@ -2811,15 +2742,11 @@ static __meminit void zone_pcp_init(stru
 	int cpu;
 	unsigned long batch = zone_batchsize(zone);
 
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
-#ifdef CONFIG_NUMA
-		/* Early boot. Slab allocator not functional yet */
-		zone_pcp(zone, cpu) = &boot_pageset[cpu];
-		setup_pageset(&boot_pageset[cpu],0);
-#else
-		setup_pageset(zone_pcp(zone,cpu), batch);
+#ifdef CONFIG_SMP
+	zone->pageset = alloc_percpu(struct per_cpu_pageset);
 #endif
-	}
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		setup_pageset(zone_pcp(zone,cpu), batch);
 	if (zone->present_pages)
 		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
 			zone->name, zone->present_pages, batch);
@@ -4237,8 +4164,8 @@ int percpu_pagelist_fraction_sysctl_hand
 	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
 	if (!write || (ret == -EINVAL))
 		return ret;
-	for_each_zone(zone) {
-		for_each_online_cpu(cpu) {
+	for_each_online_cpu(cpu) {
+		for_each_zone(zone) {
 			unsigned long high;
 			high = zone->present_pages / percpu_pagelist_fraction;
 			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
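
A note for readers less familiar with the interface this patch builds on: the
sketch below shows the allocpercpu allocation and access pattern that
zone_pcp()/this_pcp() reduce to on SMP after this change. It is an illustration
only, not part of the patch. The struct foo_stats payload and the
foo_init()/foo_count() functions are made up for the example; alloc_percpu()
and percpu_ptr() are the interfaces the patch itself uses, and
for_each_possible_cpu() and the irq helpers are standard kernel facilities.

/*
 * Illustration only: dynamic per cpu data via allocpercpu.
 * foo_stats, foo_init() and foo_count() are hypothetical.
 */
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/irqflags.h>
#include <linux/errno.h>

struct foo_stats {
	unsigned long events;
};

/* Per cpu base pointer, analogous to zone->pageset on SMP. */
static struct foo_stats *foo_stats;

static int foo_init(void)
{
	int cpu;

	/* One instance per cpu, initialized up front, much as
	 * zone_pcp_init() now does for the pagesets. */
	foo_stats = alloc_percpu(struct foo_stats);
	if (!foo_stats)
		return -ENOMEM;

	for_each_possible_cpu(cpu)
		percpu_ptr(foo_stats, cpu)->events = 0;
	return 0;
}

static void foo_count(void)
{
	unsigned long flags;

	/*
	 * Touch this cpu's instance. Interrupts are disabled for the same
	 * reason as in the page allocator paths above: the data may also be
	 * accessed from interrupt context. No get_cpu()/put_cpu() pair is
	 * needed, since local_irq_save() already keeps us on this cpu.
	 */
	local_irq_save(flags);
	percpu_ptr(foo_stats, smp_processor_id())->events++;
	local_irq_restore(flags);
}

The changelog's forward looking remark maps onto foo_count(): once the data
lives in the per cpu area, such an access becomes a candidate for dedicated
per cpu operations that avoid the explicit preemption or interrupt disabling.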