---
 include/linux/cpu_alloc.h |   25 ----------------------
 mm/cpu_alloc.c            |   51 +++++++++++++++++++++-------------------------
 2 files changed, 24 insertions(+), 52 deletions(-)

Index: linux-2.6/include/linux/cpu_alloc.h
===================================================================
--- linux-2.6.orig/include/linux/cpu_alloc.h	2007-11-02 21:58:09.000000000 -0700
+++ linux-2.6/include/linux/cpu_alloc.h	2007-11-02 21:59:58.000000000 -0700
@@ -43,7 +43,6 @@
 			__alignof__(type))
 #define CPU_FREE(pointer)	cpu_free(pointer, sizeof(*(pointer)))
 
-#ifdef CONFIG_SMP
 /*
  * Raw calls
  */
@@ -82,10 +81,8 @@ extern u8 cpu_area[];
  * THIS_CPU_OFFSET constant that may work more efficiently than this one.
  * (Although this definition is typically very effective).
  */
-#ifndef THIS_CPU_OFFSET
 #define THIS_CPU_OFFSET ((unsigned long)smp_processor_id() \
 				<< CPU_AREA_SHIFT)
-#endif
 
 #define CPU_PTR(__p, __cpu)	((__typeof__(__p))((void *)(__p) + \
 			((unsigned long)(__cpu) << CPU_AREA_SHIFT)))
@@ -94,26 +91,4 @@ extern u8 cpu_area[];
 #define THIS_CPU_PTR(__p)	\
 	((__typeof__(__p))((void *)(__p) + THIS_CPU_OFFSET))
 
-#else /* !SMP */
-
-/*
- * Fallback for the single processor case.
- *
- * This is unsafe to use before slab allocator bootstrap!
- */
-static inline void *cpu_alloc(unsigned long size, gfp_t flags,
-				unsigned long align)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void cpu_free(void *cpu_pointer, unsigned long size)
-{
-	kfree(cpu_pointer);
-}
-
-#define CPU_PTR(__p, __cpu)	(__p)
-#define THIS_CPU_PTR(__p)	(__p)
-#endif
-
 #endif /* _LINUX_CPU_ALLOC_H_ */

Index: linux-2.6/mm/cpu_alloc.c
===================================================================
--- linux-2.6.orig/mm/cpu_alloc.c	2007-11-02 21:58:09.000000000 -0700
+++ linux-2.6/mm/cpu_alloc.c	2007-11-02 22:00:48.000000000 -0700
@@ -7,6 +7,10 @@
  *
  * The per cpu allocator allows dynamic allocation of memory on all
  * processor simultaneously. A bitmap is used to track used areas.
+ * The allocator implements tight packing to reduce cache footprint
+ * and increase speed, since cacheline contention is typically not a concern
+ * for memory mainly used by a single cpu. Small objects will fill up gaps
+ * left by larger allocations that required alignment.
  */
 #include
 #include
@@ -22,9 +26,10 @@
  * CPU_AREA_BLOCK shift is the units in which the cpu areas are extended.
  * Setting it to PAGE_SHIFT allows increasing the per cpu areas in
  * PAGE_SIZE steps (which may be too small for large systems).
+ * The default is 64k if not specified.
  */
 #ifndef CPU_AREA_BLOCK_SHIFT
-#define CPU_AREA_BLOCK_SHIFT PAGE_SHIFT
+#define CPU_AREA_BLOCK_SHIFT 16
 #endif
 
 #ifdef CPU_AREA_BASE
@@ -39,11 +44,11 @@
 /*
  * No base specified. Fall back to a static configuration of the cpu
  * allocator. The cpu areas are of a fixed size. Such configurations
- * are mainly usedful for SMP on machines that do not have MMU support.
- * But it is also satisfactory for basic page and slab allocator
- * needs if the arch code has not yet provided an address for the cpu area.
+ * are mainly useful for SMP on machines that do not have MMU support.
  */
-u8 cpu_area[NR_CPUS][PAGE_SIZE];
+
+u8 cpu_area[NR_CPUS][1 << CPU_AREA_BLOCK_SHIFT];
+
 #define CPU_AREA_STATIC
 #define CPU_AREA_BASE cpu_area
 #define MAX_BLOCKS 1
@@ -116,6 +121,7 @@ static inline int expand_cpu_area(gfp_t
 	return -ENOSYS;
 }
 #else
+
 /*
  * Allocate a block of memory to be used to provide cpu area memory
  * or to extend the bitmap for the cpu map.
@@ -123,18 +129,11 @@ static inline int expand_cpu_area(gfp_t
  */
 void *cpu_area_alloc_block(unsigned long size, gfp_t flags, int node)
 {
-	extern int after_bootmem;
-
-	/* If the main allocator is up use that, fallback to bootmem. */
-	if (after_bootmem) {
-		struct page *page = alloc_pages_node(node,
-			flags | __GFP_ZERO, get_order(size));
-		if (page)
-			return page_address(page);
-		return NULL;
-	}
-	return __alloc_bootmem_node(NODE_DATA(node), size, size,
-			__pa(MAX_DMA_ADDRESS));
+	struct page *page = alloc_pages_node(node,
+		flags | __GFP_ZERO, get_order(size));
+	if (page)
+		return page_address(page);
+	return NULL;
 }
 
 pte_t *cpu_area_pte_populate(pmd_t *pmd, unsigned long addr,
@@ -231,27 +230,25 @@ int cpu_area_populate(void *start, unsig
 static int expand_cpu_area(gfp_t flags)
 {
 	unsigned long blocks = active_blocks;
+	unsigned long bits;
 	int cpu;
 	int err = -ENOMEM;
 	int map_order;
 	unsigned long *new_map = NULL;
+	void *start;
 
 	if (active_blocks == MAX_BLOCKS)
 		goto out;
 
+	spin_unlock(&cpu_alloc_map_lock);
 	/*
 	 * Determine the size of the bit map needed
 	 */
-	map_order = get_order((blocks + 1) * UNITS_PER_BLOCK) /
-			(BITS_PER_LONG * sizeof(unsigned long));
-//	map_order = get_order(BITS_TO_LONGS((active_blocks + 1)
-//			* UNITS_PER_BLOCK) * sizeof(unsigned long));
-	spin_unlock(&cpu_alloc_map_lock);
+	bits = (blocks + 1) * UNITS_PER_BLOCK;
+	map_order = get_order(DIV_ROUND_UP(bits, 8));
+	start = (void *)CPU_AREA_BASE + (blocks << CPU_AREA_BLOCK_SHIFT);
 
 	for_each_possible_cpu(cpu) {
-		void *start = (void *)CPU_AREA_BASE +
-			(blocks << CPU_AREA_BLOCK_SHIFT);
-
 		err = cpu_area_populate(CPU_PTR(start, cpu), BLOCK_SIZE,
 			flags, cpu_to_node(cpu));
@@ -267,8 +264,8 @@ static int expand_cpu_area(gfp_t flags)
 		if (!new_map)
 			goto out;
 	}
-	spin_lock(&cpu_alloc_map_lock);
 
+	spin_lock(&cpu_alloc_map_lock);
 	/*
 	 * We dropped the lock. Another processor may have already extended
 	 * the cpu area size as needed.
@@ -276,7 +273,7 @@ static int expand_cpu_area(gfp_t flags)
 	if (blocks != active_blocks) {
 		if (new_map)
 			free_pages((unsigned long)new_map,
-				cpu_alloc_map_order);
+				map_order);
 		err = 0;
 		goto out;
 	}
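
As an aside for reviewers unfamiliar with the scheme described in the
cpu_alloc.c header comment: the sketch below is a minimal userspace model,
not the kernel code, of first-fit allocation from a bitmap with tight
packing. Aligned requests skip ahead to the next aligned unit, and later
small requests can land in the gaps that skipping leaves behind. The map
representation, UNITS and alloc_units() are illustrative assumptions only.

#include <stdio.h>
#include <string.h>

#define UNITS	64			/* illustrative map size, one flag per unit */

static unsigned char map[UNITS];	/* 0 = free, 1 = used (the kernel uses a real bitmap) */

/* Find 'units' consecutive free units starting at a multiple of 'align_units'. */
static int alloc_units(int units, int align_units)
{
	for (int start = 0; start + units <= UNITS; start += align_units) {
		int i;

		for (i = 0; i < units; i++)
			if (map[start + i])
				break;
		if (i == units) {
			memset(map + start, 1, units);
			return start;	/* unit offset within the cpu area */
		}
	}
	return -1;			/* the kernel would call expand_cpu_area() here */
}

int main(void)
{
	/* Large aligned allocations followed by a small one filling a hole. */
	printf("16 units, align 16 -> unit %d\n", alloc_units(16, 16));
	printf(" 3 units, align  4 -> unit %d\n", alloc_units(3, 4));
	printf(" 8 units, align  8 -> unit %d\n", alloc_units(8, 8));
	printf(" 1 unit,  align  1 -> unit %d\n", alloc_units(1, 1));
	return 0;
}

Running it shows the final 1-unit request landing at unit 19, in the gap
between the 3-unit allocation and the 8-unit aligned allocation at unit 24,
which is the packing behaviour the new comment describes.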
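
The bitmap sizing fix in expand_cpu_area() can be checked in isolation: the
map needs one bit per allocation unit over all blocks including the one
about to be added, rounded up to bytes and then to a page allocation order,
whereas the replaced expression divided a page order by a bit count. The
sketch below recomputes map_order that way in userspace; PAGE_SIZE,
UNITS_PER_BLOCK and the local get_order()/DIV_ROUND_UP() helpers are
stand-ins assumed for illustration, not the kernel definitions.

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define UNITS_PER_BLOCK	((1UL << 16) / 8)	/* assumed: 64k block, 8-byte units */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Smallest order such that (PAGE_SIZE << order) >= size, like the kernel helper. */
static int get_order(unsigned long size)
{
	unsigned long pages = DIV_ROUND_UP(size, PAGE_SIZE);
	int order = 0;

	while ((1UL << order) < pages)
		order++;
	return order;
}

int main(void)
{
	for (unsigned long blocks = 0; blocks < 4; blocks++) {
		/* One bit per allocation unit, covering the block to be added. */
		unsigned long bits = (blocks + 1) * UNITS_PER_BLOCK;
		/* Bits -> bytes -> page allocation order, as in the patch. */
		int map_order = get_order(DIV_ROUND_UP(bits, 8));

		printf("blocks=%lu bits=%lu map_order=%d\n",
		       blocks, bits, map_order);
	}
	return 0;
}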