Index: linux-2.6.18-rc4-mm2/include/linux/kmalloc.h
===================================================================
--- linux-2.6.18-rc4-mm2.orig/include/linux/kmalloc.h	2006-08-25 23:48:24.369666763 -0700
+++ linux-2.6.18-rc4-mm2/include/linux/kmalloc.h	2006-08-26 16:26:16.212863974 -0700
@@ -15,16 +15,13 @@
 #define KMALLOC_ALLOCATOR slabifier_allocator
 #endif
 
-#ifdef ARCH_NEEDS_SMALL_SLABS
 #define KMALLOC_SHIFT_LOW 3
-#else
-#define KMALLOC_SHIFT_LOW 7
-#endif
-#define KMALLOC_SHIFT_HIGH 20
+#define KMALLOC_SHIFT_HIGH 17
 
-#ifdef ARCH_NEEDS_SMALL_SLABS
+#if L1_CACHE_BYTES <= 64
 #define KMALLOC_EXTRAS 2
+#define KMALLOC_EXTRA
 #else
 #define KMALLOC_EXTRAS 0
 #endif
 
@@ -37,7 +34,7 @@
  * non DMA cache (DMA simply means memory for legacy I/O. The regular
  * caches can be used for devices that can DMA to all of memory).
  */
-extern struct slab_control kmalloc_caches[2][KMALLOC_NR_CACHES];
+extern struct slab_control kmalloc_caches[KMALLOC_NR_CACHES];
 
 /*
  * Sorry that the following has to be that ugly but GCC has trouble
@@ -45,15 +42,15 @@ extern struct slab_control kmalloc_cache
  */
 static inline int kmalloc_index(int size)
 {
-#ifdef ARCH_NEEDS_SMALL_SLABS
 	if (size <= 8) return 3;
 	if (size <= 16) return 4;
 	if (size <= 32) return 5;
 	if (size <= 64) return 6;
+#ifdef KMALLOC_EXTRA
 	if (size <= 96) return KMALLOC_SHIFT_HIGH + 1;
 #endif
 	if (size <= 128) return 7;
-#ifdef ARCH_NEEDS_SMALL_SLABS
+#ifdef KMALLOC_EXTRA
 	if (size <= 192) return KMALLOC_SHIFT_HIGH + 2;
 #endif
 	if (size <= 256) return 8;
@@ -66,9 +63,6 @@ static inline int kmalloc_index(int size
 	if (size <= 32 * 1024) return 15;
 	if (size <= 64 * 1024) return 16;
 	if (size <= 128 * 1024) return 17;
-	if (size <= 256 * 1024) return 18;
-	if (size <= 512 * 1024) return 19;
-	if (size <=1024 * 1024) return 20;
 	return -1;
 }
 
@@ -78,7 +72,7 @@ static inline int kmalloc_index(int size
  * This ought to end up with a global pointer to the right cache
  * in kmalloc_caches.
  */
-static inline struct slab_cache *kmalloc_slab(size_t size, gfp_t flags)
+static inline struct slab_cache *kmalloc_slab(size_t size)
 {
 	int index = kmalloc_index(size) - KMALLOC_SHIFT_LOW;
 
@@ -90,7 +84,7 @@ static inline struct slab_cache *kmalloc
 		extern void __kmalloc_size_too_large(void);
 		__kmalloc_size_too_large();
 	}
-	return &kmalloc_caches[!!(flags & __GFP_DMA)][index].sc;
+	return &kmalloc_caches[index].sc;
 }
 
 extern void *__kmalloc(size_t, gfp_t);
@@ -98,8 +92,8 @@ extern void *__kmalloc(size_t, gfp_t);
 
 static inline void *kmalloc(size_t size, gfp_t flags)
 {
-	if (__builtin_constant_p(size)) {
-		struct slab_cache *s = kmalloc_slab(size, flags);
+	if (__builtin_constant_p(size) && !(flags & __GFP_DMA)) {
+		struct slab_cache *s = kmalloc_slab(size);
 
 		return KMALLOC_ALLOCATOR.alloc(s, flags);
 	} else
@@ -110,8 +104,8 @@ static inline void *kmalloc(size_t size,
 extern void *__kmalloc_node(size_t, gfp_t, int);
 static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
-	if (__builtin_constant_p(size)) {
-		struct slab_cache *s = kmalloc_slab(size, flags);
+	if (__builtin_constant_p(size) && !(flags & __GFP_DMA)) {
+		struct slab_cache *s = kmalloc_slab(size);
 
 		return KMALLOC_ALLOCATOR.alloc_node(s, flags, node);
 	} else
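(Illustration only, not part of the patch: a userspace sketch of the size-to-cache mapping the header relies on. It mirrors the kmalloc_index()/kmalloc_slab() logic above under this patch's assumptions for a box with a 64 byte cache line, i.e. KMALLOC_SHIFT_LOW = 3, KMALLOC_SHIFT_HIGH = 17 and KMALLOC_EXTRA defined. The printf harness and the test sizes are made up, and the middle of the power-of-two chain is filled in from the obvious pattern since the hunk above only shows its tail. When size is a compile-time constant gcc folds the same if-chain, which is why the constant-size, non-DMA kmalloc() in the header reduces to a single KMALLOC_ALLOCATOR.alloc() on a cache pointer known at compile time.)

/* kmalloc-index-demo.c: illustration only, not part of the patch. */
#include <stdio.h>

#define KMALLOC_SHIFT_LOW	3
#define KMALLOC_SHIFT_HIGH	17
#define KMALLOC_EXTRA			/* assume L1_CACHE_BYTES <= 64 */

/* Same if-chain as kmalloc_index() in include/linux/kmalloc.h above. */
static int kmalloc_index(int size)
{
	if (size <= 8) return 3;
	if (size <= 16) return 4;
	if (size <= 32) return 5;
	if (size <= 64) return 6;
#ifdef KMALLOC_EXTRA
	if (size <= 96) return KMALLOC_SHIFT_HIGH + 1;
#endif
	if (size <= 128) return 7;
#ifdef KMALLOC_EXTRA
	if (size <= 192) return KMALLOC_SHIFT_HIGH + 2;
#endif
	if (size <= 256) return 8;
	if (size <= 512) return 9;
	if (size <= 1024) return 10;
	if (size <= 2 * 1024) return 11;
	if (size <= 4 * 1024) return 12;
	if (size <= 8 * 1024) return 13;
	if (size <= 16 * 1024) return 14;
	if (size <= 32 * 1024) return 15;
	if (size <= 64 * 1024) return 16;
	if (size <= 128 * 1024) return 17;
	return -1;
}

int main(void)
{
	/* Arbitrary request sizes, chosen only for the demonstration. */
	int sizes[] = { 6, 64, 100, 150, 200, 5000, 200000 };
	unsigned int i;

	for (i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++) {
		int idx = kmalloc_index(sizes[i]);

		if (idx < 0) {
			/* the header would end up in __kmalloc_size_too_large() here */
			printf("size %6d -> too large for the kmalloc array\n",
				sizes[i]);
			continue;
		}
		/* Same offset calculation as kmalloc_slab()/get_slab(). */
		printf("size %6d -> kmalloc_caches[%2d]\n",
			sizes[i], idx - KMALLOC_SHIFT_LOW);
	}
	return 0;
}

With KMALLOC_EXTRA defined, 100 and 150 byte requests land in the 128 and 192 byte caches (array slots 4 and 16) instead of being rounded up to the next power of two.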
Index: linux-2.6.18-rc4-mm2/mm/kmalloc.c
===================================================================
--- linux-2.6.18-rc4-mm2.orig/mm/kmalloc.c	2006-08-25 23:48:24.370643265 -0700
+++ linux-2.6.18-rc4-mm2/mm/kmalloc.c	2006-08-26 16:23:21.486358615 -0700
@@ -10,17 +10,95 @@
 #include
 #include
 
-struct slab_control kmalloc_caches[2][KMALLOC_NR_CACHES] __cacheline_aligned;
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN sizeof(void *)
+#endif
+
+struct slab_control kmalloc_caches[KMALLOC_NR_CACHES] __cacheline_aligned;
 EXPORT_SYMBOL(kmalloc_caches);
 
+static struct page_allocator *dma_allocator;
+
+static struct slab_cache *kmalloc_caches_dma[KMALLOC_NR_CACHES];
+
+/*
+ * Given a slab size find the correct order to use.
+ * We only support powers of two so there is really
+ * no need for anything special. Objects will always
+ * fit exactly into the slabs with no overhead.
+ */
+static __init int order(size_t size)
+{
+	if (size >= PAGE_SIZE)
+		/* One object per slab */
+		return fls(size -1) - PAGE_SHIFT;
+
+	/* Multiple objects per page which will fit neatly */
+	return 0;
+}
+
+static struct slab_cache *create_kmalloc_cache(struct slab_control *x,
+		const char *name,
+		const struct page_allocator *p,
+		int size)
+{
+	struct slab_cache s;
+	struct slab_cache *rs;
+
+	s.page_alloc = p;
+	s.slab_alloc = &KMALLOC_ALLOCATOR;
+	s.size = size;
+	s.align = ARCH_KMALLOC_MINALIGN;
+	s.offset = 0;
+	s.objsize = size;
+	s.inuse = size;
+	s.node = -1;
+	s.order = order(size);
+	s.name = "kmalloc";
+	rs = KMALLOC_ALLOCATOR.create(x, &s);
+	if (!rs)
+		panic("Creation of kmalloc slab %s size=%d failed.\n",
+			name, size);
+	register_slab(rs);
+	return rs;
+}
+
 static struct slab_cache *get_slab(size_t size, gfp_t flags)
 {
 	int index = kmalloc_index(size) - KMALLOC_SHIFT_LOW;
+	struct slab_cache *s;
+	struct slab_control *x;
+	size_t realsize;
 
 	BUG_ON(size < 0);
-	return &kmalloc_caches[!!(flags & __GFP_DMA)][index].sc;
+	if (!(flags & __GFP_DMA))
+		return &kmalloc_caches[index].sc;
+
+	s = kmalloc_caches_dma[index];
+	if (s)
+		return s;
+
+	/* Dynamically create dma cache */
+	x = kmalloc(sizeof(struct slab_control), flags & ~(__GFP_DMA));
+
+	if (!x)
+		panic("Unable to allocate memory for dma cache\n");
+#ifdef KMALLOC_EXTRA
+	if (index <= KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW)
+#endif
+		realsize = 1 << (index + KMALLOC_SHIFT_LOW);
+#ifdef KMALLOC_EXTRA
+	else if (index == KMALLOC_SHIFT_HIGH + 1 - KMALLOC_SHIFT_LOW)
+		realsize = 96;
+	else
+		realsize = 192;
+#endif
+
+	s = create_kmalloc_cache(x, "kmalloc_dma", dma_allocator, realsize);
+	kmalloc_caches_dma[index] = s;
+	return s;
 }
 
 void *__kmalloc(size_t size, gfp_t flags)
@@ -55,27 +133,12 @@ size_t ksize(const void *object)
 EXPORT_SYMBOL(ksize);
 
 /*
- * Given a slab size find the correct order to use.
- * We only support powers of two so there is really
- * no need for anything special. Objects will always
- * fit exactly into the slabs with no overhead.
- */
-static __init int order(size_t size)
-{
-	if (size >= PAGE_SIZE)
-		/* One object per slab */
-		return fls(size -1) - PAGE_SHIFT;
-
-	/* Multiple objects per page which will fit neatly */
-	return 0;
-}
-/*
  * Provide the kmalloc array as regular slab allocator for the
  * generic allocator framework.
  */
 struct slab_allocator kmalloc_slab_allocator;
 
-struct slab_cache *kmalloc_create(struct slab_control *x,
+static struct slab_cache *kmalloc_create(struct slab_control *x,
 			const struct slab_cache *s)
 {
 	struct slab_cache *km;
@@ -86,53 +149,24 @@ struct slab_cache *kmalloc_create(struct
 		|| s->offset)
 		return NULL;
 
-	km = &kmalloc_caches[0][index].sc;
+	km = &kmalloc_caches[index].sc;
 	BUG_ON(s->size > km->size);
 	return KMALLOC_ALLOCATOR.dup(km);
 }
 
-#ifndef ARCH_KMALLOC_MINALIGN
-#define ARCH_KMALLOC_MINALIGN sizeof(void *)
-#endif
-
-void __init create_kmalloc_cache(struct slab_control *x,
-		const char *name,
-		const struct page_allocator *p,
-		int size)
-{
-	struct slab_cache s;
-	struct slab_cache *rs;
-
-	s.page_alloc = p;
-	s.slab_alloc = &KMALLOC_ALLOCATOR;
-	s.size = size;
-	s.align = ARCH_KMALLOC_MINALIGN;
-	s.offset = 0;
-	s.objsize = size;
-	s.inuse = size;
-	s.node = -1;
-	s.order = order(size);
-	s.name = "kmalloc";
-	rs = KMALLOC_ALLOCATOR.create(x, &s);
-	if (!rs)
-		panic("Creation of kmalloc slab %s size=%d failed.\n",
-			name, size);
-	register_slab(rs);
-}
-
-void __init kmalloc_init_array(int dma, const char *name,
+static void __init kmalloc_init_array(const char *name,
 			const struct page_allocator *pa)
 {
 	int i;
 
 	for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
 		create_kmalloc_cache(
-			&kmalloc_caches[dma][i - KMALLOC_SHIFT_LOW],
+			&kmalloc_caches[i - KMALLOC_SHIFT_LOW],
 			name, pa, 1 << i);
 	}
-#ifdef ARCH_NEEDS_SMALL_SLABS
+#ifdef KMALLOC_EXTRA
 	/* Non-power of two caches */
-	create_kmalloc_cache(&kmalloc_caches[dma]
-			[KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1],
+	create_kmalloc_cache(
+			&kmalloc_caches[KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1],
 			name, pa, 96);
@@ -144,18 +178,12 @@ void __init kmalloc_init_array(int dma,
 
 void __init kmalloc_init(void)
 {
-	kmalloc_init_array(0, "kmalloc", &page_allocator);
+	kmalloc_init_array("kmalloc", &page_allocator);
 	/*
 	 * The above must be done first. Deriving a page allocator requires
 	 * a working (normal) kmalloc array.
 	 */
-
-	/*
-	 * On all my machines the DMA array is always empty. I wish we
-	 * could get rid of it.
-	 */
-	kmalloc_init_array(1, "kmalloc-DMA",
-		dmaify_page_allocator(&page_allocator));
+	dma_allocator = dmaify_page_allocator(&page_allocator);
 
 	/* And deal with the kmalloc_cache_allocator */
 	memcpy(&kmalloc_slab_allocator, &KMALLOC_ALLOCATOR,