Subject: [PATCH] binalloc: best-fit allocation with binning
From: Pekka Enberg

As suggested by Linus, to optimize memory use, I have implemented a best-fit
general purpose kernel memory allocator with binning. You can find the
original discussion here:

  http://lkml.org/lkml/2008/1/10/229

The results are as follows:

[ the minimum, maximum, and average are captured from 10 individual runs ]

                  Total     Free (kB)                             Used (kB)
                  (kB)      min      max      average    sd       min    max    average
SLOB              122248    117624   117676   117663.20  17.23    4572   4624   4584.8
SLUB (no debug)   122228    117180   117232   117204.00  13.97    4994   5048   5024.0
BINALLOC          122248    116824   116884   116860.00  15.80    5364   5424   5388.0

Thanks to Vegard Nossum for his help with debugging and testing the allocator
and to Johannes Weiner for fixing my bugs.

Cc: Vegard Nossum
Cc: Johannes Weiner
Signed-off-by: Pekka Enberg
---
 include/linux/binalloc_def.h |   34 +
 include/linux/slab.h         |    4
 init/Kconfig                 |    7
 lib/Kconfig.debug            |    7
 mm/Makefile                  |    1
 mm/binalloc.c                |  850 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 902 insertions(+), 1 deletion(-)

Index: slab-2.6/mm/binalloc.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ slab-2.6/mm/binalloc.c	2008-07-12 09:25:32.000000000 +0300
@@ -0,0 +1,850 @@
+/*
+ * A best-fit general purpose kernel memory allocator with binning.
+ *
+ * Copyright (C) 2008 Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ *
+ * I. Overview
+ *
+ * This is a best-fit general purpose kernel memory allocator with binning. We
+ * use the page allocator to allocate one big contiguous chunk that is split
+ * into smaller chunks for successive kmalloc() calls until we run out of
+ * space, after which a new page is allocated.
+ *
+ * This memory allocator bears close resemblance to the "Lea" memory allocator
+ * described in:
+ *
+ *   http://g.oswego.edu/dl/html/malloc.html
+ *
+ * II. Anatomy of a chunk
+ *
+ * To detect page boundaries, a zero-sized sentinel chunk is placed at the end
+ * of a page. Objects are aligned by padding bytes at the beginning of a chunk
+ * after the boundary tag. The padding is included in the allocated object size
+ * so that neighbor boundary tags can be calculated. Likewise, boundary tags
+ * are aligned by padding bytes at the end of a chunk when splitting the chunk
+ * so that the first non-allocated byte is properly aligned.
+ *
+ * III. Operation
+ *
+ * In the following example, we assume a 64-byte page, although on real
+ * machines the page size ranges from 4 KB to 64 KB. The size of the boundary
+ * tag in this example is 4 bytes and the mandatory alignment required by the
+ * machine is 4 bytes.
+ *
+ * Initially, the kernel memory allocator allocates one page from the page
+ * allocator and turns it into a contiguous chunk with a sentinel. You can see
+ * the boundary tag bytes marked as 'B' and the sentinel chunk boundary tag
+ * bytes as 'S' in the following diagram:
+ *
+ *   0       8       16      24      32      40      48      56
+ *   +-------+-------+-------+-------+-------+-------+-------+-------
+ *   BBBB                                                        SSSS
+ *
+ * Now let's assume a kmalloc() call comes in and wants to allocate 3 bytes.
+ * As we find a big enough chunk, we simply split it in two parts as follows:
+ *
+ *   0       8       16      24      32      40      48      56
+ *   +-------+-------+-------+-------+-------+-------+-------+-------
+ *   BBBBOOOPBBBB                                                SSSS
+ *
+ * As the pointer after the boundary tag of the chunk is already aligned to
+ * the mandatory alignment 4, the object marked as 'O' starts immediately
+ * after the boundary tag. However, to make sure the boundary tag of the next
+ * chunk is also aligned properly, one byte of padding, marked as 'P', is
+ * added to the end of the object.
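+ *
+ * (A worked example of the same padding arithmetic, using the helpers defined
+ * later in this file and the 4-byte boundary tag and 4-byte mandatory
+ * alignment assumed above:
+ *
+ *     object_padding(4)       == ALIGN(4, 4) - 4 == 0   no padding before 'O'
+ *     boundary_tag_padding(3) == ALIGN(3, 4) - 3 == 1   the single 'P' byte
+ *
+ * so calculate_padding(3, 4) adds exactly one byte to the 3-byte request.)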
+ *
+ * Now assume that a kmem_cache_alloc() call comes in to allocate 16 bytes of
+ * memory with mandatory alignment of 16. This time, the location after the
+ * boundary tag of the second chunk is not aligned to a 16-byte boundary, so
+ * we add padding in front of the object as illustrated in the following
+ * diagram:
+ *
+ *   0       8       16      24      32      40      48      56
+ *   +-------+-------+-------+-------+-------+-------+-------+-------
+ *   BBBBOOOPBBBBPPPPOOOOOOOOOOOOOOOOBBBB                        SSSS
+ *
+ * Note that the boundary tag is naturally aligned because the object size is
+ * already aligned to a 4-byte boundary, which is the mandatory alignment for
+ * this machine.
+ */
+
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+/*
+ * Minimum alignments
+ */
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long)
+#endif
+
+#ifndef ARCH_SLAB_MINALIGN
+#define ARCH_SLAB_MINALIGN __alignof__(unsigned long)
+#endif
+
+struct kmem_rcu {
+        struct rcu_head head;
+        size_t size;
+};
+
+#define KMEM_CHUNK_RESERVED 0xababababUL
+#define KMEM_CHUNK_COALESCED 0xbcbcbcbcUL
+#define KMEM_CHUNK_FREE 0xfefefefeUL
+
+/*
+ * Each chunk has a boundary tag at the beginning of the chunk. The tag
+ * contains the size of this chunk and the size of the previous chunk, which
+ * is required for chunk coalescing when an object is freed.
+ */
+struct kmem_boundary_tag {
+#ifdef CONFIG_BINALLOC_DEBUG
+        unsigned long magic;
+#endif
+        unsigned long prev_size;
+        unsigned long size;
+        unsigned long align;
+};
+
+#define CHUNK_RESERVED_SHIFT (sizeof(unsigned long)*8-1)
+#define CHUNK_SIZE_MASK ((1UL << (CHUNK_RESERVED_SHIFT-1)) - 1)
+
+struct kmem_chunk {
+        struct kmem_boundary_tag tag;
+        /* The following fields are defined only if the chunk is available */
+        struct rb_node bin_tree;
+};
+
+/*
+ * The code assumes that the end of a boundary tag is aligned to a power of
+ * two when calculating the alignment of an object in a chunk.
+ */
+#define BOUNDARY_TAG_SIZE roundup_pow_of_two(sizeof(struct kmem_boundary_tag))
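+
+/*
+ * For illustration (assuming 32-bit longs): a reserved 96-byte chunk stores
+ * 0x80000060 in tag.size -- bit 31 is the reserved flag set by
+ * chunk_mark_reserved() below and the low bits hold the length, so
+ * chunk_size() returns 96 after masking with CHUNK_SIZE_MASK.
+ */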
+
+struct kmem_bin {
+        struct rb_root freelist;
+};
+
+#define MIN_CHUNK_SIZE (BOUNDARY_TAG_SIZE*2)
+
+#define BIN_SHIFT 5
+#define NR_BINS ((PAGE_SIZE) / (1 << BIN_SHIFT))
+
+static struct kmem_bin bins[NR_BINS];
+static DEFINE_SPINLOCK(kmem_lock);
+
+/* Forward declarations needed by the debug helpers below. */
+static unsigned long chunk_size(struct kmem_chunk *chunk);
+static struct kmem_chunk *next_chunk(struct kmem_chunk *chunk);
+
+#ifdef CONFIG_BINALLOC_DEBUG
+
+#define DUMP_BYTES 128
+
+static void kmem_dump_chunk(struct kmem_chunk *chunk)
+{
+        print_hex_dump(KERN_ERR, "kmem: ", DUMP_PREFIX_ADDRESS, 16, 1,
+                chunk, DUMP_BYTES, 1);
+}
+
+static void kmem_verify_chunk(struct kmem_chunk *chunk, unsigned long magic)
+{
+        if (chunk->tag.magic != magic) {
+                printk(KERN_ERR "kmem: bad chunk magic: %lx, expected: %lx\n",
+                        chunk->tag.magic, magic);
+                kmem_dump_chunk(chunk);
+                BUG();
+        }
+}
+
+static void chunk_set_magic(struct kmem_chunk *chunk, unsigned long magic)
+{
+        chunk->tag.magic = magic;
+}
+
+static void kmem_verify_page(struct page *page)
+{
+        struct kmem_chunk *chunk = page_address(page);
+
+        do {
+                BUG_ON(chunk_size(chunk) < MIN_CHUNK_SIZE);
+                BUG_ON(virt_to_page(chunk) != page);
+                chunk = next_chunk(chunk);
+        } while (chunk_size(chunk) != 0);
+}
+#else
+
+static inline void
+kmem_verify_chunk(struct kmem_chunk *chunk, unsigned long magic)
+{
+}
+
+static inline void
+chunk_set_magic(struct kmem_chunk *chunk, unsigned long magic)
+{
+}
+
+static inline void kmem_verify_page(struct page *page)
+{
+}
+#endif /* CONFIG_BINALLOC_DEBUG */
+
+static bool chunk_is_available(struct kmem_chunk *chunk)
+{
+        return (chunk->tag.size >> CHUNK_RESERVED_SHIFT) == 0;
+}
+
+static void chunk_mark_reserved(struct kmem_chunk *chunk)
+{
+        chunk_set_magic(chunk, KMEM_CHUNK_RESERVED);
+        chunk->tag.size |= 1UL << CHUNK_RESERVED_SHIFT;
+}
+
+static void chunk_mark_available(struct kmem_chunk *chunk)
+{
+        chunk_set_magic(chunk, KMEM_CHUNK_FREE);
+        chunk->tag.size &= ~(1UL << CHUNK_RESERVED_SHIFT);
+}
+
+static unsigned long chunk_size(struct kmem_chunk *chunk)
+{
+        return chunk->tag.size & CHUNK_SIZE_MASK;
+}
+
+static void __chunk_set_size(struct kmem_chunk *chunk, unsigned long size)
+{
+        chunk->tag.size = (chunk->tag.size & ~CHUNK_SIZE_MASK) | size;
+}
+
+static void ptr_set_align(void *p, unsigned long align)
+{
+        unsigned long *buf = (void *)p - sizeof(unsigned long);
+
+        *buf = align;
+}
+
+static unsigned long ptr_get_align(const void *p)
+{
+        unsigned long *buf = (void *)p - sizeof(unsigned long);
+
+        return *buf;
+}
+
+/*
+ * Objects are aligned by padding empty space at the beginning of a chunk just
+ * after the boundary tag. Note: object padding can be zero.
+ */
+static unsigned long object_padding(unsigned long align)
+{
+        return ALIGN(BOUNDARY_TAG_SIZE, align) - BOUNDARY_TAG_SIZE;
+}
+
+/*
+ * Boundary tags are aligned by padding empty space at the end of a chunk just
+ * after the allocated object. Note: boundary tag padding can be zero.
+ */
+static unsigned long boundary_tag_padding(unsigned long size)
+{
+        return ALIGN(size, BOUNDARY_TAG_SIZE) - size;
+}
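+
+/*
+ * Note: kmem_alloc() below stashes the requested alignment in the unsigned
+ * long immediately preceding the returned pointer (via ptr_set_align()
+ * above), which is how kfree() and ksize() later recover the chunk header
+ * with ptr_to_chunk().
+ */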
+
+static struct kmem_chunk *rawptr_to_chunk(const void *p)
+{
+        void *q = (void *)p - BOUNDARY_TAG_SIZE;
+
+        return q;
+}
+
+static void *chunk_to_rawptr(struct kmem_chunk *chunk)
+{
+        return (void *)chunk + BOUNDARY_TAG_SIZE;
+}
+
+static struct kmem_chunk *ptr_to_chunk(const void *p, unsigned long align)
+{
+        unsigned long padding = object_padding(align);
+
+        return rawptr_to_chunk(p - padding);
+}
+
+static void *chunk_to_ptr(struct kmem_chunk *chunk, unsigned long align)
+{
+        unsigned long padding;
+        void *p;
+
+        p = chunk_to_rawptr(chunk);
+        padding = object_padding(align);
+        return p + padding;
+}
+
+static struct kmem_chunk *prev_chunk(struct kmem_chunk *chunk)
+{
+        void *p = rawptr_to_chunk((void *) chunk - chunk->tag.prev_size);
+
+        BUG_ON(!virt_addr_valid(p));
+
+        return p;
+}
+
+static struct kmem_chunk *next_chunk(struct kmem_chunk *chunk)
+{
+        return chunk_to_rawptr(chunk) + chunk_size(chunk);
+}
+
+static bool chunk_is_first(struct kmem_chunk *b)
+{
+        return b->tag.prev_size == 0;
+}
+
+static bool chunk_is_last(struct kmem_chunk *b)
+{
+        struct kmem_chunk *next = next_chunk(b);
+
+        return chunk_size(next) == 0;
+}
+
+static void chunk_set_size(struct kmem_chunk *chunk, unsigned long size)
+{
+        BUG_ON(size < MIN_CHUNK_SIZE);
+        BUG_ON(size > PAGE_SIZE);
+
+        __chunk_set_size(chunk, size);
+
+        if (!chunk_is_last(chunk)) {
+                struct kmem_chunk *next = next_chunk(chunk);
+
+                next->tag.prev_size = size;
+        }
+}
+
+#define ALLOC_MASK (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK)
+
+static void *kmem_alloc_large(size_t size, gfp_t gfpflags, int node)
+{
+        struct page *page;
+        int order;
+
+        order = get_order(size);
+        page = alloc_pages_node(node, gfpflags & ALLOC_MASK, order);
+        if (!page)
+                return NULL;
+        page->private = size;
+
+        return page_address(page);
+}
+
+static int lookup_bin_idx(unsigned long size)
+{
+        return (size-1) >> BIN_SHIFT;
+}
+
+static struct kmem_bin *lookup_bin(unsigned long size)
+{
+        int idx = lookup_bin_idx(size);
+
+        BUG_ON(idx > NR_BINS);
+
+        return &bins[idx];
+}
+
+static struct kmem_bin *chunk_get_bin(struct kmem_chunk *chunk)
+{
+        unsigned long size = chunk_size(chunk);
+
+        return lookup_bin(size);
+}
+
+static void __insert_to_freelist(struct kmem_bin *bin, struct kmem_chunk *chunk)
+{
+        struct rb_node **p = &bin->freelist.rb_node;
+        struct rb_node *parent = NULL;
+
+        while (*p) {
+                struct kmem_chunk *this;
+
+                parent = *p;
+                this = rb_entry(parent, struct kmem_chunk, bin_tree);
+
+                if (chunk_size(chunk) < chunk_size(this))
+                        p = &(*p)->rb_left;
+                else
+                        p = &(*p)->rb_right;
+        }
+        rb_link_node(&chunk->bin_tree, parent, p);
+        rb_insert_color(&chunk->bin_tree, &bin->freelist);
+}
+
+static void insert_to_freelist(struct kmem_chunk *chunk)
+{
+        struct kmem_bin *bin;
+
+        kmem_verify_chunk(chunk, KMEM_CHUNK_FREE);
+
+        bin = chunk_get_bin(chunk);
+        __insert_to_freelist(bin, chunk);
+}
+
+static void remove_from_freelist(struct kmem_chunk *chunk)
+{
+        struct kmem_bin *bin = chunk_get_bin(chunk);
+
+        rb_erase(&chunk->bin_tree, &bin->freelist);
+}
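+
+/*
+ * A worked example of the binning above (assuming 4 KB pages): BIN_SHIFT is
+ * 5, so each bin covers a 32-byte size range and NR_BINS is 128. A free
+ * 100-byte chunk therefore hangs off bin index (100 - 1) >> 5 == 3, the bin
+ * holding chunks of 97..128 bytes.
+ */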
+
+static struct kmem_chunk *chunk_page_alloc(gfp_t gfpflags)
+{
+        struct kmem_chunk *chunk, *sentinel;
+        struct page *page;
+
+        page = alloc_pages_node(-1, gfpflags & ALLOC_MASK, 0);
+        if (!page)
+                return NULL;
+
+        __SetPageSlab(page);
+
+        chunk = page_address(page);
+        chunk->tag.prev_size = 0;
+        __chunk_set_size(chunk, PAGE_SIZE - BOUNDARY_TAG_SIZE*2);
+        chunk_mark_available(chunk);
+
+        /*
+         * The sentinel chunk marks the end of a page and it's the only one
+         * that can have size zero.
+         */
+        sentinel = page_address(page) + PAGE_SIZE - BOUNDARY_TAG_SIZE;
+        sentinel->tag.prev_size = chunk_size(chunk);
+        __chunk_set_size(sentinel, 0);
+        chunk_mark_reserved(sentinel);
+
+        return chunk;
+}
+
+static void split_chunk(struct kmem_chunk *chunk, size_t new_size)
+{
+        struct kmem_chunk *upper_half;
+        size_t size, remaining;
+
+        BUG_ON(new_size < MIN_CHUNK_SIZE);
+        BUG_ON(new_size > PAGE_SIZE);
+
+        kmem_verify_chunk(chunk, KMEM_CHUNK_FREE);
+
+        size = chunk_size(chunk);
+        BUG_ON(size < new_size);
+
+        remaining = size - new_size;
+
+        /*
+         * Don't split if the remaining half would end up too small.
+         */
+        if (remaining < (BOUNDARY_TAG_SIZE + MIN_CHUNK_SIZE))
+                return;
+
+        chunk_set_size(chunk, new_size);
+
+        upper_half = next_chunk(chunk);
+        upper_half->tag.prev_size = chunk_size(chunk);
+        chunk_set_size(upper_half, remaining - BOUNDARY_TAG_SIZE);
+
+        chunk_mark_available(upper_half);
+        insert_to_freelist(upper_half);
+}
+
+static struct kmem_chunk *__kmem_alloc(struct kmem_bin *bin, size_t size)
+{
+        struct rb_node *n = bin->freelist.rb_node;
+        struct kmem_chunk *ret = NULL;
+
+        while (n) {
+                struct kmem_chunk *chunk = rb_entry(n, struct kmem_chunk, bin_tree);
+
+                if (chunk_size(chunk) < size) {
+                        /*
+                         * The chunk is not big enough.
+                         */
+                        n = n->rb_right;
+                } else {
+                        /*
+                         * Look up the smallest possible chunk that is big
+                         * enough for us but bail out early if we find a
+                         * perfect fit.
+                         */
+                        ret = chunk;
+                        if (chunk_size(chunk) == size)
+                                break;
+                        n = n->rb_left;
+                }
+        }
+        if (ret)
+                remove_from_freelist(ret);
+
+        return ret;
+}
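+
+/*
+ * Example: if a bin's freelist holds free chunks of 40, 48, and 96 bytes and
+ * the request needs 44 bytes, the search above rejects the 40-byte chunk as
+ * too small, remembers any chunk that fits, and keeps descending towards
+ * smaller sizes, so the 48-byte chunk -- the smallest one that still fits --
+ * is returned.
+ */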
+
+static unsigned long calculate_padding(size_t size, unsigned long align)
+{
+        unsigned long obj_padding, tag_padding;
+
+        obj_padding = object_padding(align);
+        tag_padding = boundary_tag_padding(size + obj_padding);
+
+        return obj_padding + tag_padding;
+}
+
+/*
+ * This is the heart of the kernel memory allocator.
+ */
+static void *
+kmem_alloc(size_t size, gfp_t gfpflags, unsigned long align)
+{
+        struct kmem_chunk *chunk;
+        unsigned long padding;
+        unsigned long flags;
+        void *p;
+        int i;
+
+        if (size < MIN_CHUNK_SIZE)
+                size = MIN_CHUNK_SIZE;
+
+        padding = calculate_padding(size, align);
+
+        spin_lock_irqsave(&kmem_lock, flags);
+        /*
+         * Look for available chunks from all bins starting from the smallest
+         * one that is big enough for the requested size.
+         */
+        for (i = lookup_bin_idx(size); i < NR_BINS; i++) {
+                struct kmem_bin *bin = &bins[i];
+
+                chunk = __kmem_alloc(bin, size + padding);
+
+                /*
+                 * If we found a suitable chunk, go allocate the object from
+                 * it; otherwise continue scanning.
+                 */
+                if (chunk)
+                        goto allocate_obj;
+        }
+
+        /*
+         * OK, we need more pages.
+         */
+        spin_unlock_irqrestore(&kmem_lock, flags);
+        chunk = chunk_page_alloc(gfpflags);
+        if (!chunk)
+                return NULL;
+        spin_lock_irqsave(&kmem_lock, flags);
+
+allocate_obj:
+        split_chunk(chunk, size + padding);
+        chunk_mark_reserved(chunk);
+        spin_unlock_irqrestore(&kmem_lock, flags);
+
+        p = chunk_to_ptr(chunk, align);
+        ptr_set_align(p, align);
+
+        BUG_ON(!IS_ALIGNED((unsigned long) p, align));
+        return p;
+}
+
+static void *
+kmem_alloc_node(size_t size, gfp_t gfpflags, unsigned long align, int node)
+{
+        void *p;
+
+        if (unlikely(!size))
+                return ZERO_SIZE_PTR;
+
+        if (size < PAGE_SIZE/2)
+                p = kmem_alloc(size, gfpflags, align);
+        else
+                p = kmem_alloc_large(size, gfpflags, node);
+
+        if ((gfpflags & __GFP_ZERO) && p)
+                memset(p, 0, size);
+
+        if (p && size < PAGE_SIZE/2)
+                kmem_verify_page(virt_to_page(p));
+
+        return p;
+}
+
+void *__kmalloc_node(size_t size, gfp_t gfpflags, int node)
+{
+        unsigned long align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+
+        return kmem_alloc_node(size, gfpflags, align, node);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *__kmalloc(size_t size, gfp_t gfpflags)
+{
+        unsigned long align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+
+        return kmem_alloc_node(size, gfpflags, align, -1);
+}
+EXPORT_SYMBOL(__kmalloc);
+
+static void
+coalesce_chunks(struct kmem_chunk *chunk, struct kmem_chunk *right_neighbor)
+{
+        unsigned long new_size;
+
+        kmem_verify_chunk(right_neighbor, KMEM_CHUNK_FREE);
+        kmem_verify_chunk(chunk, KMEM_CHUNK_FREE);
+
+        chunk_set_magic(right_neighbor, KMEM_CHUNK_COALESCED);
+
+        new_size = chunk_size(chunk) + chunk_size(right_neighbor);
+
+        /*
+         * The boundary tag of the right-side neighbor is coalesced into the
+         * chunk as well.
+         */
+        chunk_set_size(chunk, new_size + BOUNDARY_TAG_SIZE);
+}
+
+static void kmem_free_chunk(struct kmem_chunk *chunk, struct page *page)
+{
+        unsigned long flags;
+
+        spin_lock_irqsave(&kmem_lock, flags);
+
+        chunk_mark_available(chunk);
+
+        /*
+         * Coalesce the chunk with neighboring chunks that are not reserved.
+         */
+        if (likely(!chunk_is_first(chunk))) {
+                struct kmem_chunk *prev = prev_chunk(chunk);
+
+                if (chunk_is_available(prev)) {
+                        remove_from_freelist(prev);
+                        coalesce_chunks(prev, chunk);
+                        chunk = prev;
+                }
+        }
+        if (likely(!chunk_is_last(chunk))) {
+                struct kmem_chunk *next = next_chunk(chunk);
+
+                if (chunk_is_available(next)) {
+                        remove_from_freelist(next);
+                        coalesce_chunks(chunk, next);
+                }
+        }
+
+        /*
+         * If the chunk now covers a whole page, give it back to the page
+         * allocator; otherwise insert it into the freelist.
+         */
+        if (unlikely(chunk_size(chunk) == PAGE_SIZE-BOUNDARY_TAG_SIZE*2)) {
+                __ClearPageSlab(page);
+                __free_pages(page, 0);
+        } else
+                insert_to_freelist(chunk);
+
+        spin_unlock_irqrestore(&kmem_lock, flags);
+}
+
+static void kmem_free(const void *p, struct page *page)
+{
+        struct kmem_chunk *chunk;
+        unsigned long align;
+
+        kmem_verify_page(page);
+
+        align = ptr_get_align(p);
+        chunk = ptr_to_chunk(p, align);
+
+        kmem_free_chunk(chunk, page);
+}
+
+static void __kfree(const void *p)
+{
+        struct page *page = virt_to_page(p);
+
+        if (PageSlab(page))
+                kmem_free(p, page);
+        else
+                put_page(page);
+}
+
+void kfree(const void *p)
+{
+        if (unlikely(ZERO_OR_NULL_PTR(p)))
+                return;
+
+        __kfree(p);
+}
+EXPORT_SYMBOL(kfree);
+
+static size_t __ksize(const void *p)
+{
+        struct kmem_chunk *chunk;
+        unsigned long align;
+
+        align = ptr_get_align(p);
+        chunk = ptr_to_chunk(p, align);
+
+        /*
+         * No need for locking here: the size of a reserved chunk can never
+         * change.
+         */
+        return chunk_size(chunk) - object_padding(align);
+}
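+
+/*
+ * Note that the value reported here can exceed the originally requested
+ * size: for example, kmalloc(3) is rounded up to MIN_CHUNK_SIZE, so with a
+ * 16-byte boundary tag __ksize() reports at least 32 bytes for it.
+ */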
+
+size_t ksize(const void *p)
+{
+        struct page *page;
+
+        BUG_ON(!p);
+
+        if (unlikely(ZERO_OR_NULL_PTR(p)))
+                return 0;
+
+        page = virt_to_page(p);
+
+        if (PageSlab(page))
+                return __ksize(p);
+
+        return page->private;
+}
+EXPORT_SYMBOL(ksize);
+
+struct kmem_cache {
+        unsigned int size, align;
+        unsigned long gfpflags;
+        const char *name;
+        void (*ctor)(struct kmem_cache *, void *);
+};
+
+struct kmem_cache *
+kmem_cache_create(const char *name, size_t size, size_t align,
+                unsigned long gfpflags,
+                void (*ctor)(struct kmem_cache *, void *))
+{
+        struct kmem_cache *cache;
+
+        BUG_ON(size == 0);
+
+        cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+        if (cache) {
+                cache->size = size;
+                if (gfpflags & SLAB_DESTROY_BY_RCU) {
+                        /* leave room for rcu footer at the end of chunk */
+                        cache->size += sizeof(struct kmem_rcu);
+                }
+                cache->align = max(align, ARCH_SLAB_MINALIGN);
+                cache->gfpflags = gfpflags;
+                cache->name = name;
+                cache->ctor = ctor;
+        }
+        return cache;
+}
+EXPORT_SYMBOL(kmem_cache_create);
+
+void kmem_cache_destroy(struct kmem_cache *cache)
+{
+        kfree(cache);
+}
+EXPORT_SYMBOL(kmem_cache_destroy);
+
+void *
+kmem_cache_alloc_node(struct kmem_cache *cache, gfp_t gfpflags, int node)
+{
+        void *p;
+
+        p = kmem_alloc_node(cache->size, gfpflags, cache->align, node);
+        if (!p)
+                return NULL;
+
+        if (cache->ctor)
+                cache->ctor(cache, p);
+        return p;
+}
+EXPORT_SYMBOL(kmem_cache_alloc_node);
+
+static void kmem_rcu_free(struct rcu_head *head)
+{
+        struct kmem_rcu *kmem_rcu = (struct kmem_rcu *)head;
+        void *p = (void *)kmem_rcu - (kmem_rcu->size - sizeof(struct kmem_rcu));
+
+        __kfree(p);
+}
+
+void kmem_cache_free(struct kmem_cache *cache, void *p)
+{
+        if (unlikely(ZERO_OR_NULL_PTR(p)))
+                return;
+
+        if (unlikely(cache->gfpflags & SLAB_DESTROY_BY_RCU)) {
+                struct kmem_rcu *kmem_rcu;
+
+                kmem_rcu = p + (cache->size - sizeof(struct kmem_rcu));
+                INIT_RCU_HEAD(&kmem_rcu->head);
+                kmem_rcu->size = cache->size;
+                call_rcu(&kmem_rcu->head, kmem_rcu_free);
+        } else
+                __kfree(p);
+}
+EXPORT_SYMBOL(kmem_cache_free);
+
+unsigned int kmem_cache_size(struct kmem_cache *cache)
+{
+        return cache->size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
+const char *kmem_cache_name(struct kmem_cache *cache)
+{
+        return cache->name;
+}
+EXPORT_SYMBOL(kmem_cache_name);
+
+int kmem_cache_shrink(struct kmem_cache *cache)
+{
+        return 0;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
+int kmem_ptr_validate(struct kmem_cache *cache, const void *p)
+{
+        return 0;
+}
+
+static unsigned int kmem_ready __read_mostly;
+
+int slab_is_available(void)
+{
+        return kmem_ready;
+}
+
+static void kmem_init_bin(struct kmem_bin *bin)
+{
+        bin->freelist = RB_ROOT;
+}
+
+#define NR_ALLOCS 64
+#define ALLOC_SIZE 128
+
+static void kmem_selftest(void)
+{
+        void *objs[NR_ALLOCS];
+        int i;
+
+        for (i = 0; i < NR_ALLOCS; i++)
+                objs[i] = kmalloc(ALLOC_SIZE, GFP_KERNEL);
+
+        for (i = 0; i < NR_ALLOCS; i++)
+                kfree(objs[i]);
+}
+
+void __init kmem_cache_init(void)
+{
+        int i;
+
+        for (i = 0; i < NR_BINS; i++)
+                kmem_init_bin(&bins[i]);
+        kmem_ready = 1;
+
+        kmem_selftest();
+}
Index: slab-2.6/init/Kconfig
===================================================================
--- slab-2.6.orig/init/Kconfig	2008-07-12 09:24:46.000000000 +0300
+++ slab-2.6/init/Kconfig	2008-07-12 09:25:32.000000000 +0300
@@ -774,6 +774,13 @@
           allocator. SLOB is generally more space
           efficient but does not perform as well on large systems.
 
+config BINALLOC
+        depends on EMBEDDED
+        bool "BINALLOC"
+        help
+          A best-fit general-purpose kernel memory allocator with binning
+          that tries to be as memory efficient as possible.
+
 endchoice
 
 config PROFILING
Index: slab-2.6/mm/Makefile
===================================================================
--- slab-2.6.orig/mm/Makefile	2008-07-12 09:24:46.000000000 +0300
+++ slab-2.6/mm/Makefile	2008-07-12 09:25:32.000000000 +0300
@@ -27,6 +27,7 @@
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
+obj-$(CONFIG_BINALLOC) += binalloc.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
Index: slab-2.6/include/linux/slab.h
===================================================================
--- slab-2.6.orig/include/linux/slab.h	2008-07-12 09:24:46.000000000 +0300
+++ slab-2.6/include/linux/slab.h	2008-07-12 09:25:32.000000000 +0300
@@ -123,6 +123,8 @@
 #include <linux/slub_def.h>
 #elif defined(CONFIG_SLOB)
 #include <linux/slob_def.h>
+#elif defined(CONFIG_BINALLOC)
+#include <linux/binalloc_def.h>
 #else
 #include <linux/slab_def.h>
 #endif
@@ -185,7 +187,7 @@
         return __kmalloc(n * size, flags | __GFP_ZERO);
 }
 
-#if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
+#if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB) && !defined(CONFIG_BINALLOC)
 /**
  * kmalloc_node - allocate memory from a specific node
  * @size: how many bytes of memory are required.
Index: slab-2.6/include/linux/binalloc_def.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ slab-2.6/include/linux/binalloc_def.h	2008-07-12 09:25:32.000000000 +0300
@@ -0,0 +1,34 @@
+#ifndef __LINUX_BINALLOC_DEF_H
+#define __LINUX_BINALLOC_DEF_H
+
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
+
+static inline void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+        return kmem_cache_alloc_node(cachep, flags, -1);
+}
+
+void *__kmalloc_node(size_t size, gfp_t flags, int node);
+
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+        return __kmalloc_node(size, flags, node);
+}
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * kmalloc is the normal method of allocating memory
+ * in the kernel.
+ */
+static inline void *kmalloc(size_t size, gfp_t flags)
+{
+        return __kmalloc_node(size, flags, -1);
+}
+
+void *__kmalloc(size_t size, gfp_t flags);
+
+#endif /* __LINUX_BINALLOC_DEF_H */
+
Index: slab-2.6/lib/Kconfig.debug
===================================================================
--- slab-2.6.orig/lib/Kconfig.debug	2008-07-12 09:24:46.000000000 +0300
+++ slab-2.6/lib/Kconfig.debug	2008-07-12 09:25:32.000000000 +0300
@@ -263,6 +263,13 @@
           out which slabs are relevant to a particular load.
           Try running: slabinfo -DA
 
+config BINALLOC_DEBUG
+        bool "Debug binalloc memory allocations"
+        default n
+        help
+          Say Y here to have the memory allocator do sanity checks on
+          allocations and deallocations.
+
 config DEBUG_PREEMPT
         bool "Debug preemptible kernel"
         depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)