Index: linux-2.6.20-rc1/include/linux/mm_types.h =================================================================== --- linux-2.6.20-rc1.orig/include/linux/mm_types.h 2006-12-15 17:24:21.000000000 -0800 +++ linux-2.6.20-rc1/include/linux/mm_types.h 2006-12-15 17:24:31.000000000 -0800 @@ -19,10 +19,16 @@ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ atomic_t _count; /* Usage count, see below. */ - atomic_t _mapcount; /* Count of ptes mapped in mms, + union { + atomic_t _mapcount; /* Count of ptes mapped in mms, * to show when page is mapped * & limit reverse map searches. */ + struct { /* Slub */ + short unsigned int inuse; + short unsigned int offset; + }; + }; union { struct { unsigned long private; /* Mapping-private opaque data: @@ -43,8 +49,15 @@ #if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS spinlock_t ptl; #endif + struct { /* Slub */ + struct page *first_page; /* Compound pages */ + struct kmem_cache *slab; /* Pointer to slab */ + }; + }; + union { + pgoff_t index; /* Our offset within mapping. */ + void *freelist; /* Slabifier: free object */ }; - pgoff_t index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list * protected by zone->lru_lock ! */ Index: linux-2.6.20-rc1/include/linux/page-flags.h =================================================================== --- linux-2.6.20-rc1.orig/include/linux/page-flags.h 2006-12-15 17:24:21.000000000 -0800 +++ linux-2.6.20-rc1/include/linux/page-flags.h 2006-12-15 17:24:31.000000000 -0800 @@ -91,6 +91,7 @@ #define PG_nosave_free 18 /* Used for system suspend/resume */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_slabsingle 20 /* Slab contains a single object */ #if (BITS_PER_LONG > 32) /* @@ -152,6 +153,7 @@ #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) +#define __SetPageActive(page) __set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) #define __ClearPageActive(page) __clear_bit(PG_active, &(page)->flags) @@ -251,6 +253,10 @@ #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageSlabsingle(page) test_bit(PG_slabsingle, &(page)->flags) +#define __SetPageSlabsingle(page) __set_bit(PG_slabsingle, &(page)->flags) +#define __ClearPageSlabsingle(page) __clear_bit(PG_slabsingle, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); Index: linux-2.6.20-rc1/include/linux/slab.h =================================================================== --- linux-2.6.20-rc1.orig/include/linux/slab.h 2006-12-15 17:24:21.000000000 -0800 +++ linux-2.6.20-rc1/include/linux/slab.h 2006-12-15 17:24:31.000000000 -0800 @@ -50,6 +50,9 @@ void (*)(void *, struct kmem_cache *, unsigned long)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); +int kmem_cache_defrag(struct kmem_cache *s, + int (*move_object)(struct kmem_cache *, void *)); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *kmem_cache_zalloc(struct kmem_cache *, gfp_t); void kmem_cache_free(struct kmem_cache *, void *); @@ -73,7 +76,7 @@ void *__kmalloc(size_t, gfp_t); void *__kzalloc(size_t, gfp_t); void kfree(const void *); -unsigned int ksize(const void *); +size_t ksize(const void *); /** * kcalloc - allocate memory for an array. The memory is set to zero. 
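The mm_types.h hunk above overloads struct page fields that are idle while the page belongs to the allocator: _mapcount doubles as the (inuse, offset) pair, index doubles as the freelist head, and the lock/private union gains first_page and slab. A minimal sketch of how such an overloaded field would be read safely; slub_objects_in_use() is a name invented here for illustration, not part of the patch:

    /* Sketch only: (inuse, offset, freelist, slab) are meaningful solely
     * while PageSlab(page) is set; on any other page the same storage
     * still holds _mapcount and index. */
    static inline unsigned int slub_objects_in_use(struct page *page)
    {
            BUG_ON(!PageSlab(page));
            return page->inuse;
    }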
@@ -94,9 +97,16 @@ * the appropriate general cache at compile time. */ -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) +#ifdef CONFIG_SLUB +#include +#else #include +#endif /* !CONFIG_SLUB */ #else + +#error "fallback definitions not allowed!" + /* * Fallback definitions for an allocator not wanting to provide * its own optimized kmalloc definitions (like SLOB). Index: linux-2.6.20-rc1/init/Kconfig =================================================================== --- linux-2.6.20-rc1.orig/init/Kconfig 2006-12-15 17:24:21.000000000 -0800 +++ linux-2.6.20-rc1/init/Kconfig 2006-12-15 17:24:31.000000000 -0800 @@ -448,15 +448,6 @@ option replaces shmem and tmpfs with the much simpler ramfs code, which may be appropriate on small systems without swap. -config SLAB - default y - bool "Use full SLAB allocator" if EMBEDDED - help - Disabling this replaces the advanced SLAB allocator and - kmalloc support with the drastically simpler SLOB allocator. - SLOB is more space efficient but does not scale well and is - more susceptible to fragmentation. - config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EMBEDDED @@ -466,6 +457,46 @@ option allows the disabling of the VM event counters. /proc/vmstat will only show page counts. +choice + prompt "Choose SLAB allocator" + default SLAB + help + This options allows the use of alternate SLAB allocators. + +config SLAB + bool "Regular Allocator" + help + This is the regular slab allocator that is established and + known to work well. It organizes cache hot objects in + per cpu and per node queues. SLAB has advanced debugging + capability. It is advisable to chose this SLAB allocator. + +config SLUB + depends on EXPERIMENTAL + bool "SLUB (EXPERIMENTAL Allocator)" + help + Slub is a SLAB allocator that minimizes cache line usage + instead of managing object caches like the regular SLAB + allocator. The caching is done using blocks of objects + instead of queues of objects. The allocator supports + fine grained locks and realizes a less complex way of + supporting locality management for NUMA. + +config SLOB +# +# SLOB does not support SMP because SLAB_DESTROY_BY_RCU is not support. +# + depends on EMBEDDED && !SMP + bool "SLOB (Simple Allocator)" + help + Slob replaces the SLAB allocator with a drastically simpler + allocator. SLOB is more space efficient but does not scale + well (single lock for all operations) and is more susceptible + to fragmentation but it is a great choice to reduce + memory use and code size. 
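With the three allocators grouped in a Kconfig choice, exactly one of the symbols can be set at a time; a resulting .config fragment for the experimental allocator would look like the lines below (illustrative only):

    CONFIG_EXPERIMENTAL=y
    # CONFIG_SLAB is not set
    CONFIG_SLUB=y
    # CONFIG_SLOB is not set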
+ +endchoice + endmenu # General setup config RT_MUTEXES @@ -481,10 +512,6 @@ default 0 if BASE_FULL default 1 if !BASE_FULL -config SLOB - default !SLAB - bool - menu "Loadable module support" config MODULES Index: linux-2.6.20-rc1/mm/Makefile =================================================================== --- linux-2.6.20-rc1.orig/mm/Makefile 2006-12-15 17:24:20.000000000 -0800 +++ linux-2.6.20-rc1/mm/Makefile 2006-12-15 17:24:31.000000000 -0800 @@ -25,6 +25,7 @@ obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o obj-$(CONFIG_SLOB) += slob.o obj-$(CONFIG_SLAB) += slab.o +obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o Index: linux-2.6.20-rc1/mm/slub.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20-rc1/mm/slub.c 2006-12-15 19:14:29.000000000 -0800 @@ -0,0 +1,1857 @@ +/* + * SLUB: A slab allocator/ + * + * This allocator uses slabs of objects as caches and does not manage + * lists of cached objects like the regular Linux SLAB allocator. + * + * The allocator synchronizes using slab based locks and only + * uses a list lock to manage the pool of partial slabs per node. + * + * (C) 2006 Silicon Graphics Inc., Christoph Lameter + * + * TODO: + * - Performance tests. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SLUB_UNIMPLEMENTED (SLAB_DEBUG_FREE | SLAB_DEBUG_INITIAL | \ + SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* + * Enabling SLAB_DEBUG results in internal consistency checks + * being enabled. + */ +#define SLAB_DEBUG + +/* + * SLAB_DEBUG_KFREE enabled checking for double frees. In order to do this + * we have to look through the free lists of object in a slab on kfree which + * may slightly reduce performance. + */ +#define SLAB_DEBUG_KFREE + +/* + * SLAB_MERGE causes multiple slabs that have the same object size to be + * combined. This reduces the number of slabs significantly. This in turn + * increases the chance of finding a cache hot object. However, the slab + * statistics are only kept per slab and thus one will not be able to + * separate out the uses of various slabs. 
+ */ +//#ifndef SLAB_DEBUG +#define SLAB_MERGE +//#endif + +/* + * Set of flags that will prohibit slab merging + */ +#define SLAB_NO_MERGE (SLAB_RECLAIM_ACCOUNT | SLAB_DESTROY_BY_RCU | \ + SLAB_CACHE_DMA | SLAB_DEBUG_FREE | SLAB_DEBUG_INITIAL | \ + SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN sizeof(void *) +#endif + +#ifndef ARCH_SLAB_MINALIGN +#define ARCH_SLAB_MINALIGN sizeof(void *) +#endif + +/* Special boot time slab for NUMA bootstrap */ +#define CACHELINE_SLAB_NR kmalloc_index(L1_CACHE_BYTES) +#define CACHELINE_SLAB_SLAB &kmalloc_caches[CACHELINE_SLAB_NR - KMALLOC_SHIFT_LOW] + +#ifdef CONFIG_NUMA +#define CPU_SLAB(__s,__cpu) ((__s)->cpu[__cpu]) +#define NODE_INFO(__s, __node) ((__s)->node[__node]) +#else +#define CPU_SLAB(__s,__cpu) (&(__s)->cpu[__cpu]) +#define NODE_INFO(__s, __node) (&(__s)->node[__node]) +#endif + +/********************************************************************* + * Track slabs and provide the ability to run operations on them + *********************************************************************/ + +static DECLARE_RWSEM(slabstat_sem); + +LIST_HEAD(slab_caches); + +void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) +{ + struct list_head *h; + + down_read(&slabstat_sem); + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + func(s, cpu); + } + up_read(&slabstat_sem); +} + +void register_slab(struct kmem_cache *s) +{ + down_write(&slabstat_sem); + list_add(&s->list, &slab_caches); + up_write(&slabstat_sem); +} + +void unregister_slab(struct kmem_cache *s) +{ + down_write(&slabstat_sem); + list_add(&s->list, &slab_caches); + up_write(&slabstat_sem); +} + +/******************************************************************** + * Core slab cache functions + *******************************************************************/ + +/* + * For a compound page the first page keeps the slab state. + * + * Lock order: + * 1. slab_lock(page) + * 2. slab->list_lock + * + * SLUB assigns one cpu slab for allocation to each processor. + * Allocations only occur from these cpu slabs. + * + * If a slab is active then a workqueue thread checks every few seconds + * seconds if the cpu slab is still in use. The cpu slab is pushed back + * to the list if inactive [only needed for SMP]. + * + * Slabs with free and used objects are kept on a partial list. + * There is no list for full slabs. If an object in a full slab is + * freed then the slab will show up again on the partial lists. + * Otherwise there is no need to track full slabs. + * + * Slabs are freed when they become empty. Teardown and setup is + * minimal so we rely on the page allocators per cpu caches for + * fast frees and allocations. 
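The lock order stated above (per-slab lock before list_lock) is the reason the partial-list scan later in this file uses slab_trylock() while list_lock is already held: a blocking slab_lock() there could deadlock against putback_slab()/add_partial(), which take list_lock with the slab lock held. A condensed sketch of the pattern, mirroring get_partial_node() and lock_and_del_slab(); take_one_partial() is an invented name:

    /* Sketch only: take a slab off the partial list without inverting the
     * documented lock order.  Returns with the slab lock held. */
    static struct page *take_one_partial(struct node_slab *n)
    {
            struct page *page;

            spin_lock(&n->list_lock);
            list_for_each_entry(page, &n->partial, lru) {
                    if (!slab_trylock(page))   /* never block under list_lock */
                            continue;
                    list_del(&page->lru);
                    n->nr_partial--;
                    spin_unlock(&n->list_lock);
                    return page;
            }
            spin_unlock(&n->list_lock);
            return NULL;
    }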
+ */ + +#ifdef SLAB_DEBUG +void check_free_chain(struct kmem_cache *, struct page *); +#else +void check_free_chain(struct kmem_cache *, struct page *) {} +#endif + +/* + * Locking for each individual slab using the pagelock + */ +static __always_inline void slab_lock(struct page *page) +{ +#ifdef CONFIG_SMP + bit_spin_lock(PG_locked, &page->flags); +#endif + check_free_chain(page->slab, page); +} + +static __always_inline void slab_unlock(struct page *page) +{ + check_free_chain(page->slab, page); +#ifdef CONFIG_SMP + bit_spin_unlock(PG_locked, &page->flags); +#endif +} + +static __always_inline int slab_trylock(struct page *page) +{ +#ifdef CONFIG_SMP + return bit_spin_trylock(PG_locked, &page->flags); +#else + return 1; +#endif +} + +/* + * Management of partially allocated slabs + */ +static void __always_inline add_partial(struct kmem_cache *s, struct page *page) +{ + struct node_slab *n = NODE_INFO(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} + +static void __always_inline remove_partial(struct kmem_cache *s, + struct page *page) +{ + struct node_slab *n = NODE_INFO(s, page_to_nid(page)); + + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} + +/* + * Lock page and remove it from the partial list + * + * Must hold list_lock + */ +static __always_inline int lock_and_del_slab(struct node_slab *n, + struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + return 1; + } + return 0; +} + +/* + * Try to get a partial slab from the indicated node + */ +static struct page *get_partial_node(struct node_slab *n) +{ + struct page *page; + + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab then get_partials() will return NULL. + */ + if (!n->nr_partial) + return NULL; + + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_del_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; +} + +struct page *get_any_partial(struct kmem_cache *s, int node, gfp_t flags) +{ +#ifdef CONFIG_NUMA + struct zonelist *zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + struct zone **z; + struct page *page; + int nid; + + /* + * Look through allowed nodes for objects available + * from existing per node queues. + */ + for (z = zonelist->zones; *z; z++) { + nid = zone_to_nid(*z); + struct node_slab *n = NODE_INFO(s, nid); + + if (cpuset_zone_allowed_hardwall(*z, flags) && + *n) { + page = get_partial_node(n); + if (page) + return page; + } + } +#endif + return NULL; +} + +/* + * Get a partial page, lock it and return it. + */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int searchnode = (node == -1) ? 
numa_node_id() : node; + + page = get_partial_node(NODE_INFO(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + /* NUMA Fallback */ + return get_any_partial(s, node, flags); +} + +/* + * Debugging checks + */ +static void check_slab(struct page *page) +{ +#ifdef SLAB_DEBUG + if (!PageSlab(page)) { + printk(KERN_CRIT "Not a valid slab page @%p flags=%lx" + " mapping=%p count=%d \n", + page, page->flags, page->mapping, page_count(page)); + BUG(); + } +#endif +} + +static int check_valid_pointer(struct kmem_cache *s, struct page *page, + void *object, void *origin) +{ +#ifdef SLAB_DEBUG + void *base = page_address(page); + + if (object < base || object >= base + s->objects * s->size) { + printk(KERN_CRIT "slab %s size %d: pointer %p->%p\nnot in" + " range (%p-%p) in page %p\n", s->name, s->size, + origin, object, base, base + s->objects * s->size, + page); + return 0; + } + + if ((object - base) % s->size) { + printk(KERN_CRIT "slab %s size %d: pointer %p->%p\n" + "does not properly point" + "to an object in page %p\n", + s->name, s->size, origin, object, page); + return 0; + } +#endif + return 1; +} + +/* + * Determine if a certain object on a page is on the freelist and + * therefore free. Must hold the slab lock for cpu slabs to + * guarantee that the chains are consistent. + */ +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +{ + int nr = 0; + void **object = page->freelist; + void *origin = &page->lru; + + if (PageSlabsingle(page)) + return 0; + + check_slab(page); + + while (object && nr <= s->objects) { + if (object == search) + return 1; + if (!check_valid_pointer(s, page, object, origin)) + goto try_recover; + origin = object; + object = object[s->offset]; + nr++; + } + + if (page->inuse != s->objects - nr) { + printk(KERN_CRIT "slab %s: page %p wrong object count." + " counter is %d but counted were %d\n", + s->name, page, page->inuse, + s->objects - nr); +try_recover: + printk(KERN_CRIT "****** Trying to continue by marking " + "all objects in the slab used (memory leak!)\n"); + page->inuse = s->objects; + page->freelist = NULL; + } + return 0; +} + +#ifdef SLAB_DEBUG +void check_free_chain(struct kmem_cache *s, struct page *page) +{ + on_freelist(s, page, NULL); +} +#endif + +static void __free_slab(struct kmem_cache *s, struct page *page) +{ + int pages = 1 << s->order; + + if (s->dtor) { + void *start = page_address(page); + void *end = start + (pages << PAGE_SHIFT); + void *p; + + for (p = start; p <= end - s->size; p += s->size) + s->dtor(p, s, 0); + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + - pages); + + page->mapping = NULL; + reset_page_mapcount(page); + __ClearPageSlab(page); + __ClearPageSlabsingle(page); + + __free_pages(page, s->order); +} + +static void rcu_free_slab(struct rcu_head *h) +{ + struct page *page; + + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); +} + + +static void discard_slab(struct kmem_cache *s, struct page *page) +{ + atomic_long_dec(&NODE_INFO(s, page_to_nid(page))->nr_slabs); + + if (s->flags & SLAB_DESTROY_BY_RCU) { + struct rcu_head *head = (void *)&page->lru; + + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); +} + +/* + * Allocate a new slab and prepare an empty freelist and the basic struct + * page settings. 
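As a concrete illustration of what new_slab() below builds (the numbers are an example, not taken from the patch): with order 0, a 4096-byte page and 64-byte objects the freelist is threaded through the free objects themselves.

    /*
     * Example state after new_slab() for s->size = 64, s->offset = 0,
     * PAGE_SIZE = 4096, hence s->objects = 64:
     *
     *   page->freelist = start
     *   *(void **)(start +    0) = start +   64
     *   *(void **)(start +   64) = start +  128
     *   ...
     *   *(void **)(start + 4032) = NULL
     *   page->inuse = 0
     */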
+ */ +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + struct page *page; + int pages = 1 << s->order; + void *start; + void *end; + struct node_slab *n; + + if (s->order) + flags |= __GFP_COMP; + + if (flags & __GFP_NO_GROW) + return NULL; + + if (flags & __GFP_WAIT) + local_irq_enable(); + + if (s->flags & __GFP_DMA) + flags |= GFP_DMA; + + if (node == -1) + page = alloc_pages(flags & GFP_LEVEL_MASK, s->order); + else + page = alloc_pages_node(node, flags & GFP_LEVEL_MASK, s->order); + + if (flags & __GFP_WAIT) + local_irq_disable(); + + if (!page) + return NULL; + + n = NODE_INFO(s, page_to_nid(page)); + if (n) + atomic_long_inc(&n->nr_slabs); + + __mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + pages); + + page->offset = s->offset; + page->slab = (struct kmem_cache *)s; + __SetPageSlab(page); + + start = page_address(page); + end = start + s->objects * s->size; + + if (s->ctor) { + void *p; + int mode = SLAB_CTOR_CONSTRUCTOR; + + if (!(flags & __GFP_WAIT)) + mode |= SLAB_CTOR_ATOMIC; + + for (p = start; p <= end - s->size; p += s->size) + s->ctor(p, s, mode); + } + + /* Initialize freelist */ + if (s->objects > 1) { + void **last = start; + void *p = start + s->size; + + while (p < end) { + last[s->offset] = p; + last = p; + p += s->size; + } + last[s->offset] = NULL; + page->freelist = start; + page->inuse = 0; + } else + __SetPageSlabsingle(page); + + return page; +} + +/* + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. + */ +static void __always_inline putback_slab(struct kmem_cache *s, struct page *page) +{ + if (page->inuse) { + if (page->inuse < s->objects) + add_partial(s, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } +} + +/* + * Remove the currently cpu slab + */ +static void deactivate_slab(struct cpu_slab *a) +{ + struct page *page = a->page; + struct kmem_cache *s = a->slab; + + if (a->nr_free) { + if (unlikely(page->freelist)) { + /* + * Deal with the rare case where we have two + * freelists. + * + * Merge the two freelists. The freelist in the + * cpu slab comes first. + */ + void **freelist = page->freelist; + void **p; + + page->freelist = a->freelist; + + for (p = a->freelist; p[s->offset]; p = p[s->offset]) + page->inuse--; + + p[s->offset] = freelist; + + } else { + page->freelist = a->freelist; + page->inuse -= a->nr_free; + } + } + a->page = NULL; + a->referenced = 0; + a->nr_free = 0; + a->freelist = NULL; + __ClearPageActive(page); + + putback_slab(a->slab, page); +} + +/* + * Unconditionally flush any cpu slabs back to partial lists. + * + * Called from IPI handler with interrupts disabled. + */ +static void flush_cpu(void *d) +{ + struct kmem_cache *s = d; + struct cpu_slab *a = CPU_SLAB(s, smp_processor_id()); + + if (likely(a->page)) { + slab_lock(a->page); + deactivate_slab(a); +#ifdef CONFIG_SMP + a->flush_active = 0; +#endif + } +} + +#ifdef CONFIG_SMP +/* + * Check for a cpu slab and if it has not + * been references flush it back to the partial list. + * + * Called from kevent workqueue. 
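The check_flush_cpu() handler below implements a simple two-period aging scheme for the cpu slab; roughly the following timeline, with the 2*HZ period taken from the scheduling calls in this file:

    /*
     * Illustrative aging of a cpu slab:
     *
     *   t = 0   kmem_cache_alloc() sets a->referenced = 1 and, if no flush
     *           is pending, arms the delayed work
     *   t = 2s  check_flush_cpu(): referenced was set -> clear it, re-arm
     *   t = 4s  check_flush_cpu(): still clear -> deactivate_slab(),
     *           flush_active = 0
     *
     * Any allocation in between restarts the cycle by setting referenced.
     */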
+ */ +void check_flush_cpu(struct work_struct *w) +{ + struct cpu_slab *a = container_of(w, struct cpu_slab, flush.work); + + if (!a->page) + return; + + local_irq_disable(); + if (a->referenced) { + a->referenced = 0; + a->flush_active = 1; + schedule_delayed_work(&a->flush, 2 * HZ); + } else { + slab_lock(a->page); + deactivate_slab(a); + a->flush_active = 0; + } + local_irq_enable(); +} +#endif + +static void drain_all(struct kmem_cache *s) +{ + on_each_cpu(flush_cpu, s , 1, 1); +} + +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +{ + struct cpu_slab *a; + void **object; + unsigned long flags; + + local_irq_save(flags); + a = CPU_SLAB(s, smp_processor_id()); + if (unlikely(!a->page)) + goto new_slab; + + if (likely(a->nr_free)) + goto get_object; + + slab_lock(a->page); + if (likely(a->page->freelist)) + goto get_freelist; + + deactivate_slab(a); + +new_slab: + a->page = get_partial(s, gfpflags, numa_node_id()); + if (unlikely(!a->page)) { + struct page *page; + + page = new_slab(s, gfpflags, -1); + + if (!page) { + object = NULL; + goto out; + } + + /* + * There is no point in putting single object slabs + * on an active list. + */ + if (unlikely(s->objects == 1)) { + object = page_address(page); + goto out; + } + + /* + * We may have reenabled interrupts during the allocation + * Verify the state of the slab. + */ + a = CPU_SLAB(s, smp_processor_id()); + if (a->page) + /* + * Someone else already allocated a page. Drop the + * new one. + */ + discard_slab(s, page); + else + a->page = page; + + slab_lock(page); + } + + __SetPageActive(a->page); + +get_freelist: + a->freelist = a->page->freelist; + a->page->freelist = NULL; + a->nr_free = s->objects - a->page->inuse; + a->page->inuse += a->nr_free; + slab_unlock(a->page); + +get_object: + /* Fastpath */ + object = a->freelist; + a->nr_free--; + a->referenced = 1; + a->freelist = object[a->page->offset]; + +#ifdef CONFIG_SMP + if (unlikely(!a->flush_active && keventd_up())) { + a->flush_active = 1; + schedule_delayed_work(&a->flush, 2 * HZ); + } +#endif +out: + local_irq_restore(flags); + return object; +} +EXPORT_SYMBOL(kmem_cache_alloc); + +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +{ + void **object; + struct page *page; + unsigned long flags; + + local_irq_save(flags); + page = get_partial(s, gfpflags, node); + if (unlikely(!page)) { + struct page *page; + + page = new_slab(s, gfpflags, node); + + if (!page) { + object = NULL; + goto out; + } + + /* + * There is no point in putting single object slabs + * on an active list. + */ + if (unlikely(s->objects == 1)) { + object = page_address(page); + goto out; + } + slab_lock(page); + } + object = page->freelist; + page->freelist = object[page->offset]; + page->inuse++; + putback_slab(s, page); +out: + local_irq_restore(flags); + return object; +} +EXPORT_SYMBOL(kmem_cache_alloc_node); + +/* + * Bootstrap function to allow allocation without having cpu slabs + * and per node structures. 
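Summarizing the three paths through kmem_cache_alloc() above (a description of the existing code, not new behaviour):

    /*
     * kmem_cache_alloc() paths, fastest first:
     *
     *   1. a->nr_free > 0           pop an object off a->freelist, no locking
     *   2. a->page->freelist set    slab_lock(), move the whole per-slab
     *                               freelist into the cpu slab and mark all
     *                               of its objects in use
     *   3. neither                  deactivate_slab(), then get_partial() or,
     *                               failing that, new_slab()
     */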
+ */ +static void * __init __early_cacheline_alloc(int node, struct page **pp) +{ + struct kmem_cache *s = CACHELINE_SLAB_SLAB; + struct page *page = *pp; + void **object; + + if (!page) { + page = new_slab(s, GFP_KERNEL, node); + *pp = page; + BUG_ON(!page); + slab_lock(page); + } + object = page->freelist; + page->freelist = object[s ->offset]; + page->inuse++; + return object; +} + +static void * __init early_cacheline_alloc(int node) +{ + struct kmem_cache *s = CACHELINE_SLAB_SLAB; + struct page *page; + void *object; + + page = get_partial_node(NODE_INFO(s, node)); + object = __early_cacheline_alloc(node, &page); + putback_slab(s, page); + return object; +} +#endif + +void kmem_cache_free(struct kmem_cache *s, void *x) +{ + struct page * page; + void *prior; + void **object = (void *)x; + unsigned long flags; + struct cpu_slab *a; + + if (!object) + return; + + page = virt_to_page(x); + if (unlikely(PageCompound(page))) + page = page->first_page; + + if (!s) + s = (void *)page->slab; + +#ifdef SLAB_DEBUG + if (unlikely(s != (void *)page->slab)) + goto slab_mismatch; + + if (unlikely(!check_valid_pointer(s, page, object, NULL))) + goto dumpret; +#endif + + local_irq_save(flags); + +#ifdef SLAB_DEBUG_KFREE + slab_lock(page); + if (on_freelist(s, page, object)) + goto double_free; + slab_unlock(page); +#endif + + a = CPU_SLAB(s, smp_processor_id()); + if (a->page == page) { + void **object = x; + + a->nr_free++; + object[s->offset] = a->freelist; + a->freelist = object; + goto out; + } + + if (unlikely(PageSlabsingle(page))) { + /* Slab must be emptyt */ + discard_slab(s, page); + goto out; + } + + slab_lock(page); + + prior = object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; + + if (likely(PageActive(page) || (page->inuse && prior))) + goto out_unlock; + + if (!prior) { + /* + * Page was fully used before. It will have one free + * object now. So move to the partial list. + */ + add_partial(s, page); +out_unlock: + slab_unlock(page); + goto out; + } + + /* + * Slab is empty. + */ + remove_partial(s, page); + slab_unlock(page); + discard_slab(s, page); +out: + local_irq_restore(flags); + return; + +#ifdef SLAB_DEBUG_KFREE +double_free: + printk(KERN_CRIT "slab_free %s: object %p already free.\n", + s->name, object); + dump_stack(); + goto out_unlock; +#endif + +#ifdef SLAB_DEBUG +slab_mismatch: + if (!PageSlab(page)) { + printk(KERN_CRIT "slab_free %s size %d: attempt to free " + "object(%p) outside of slab.\n", + s->name, s->size, object); + goto dumpret; + } + + if (!page->slab) { + printk(KERN_CRIT + "slab_free : no slab(NULL) for object %p.\n", + object); + goto dumpret; + } + + printk(KERN_CRIT "slab_free %s(%d): object at %p" + " belongs to slab %s(%d)\n", + s->name, s->size, object, + page->slab->name, page->slab->size); + +dumpret: + dump_stack(); + printk(KERN_CRIT "***** Trying to continue by not " + "freeing object.\n"); +#endif +} +EXPORT_SYMBOL(kmem_cache_free); + +/* Figure out on which slab object the object resides */ +static __always_inline struct page *get_object_page(const void *x) +{ + struct page * page = virt_to_page(x); + + if (unlikely(PageCompound(page))) + page = page->first_page; + + if (!PageSlab(page)) + return NULL; + + return page; +} + +/* + * slab_create produces objects aligned at size and the first object + * is placed at offset 0 in the slab (We have no metainformation on the + * slab, all slabs are in essence off slab). + * + * In order to get the desired alignment one just needs to align the + * size. 
+ * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and therefore may influence + * locking overhead. + * + * The offset is used to relocate the free list link in each object. It is + * therefore possible to move the free list link behind the object. This + * is necessary for RCU to work properly and also useful for debugging. + * + * However no freelists are necessary if there is only one element per + * slab. + */ + +/* + * Mininum order of slab pages. This influences locking overhead and slab + * fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slab_min_order = 0; + +static int __init setup_slab_min_order(char *str) +{ + get_option (&str, &slab_min_order); + return 1; +} + +__setup("slab_min_order=", setup_slab_min_order); + +static int calculate_order(int size) +{ + int order; + int rem; + + if ((size & (size -1)) == 0) { + /* + * We can use the page allocator if the requested size + * is compatible with the page sizes supported. + */ + int order = fls(size) -1 - PAGE_SHIFT; + + if (order >= 0) + return order; + } + + for (order = max(slab_min_order, fls(size - 1) - PAGE_SHIFT); + order < MAX_ORDER; order++) { + unsigned long slab_size = PAGE_SIZE << order; + + if (slab_size < size) + continue; + + rem = slab_size % size; + + if (rem * 8 <= PAGE_SIZE << order) + break; + + } + if (order >= MAX_ORDER) + return -E2BIG; + return order; +} + +/* + * We can actually operate slabs any time after the page allocator is up. + * slab_is_available() merely means that the kmalloc array is available. + * + * However, be aware that deriving allocators depends on kmalloc being + * functional. 
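A worked example of the heuristic in calculate_order() above, assuming PAGE_SHIFT = 12 and the default slab_min_order of 0 (the size is made up for illustration):

    /*
     * calculate_order(1500):
     *
     *   1500 is not a power of two, so the exact-fit fast path is skipped.
     *   order 0: slab 4096, 4096 % 1500 = 1096, 1096 * 8 > 4096  -> keep going
     *   order 1: slab 8192, 8192 % 1500 =  692,  692 * 8 <= 8192 -> accept
     *
     * Result: order 1, i.e. five 1500-byte objects per 8 KiB slab with 692
     * bytes of waste, which is below the 1/8-of-the-slab limit the test
     * enforces.
     */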
+ */ +static enum { DOWN, PARTIAL, UP } slab_state = DOWN; + +int slab_is_available(void) +{ + return slab_state == UP; +} + +static void alloc_cpu(struct kmem_cache *s, int cpu) +{ + struct cpu_slab *a; + +#ifdef CONFIG_NUMA + int node = cpu_to_node(cpu); + + if (slab_state == DOWN) + a = early_cacheline_alloc(node); + else + a = kmem_cache_alloc_node(CACHELINE_SLAB_SLAB, + GFP_KERNEL, node); + BUG_ON(!a); + + s->cpu[cpu] = a; +#else + a = CPU_SLAB(s, cpu); +#endif +#ifdef CONFIG_SMP + a->flush_active = 0; + INIT_DELAYED_WORK(&a->flush, check_flush_cpu); +#endif + a->page = NULL; + a->slab = s; + a->referenced = 0; +} + +static void alloc_node(struct kmem_cache *s, int node) +{ + struct node_slab *n; + struct page *page = NULL; + +#ifdef CONFIG_NUMA + if (slab_state == DOWN) { + page = new_slab(s, GFP_KERNEL, node); + BUG_ON(!page); + slab_lock(page); + n = __early_cacheline_alloc(node, &page); + } else + n = kmem_cache_alloc_node(CACHELINE_SLAB_SLAB, + GFP_KERNEL, node); + + BUG_ON(!n); + s->node[node] = n; +#else + n = s->node; +#endif + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + if (page) { + putback_slab(s, page); + atomic_long_set(&n->nr_slabs, 1); + } else + atomic_long_set(&n->nr_slabs, 0); +} + +int kmem_cache_open(struct kmem_cache *s, + const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + int cpu; + int node; + + BUG_ON(flags & SLUB_UNIMPLEMENTED); + memset(s, 0, sizeof(struct kmem_cache)); + atomic_set(&s->refcount, 1); + + s->name = name; + s->ctor = ctor; + s->dtor = dtor; + s->objsize = size; + s->flags = flags; + + /* + * Here is the place to add other management type information + * to the end of the object F.e. debug info + */ + size = ALIGN(size, sizeof(void *)); + s->inuse = size; + + if (size * 2 <= (PAGE_SIZE << calculate_order(size)) && + ((flags & SLAB_DESTROY_BY_RCU) || ctor || dtor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor. + * + * We never need a free pointer if each slab only has + * a single object. 
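The relocation described here puts the free pointer one aligned word behind the payload, and every freelist walk in this file reaches it by indexing a void ** with s->offset (or page->offset). A sketch of the layout and the access; get_free_pointer() is a name made up for illustration:

    /*
     * Object layout when the free pointer is relocated (RCU, ctor or dtor):
     *
     *   object + 0 .. objsize - 1          payload, never overwritten on free
     *   object + offset * sizeof(void *)   pointer to the next free object
     *
     * Without relocation offset is 0 and the pointer reuses the first word.
     */
    static inline void *get_free_pointer(struct kmem_cache *s, void *object)
    {
            return ((void **)object)[s->offset];
    }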
+ */ + s->offset = size / sizeof(void *); + size += sizeof(void *); + } + + align = max(ARCH_SLAB_MINALIGN, ALIGN(align, sizeof(void *))); + if (flags & (SLAB_MUST_HWCACHE_ALIGN|SLAB_HWCACHE_ALIGN)) + align = L1_CACHE_BYTES; + + size = ALIGN(size, align); + s->size = size; + + s->order = calculate_order(size); + if (s->order < 0) + goto error; + + s->objects = (PAGE_SIZE << s->order) / size; + BUG_ON(s->objects > 65535); + if (!s->objects) + goto error; + + for_each_online_node(node) + alloc_node(s, node); + + for_each_online_cpu(cpu) + alloc_cpu(s, cpu); + + register_slab(s); + return 1; + +error: + if (flags & SLAB_PANIC) + panic("Cannot open slab %s size=%ld realsize=%d " + "order=%d offset=%d flags=%lx\n", + s->name, (unsigned long)size, s->size, s->order, + s->offset, flags); + return 0; +} +EXPORT_SYMBOL(kmem_cache_open); + +/* + * Check if a given pointer is valid + */ +int kmem_ptr_validate(struct kmem_cache *s, const void *object) +{ + struct page * page; + void *addr; + + page = get_object_page(object); + + if (!page || s != page->slab) + /* No slab or wrong slab */ + return 0; + + addr = page_address(page); + if (object < addr || object >= addr + s->objects * s->size) + /* Out of bounds */ + return 0; + + if ((object - addr) & s->size) + /* Improperly aligned */ + return 0; + + /* + * We could also check here if the object is on the slabs freelist. + * But this would be too expensive and it seems that the main + * purpose of kmem_ptr_valid is to check if the object belongs + * to a certain slab. + */ + return 1; +} +EXPORT_SYMBOL(kmem_ptr_validate); + +/* + * Determine the size of a slab object + */ +unsigned int kmem_cache_size(struct kmem_cache *s) +{ + return s->objsize; +} +EXPORT_SYMBOL(kmem_cache_size); + +const char *kmem_cache_name(struct kmem_cache *s) +{ + return s->name; +} +EXPORT_SYMBOL(kmem_cache_name); + +/* + * Move slab objects in a given slab by calling the move_objects function. + * + * Must be called with the slab lock held but will drop and reacquire the + * slab lock. + */ +static int move_slab_objects(struct kmem_cache *s, struct page *page, + int (*move_objects)(struct kmem_cache *, void *)) +{ + int unfreeable = 0; + void *addr = page_address(page); + + while (page->inuse - unfreeable > 0) { + void *p; + + for (p = addr; p < addr + s->objects; p+= s->size) { + if (!on_freelist(s, page, p)) { + /* + * Drop the lock here to allow the + * move_object function to do things + * with the slab and maybe this + * page. + */ + slab_unlock(page); + local_irq_enable(); + if (move_objects((struct kmem_cache *)s, p)) + kmem_cache_free(s, p); + else + unfreeable++; + local_irq_disable(); + slab_lock(page); + } + } + } + return unfreeable; +} + +/* + * Shrinking drops the active per cpu slabs and also reaps all empty + * slabs off the partial list. Returns the number of slabs freed. + * + * The move_object function will be called for each objects in partially + * allocated slabs. move_object() needs to perform a new allocation for + * the object and move the contents of the object to the new location. + * + * If move_object() returns 1 for success then the object is going to be + * removed. If 0 then the object cannot be freed at all. As a result the + * slab containing the object will also not be freeable. + * + * Returns the number of slabs freed. 
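No user of kmem_cache_defrag() is included in this patch, so the shape of a move_object callback is easiest to show as a sketch; struct my_item and my_cache are invented names and repointing external references is left as a comment:

    /* Sketch of a move_object callback.  SLUB frees the old object itself
     * when the callback returns 1; returning 0 leaves the slab pinned. */
    static int my_move_object(struct kmem_cache *s, void *obj)
    {
            struct my_item *old = obj;
            struct my_item *new = kmem_cache_alloc(s, GFP_KERNEL);

            if (!new)
                    return 0;
            *new = *old;
            /* ... update every reference that pointed to old ... */
            return 1;
    }

A subsystem would then call kmem_cache_defrag(my_cache, my_move_object) from a shrinker or from periodic housekeeping.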
+ */ +static unsigned int defrag_on_node(struct kmem_cache *s, int node, + int (*move_object)(struct kmem_cache *, void *)) +{ + int slabs_freed = 0; + int i; + struct node_slab *n = NODE_INFO(s, node); + unsigned long flags; + + local_irq_save(flags); + for (i = 0; n->nr_partial > 1 && i < n->nr_partial - 1; i++ ) { + struct page * page; + + page = get_partial_node(n); + if (!page) + break; + + /* + * Pin page so that slab_free will not free even if we + * drop the slab lock. + */ + __SetPageActive(page); + + if (page->inuse < s->objects && move_object) + if (move_slab_objects(s, + page, move_object) == 0) + slabs_freed++; + + /* + * This will put the slab on the front of the partial + * list, the used list or free it. + */ + __ClearPageActive(page); + putback_slab(s, page); + } + local_irq_restore(flags); + return slabs_freed; +} + +int kmem_cache_defrag(struct kmem_cache *s, + int (*move_object)(struct kmem_cache *, void *)) +{ + int node; + + drain_all(s); + for_each_online_node(node) + defrag_on_node(s, node, move_object); + return 0; +} +EXPORT_SYMBOL(kmem_cache_defrag); + +static struct kmem_cache *kmem_cache_dup(struct kmem_cache *s) +{ + atomic_inc(&s->refcount); + return s; +} + +static int free_list(struct node_slab *n) +{ + int slabs_inuse = 0; + unsigned long flags; + struct page *page, *h; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, h, &n->partial, lru) + if (!page->inuse) { + list_del(&n->partial); + discard_slab(page->slab, page); + } else + slabs_inuse++; + spin_unlock_irqrestore(&n->list_lock, flags); + return slabs_inuse; +} + +static void free_cpu(struct kmem_cache *s, int cpu) +{ +#ifdef CONFIG_NUMA + kfree(CPU_SLAB(s, cpu)); + s->cpu[cpu] = NULL; +#endif +} + +static void release_cpu(struct kmem_cache *s) +{ + int cpu; + + for_each_online_cpu(cpu) + free_cpu(s, cpu); +} + +/* + * Release all resources used by slab cache + * (Use with caches setup using kmem_cache_setup) + */ +int kmem_cache_close(struct kmem_cache *s) +{ + unsigned long remainder = 0; + int node; + + if (!atomic_dec_and_test(&s->refcount)) + return 0; + drain_all(s); + + for_each_online_node(node) { + struct node_slab *n = NODE_INFO(s, node); + + free_list(n); + remainder += atomic_long_read(&n->nr_slabs); + } + + if (remainder) + return 1; + + unregister_slab(s); + release_cpu(s); + return 0; +} +EXPORT_SYMBOL(kmem_cache_close); + +/* + * Close a cache and release the kmem_cache structure + * (must be used for caches created using kmem_cache_create + */ +void kmem_cache_destroy(struct kmem_cache *s) +{ + BUG_ON(kmem_cache_close(s)); + kfree(s); +} +EXPORT_SYMBOL(kmem_cache_destroy); + +static unsigned long count_objects(struct kmem_cache *s, + unsigned long *nr_partial, unsigned long *nodes) +{ + int count = 0; + struct page *page; + unsigned long flags; + unsigned long partial = 0; + int node; + + for_each_online_node(node) { + struct node_slab *n = NODE_INFO(s, node); + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) { + count += page->inuse; + nodes[page_to_nid(page)]++; + partial++; + } + spin_unlock_irqrestore(&n->list_lock, flags); + } + *nr_partial = partial; + return count; +} + +static unsigned long slab_objects(struct kmem_cache *s, + unsigned long *p_total, unsigned long *p_cpu, + unsigned long *p_partial, unsigned long *nodes) +{ + unsigned long nr_partial =0; /* Partial slabs */ + unsigned long nr_slabs = 0; /* Total slabs */ + unsigned long nr_cpu = 0; /* Cpu Slabs */ + unsigned long objects_partial = 0; /* 
Objects in partial slabs */ + unsigned long objects_cpu = 0; /* Objects in cpu slabs */ + int cpu; + int node; + + for_each_online_node(node) + nr_slabs += nodes[node] = + atomic_read(&NODE_INFO(s, node)->nr_slabs); + + for_each_possible_cpu(cpu) { + struct cpu_slab *a = CPU_SLAB(s, cpu); + + if (a->page) { + nr_cpu++; + objects_cpu += a->page->inuse; + nodes[page_to_nid(a->page)]++; + } + } + + objects_partial = count_objects(s, &nr_partial, nodes); + + if (p_partial) + *p_partial = nr_partial; + + if (p_cpu) + *p_cpu = nr_cpu; + + if (p_total) + *p_total = nr_slabs; + + return objects_partial + objects_cpu + + (nr_slabs - nr_partial - nr_cpu) * s->objects; +} + +/* + * Use the cpu notifier to insure that the thresholds are recalculated + * when necessary. + */ +static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + for_all_slabs(alloc_cpu, cpu); + break; + case CPU_UP_CANCELED: + case CPU_DEAD: + for_all_slabs(free_cpu, cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata slab_notifier = + { &slab_cpuup_callback, NULL, 0 }; + + +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ + +struct kmem_cache kmalloc_caches[KMALLOC_NR_CACHES] __cacheline_aligned; +EXPORT_SYMBOL(kmalloc_caches); + +static struct kmem_cache *kmalloc_caches_dma[KMALLOC_NR_CACHES]; + +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index = kmalloc_index(size) - KMALLOC_SHIFT_LOW; + struct kmem_cache *s; + size_t realsize; + + BUG_ON(index < 0); + + if (!(flags & __GFP_DMA)) + return &kmalloc_caches[index]; + + s = kmalloc_caches_dma[index]; + if (s) + return s; + + /* Dynamically create dma cache */ + s = kmalloc(sizeof(struct kmem_cache), flags & ~__GFP_DMA); + + if (!s) + panic("Unable to allocate memory for dma cache\n"); + +#ifdef KMALLOC_EXTRA + if (index <= KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW) +#endif + realsize = 1 << index; +#ifdef KMALLOC_EXTRA + else if (index == KMALLOC_EXTRAS) + realsize = 96; + else + realsize = 192; +#endif + + kmem_cache_open(s, kasprintf(flags, "kmalloc-dma-%d", (unsigned int)realsize), realsize, + ARCH_KMALLOC_MINALIGN, SLAB_PANIC, NULL, NULL); + kmalloc_caches_dma[index] = s; + return s; +} + +void *__kmalloc(size_t size, gfp_t flags) +{ + return kmem_cache_alloc(get_slab(size, flags), flags); +} +EXPORT_SYMBOL(__kmalloc); + +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + return kmem_cache_alloc_node(get_slab(size, flags), + flags, node); +} +EXPORT_SYMBOL(__kmalloc_node); +#endif + +size_t ksize(const void *object) +{ + struct page *page = get_object_page(object); + struct kmem_cache *s; + + BUG_ON(!page); + s = page->slab; + BUG_ON(!s); + return s->size; +} +EXPORT_SYMBOL(ksize); + +void kfree(const void *object) +{ + kmem_cache_free(NULL, (void *)object); +} +EXPORT_SYMBOL(kfree); + +void __init kmem_cache_init(void) +{ + int i; + +#ifdef CONFIG_NUMA + /* + * NUMA Bootstrap only works if the slab for the cpu_slab + * structure does not use an EXTRA slab and if both are smaller + * than a cacheline. 
+ */ + BUG_ON(CACHELINE_SLAB_NR > KMALLOC_SHIFT_HIGH || CACHELINE_SLAB_NR < 0); + BUG_ON(sizeof(struct cpu_slab) > L1_CACHE_BYTES); + BUG_ON(sizeof(struct node_slab) > L1_CACHE_BYTES); +#endif + + kmem_cache_open(CACHELINE_SLAB_SLAB, "cpu_slab", + 1 << CACHELINE_SLAB_NR, ARCH_KMALLOC_MINALIGN, + SLAB_PANIC, NULL, NULL); + + slab_state = PARTIAL; + + /* Power of two sized caches */ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + if (i != CACHELINE_SLAB_NR) + kmem_cache_open( + &kmalloc_caches[i - KMALLOC_SHIFT_LOW], + "kmalloc", 1 << i, + ARCH_KMALLOC_MINALIGN, SLAB_PANIC, NULL, NULL); + +#ifdef KMALLOC_EXTRA + /* Non-power of two caches */ + kmem_cache_open(&kmalloc_caches + [KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1], + "kmalloc-96", 96 + ARCH_KMALLOC_MINALIGN, SLAB_PANIC, NULL, NULL); + kmem_cache_open(&kmalloc_caches + [KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + 1], + "kmalloc-192", 192 + ARCH_KMALLOC_MINALIGN, SLAB_PANIC, NULL, NULL); +#endif + slab_state = UP; + + /* We can provide the correct kmalloc names now that the caches are up */ + for (i = 0; i <= KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW; i++) { + char *name = kasprintf(GFP_KERNEL, "kmalloc-%d", + kmalloc_caches[i].size); + + BUG_ON(!name); + kmalloc_caches[i].name = name; + }; + + printk(KERN_INFO "Kmalloc: Basic slab size %ld bytes, " + "%d general caches.\n", + PAGE_SIZE << slab_min_order, + KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW + KMALLOC_EXTRAS); + + register_cpu_notifier(&slab_notifier); +} + +static struct kmem_cache *__kmalloc_slab(size_t size) +{ + int index = kmalloc_index(size) - KMALLOC_SHIFT_LOW; + + if (index < 0) + return NULL; + return &kmalloc_caches[index]; +} + +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + void (*dtor)(void *, struct kmem_cache *, unsigned long)) +{ + struct kmem_cache *s; + +#ifdef SLAB_MERGE + if (!ctor && !dtor && !(flags & SLAB_NO_MERGE) && + align <= ARCH_SLAB_MINALIGN) { + int sz = ALIGN(size, ARCH_SLAB_MINALIGN); + + /* Find the kmalloc slab that would be used for this size */ + s = __kmalloc_slab(sz); + if (!s) + return NULL; + + /* + * Check if there would be less than a woprd difference + * between the size of the slab and the kmalloc slab. + * If so then just use the kmalloc array and avoid creating + * a new slab. + */ + if (s->size - sz <= sizeof(void *)) { + printk(KERN_INFO "SLUB: Merging slab_cache %s size %d" + " into kmalloc array size %d\n", + name, (unsigned int)size, s->size); + return kmem_cache_dup(s); + } + } +#endif + + s = kmalloc(sizeof(struct kmem_cache), GFP_KERNEL); + if (!s) { + if (flags & SLAB_PANIC) + panic("Unable to allocate memory for slab %s\n", name); + return NULL; + } + + if (!kmem_cache_open(s, name, size, align, flags, ctor, dtor)) { + kfree(s); + return NULL; + } + return s; +} +EXPORT_SYMBOL(kmem_cache_create); + +void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) +{ + void *x; + + x = kmem_cache_alloc(s, flags); + if (x) + memset(x, 0, s->objsize); + return x; +} +EXPORT_SYMBOL(kmem_cache_zalloc); + +/******************************************************************** + * Slab proc interface + *******************************************************************/ + +static void print_slabinfo_header(struct seq_file *m) +{ + /* + * Output format version, so at least we can change it + * without _too_ many complaints. 
+ */ + seq_puts(m, "slabinfo - version: 3.0\n"); + seq_puts(m, "# name /" + "/ "); + seq_putc(m, '\n'); +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + down_read(&slabstat_sem); + if (!n) + print_slabinfo_header(m); + p = slab_caches.next; + while (n--) { + p = p->next; + if (p == &slab_caches) + return NULL; + } + return list_entry(p, struct kmem_cache, list); +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct kmem_cache *s = p; + + ++*pos; + return s->list.next == &slab_caches ? + NULL : list_entry(s->list.next, struct kmem_cache, list); +} + +static void s_stop(struct seq_file *m, void *p) +{ + up_read(&slabstat_sem); +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kmem_cache *s = p; + unsigned long total_slabs; + unsigned long cpu_slabs; + unsigned long partial_slabs; + unsigned long objects; + unsigned long nodes[MAX_NUMNODES]; + char *d; + int node; + char options[13]; + + objects = slab_objects(s, &total_slabs, &cpu_slabs, + &partial_slabs, nodes); + d = options; + if (s->ctor) + *d++ = 'C'; + if (s->dtor) + *d++ = 'D'; + if (s->flags & SLAB_DESTROY_BY_RCU) + *d++ = 'R'; + if (s->flags & SLAB_MEM_SPREAD) + *d++ = 'S'; + if (s->flags & SLAB_CACHE_DMA) + *d++ = 'd'; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + *d++ = 'r'; + if (s->flags & SLAB_PANIC) + *d++ = 'P'; + if (s->flags & SLAB_HWCACHE_ALIGN) + *d++ = 'a'; + if (s->flags & SLAB_MUST_HWCACHE_ALIGN) + *d++ = 'A'; + if (s->flags & SLAB_DEBUG_FREE) + *d++ = 'F'; + if (s->flags & SLAB_DEBUG_INITIAL) + *d++ = 'I'; + if (s->flags & SLAB_STORE_USER) + *d++ = 'U'; + + *d = 0; + seq_printf(m, "%-21s %7lu %2d %7u\t%lu/%lu/%lu\t%s", + s->name, objects, s->order, s->size, total_slabs, + partial_slabs, cpu_slabs, options); + + for_each_online_node(node) + if (nodes[node]) + seq_printf(m, " N%d=%lu", node, nodes[node]); + seq_putc(m, '\n'); + return 0; +} + +/* + * slabinfo_op - iterator that generates /proc/slabinfo + */ +struct seq_operations slabinfo_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +/*************************************************************** + * Compatiblility definitions + **************************************************************/ + +int kmem_cache_shrink(struct kmem_cache *s) +{ + return kmem_cache_defrag(s, NULL); +} +EXPORT_SYMBOL(kmem_cache_shrink); + +#ifdef CONFIG_NUMA + +/***************************************************************** + * Generic reaper to support the page allocator + * (the cpu slabs are reaped by a per processor workqueue). + ****************************************************************/ + +/* + * Special reaping functions for NUMA systems called from cache_reap(). 
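For reference, the seq_printf() format in s_show() above emits one line per cache with the columns name, active objects, order, object size, total/partial/cpu slab counts, flag letters and per-node slab counts; the sample below uses invented values:

    slabinfo - version: 3.0
    # name                objects order    size  total/partial/cpu  flags
    dentry                  20790  0         160  860/27/2           Sra N0=520 N1=340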
+ */ +static DEFINE_PER_CPU(unsigned long, reap_node); + +static void init_reap_node(int cpu) +{ + int node; + + node = next_node(cpu_to_node(cpu), node_online_map); + if (node == MAX_NUMNODES) + node = first_node(node_online_map); + + __get_cpu_var(reap_node) = node; +} + +static void next_reap_node(void) +{ + int node = __get_cpu_var(reap_node); + + /* + * Also drain per cpu pages on remote zones + */ + if (node != numa_node_id()) + drain_node_pages(node); + + node = next_node(node, node_online_map); + if (unlikely(node >= MAX_NUMNODES)) + node = first_node(node_online_map); + __get_cpu_var(reap_node) = node; +} +#else +#define init_reap_node(cpu) do { } while (0) +#define next_reap_node(void) do { } while (0) +#endif + +#define REAPTIMEOUT_CPUC (2*HZ) + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct delayed_work, reap_work); + +static void cache_reap(struct work_struct *unused) +{ + next_reap_node(); + refresh_cpu_vm_stats(smp_processor_id()); + schedule_delayed_work(&__get_cpu_var(reap_work), + REAPTIMEOUT_CPUC); +} + +static void __devinit start_cpu_timer(int cpu) +{ + struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + + /* + * When this gets called from do_initcalls via cpucache_init(), + * init_workqueues() has already run, so keventd will be setup + * at that time. + */ + if (keventd_up() && reap_work->work.func == NULL) { + init_reap_node(cpu); + INIT_DELAYED_WORK(reap_work, cache_reap); + schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu); + } +} + +static int __init cpucache_init(void) +{ + int cpu; + + /* + * Register the timers that drain pcp pages and update vm statistics + */ + for_each_online_cpu(cpu) + start_cpu_timer(cpu); + return 0; +} +__initcall(cpucache_init); +#endif Index: linux-2.6.20-rc1/mm/slab.c =================================================================== --- linux-2.6.20-rc1.orig/mm/slab.c 2006-12-15 17:24:20.000000000 -0800 +++ linux-2.6.20-rc1/mm/slab.c 2006-12-15 17:24:32.000000000 -0800 @@ -2512,6 +2512,17 @@ } EXPORT_SYMBOL(kmem_cache_shrink); +int kmem_cache_defrag(struct kmem_cache *cachep, + int (*move_object)(struct kmem_cache *, void *)) +{ + /* + * No support for defragmentation. Just shrink the object + * and slabs we have cached. + */ + return kmem_cache_shrink(cachep); +} +EXPORT_SYMBOL(kmem_cache_defrag); + /** * kmem_cache_destroy - delete a cache * @cachep: the cache to destroy Index: linux-2.6.20-rc1/mm/slob.c =================================================================== --- linux-2.6.20-rc1.orig/mm/slob.c 2006-12-15 17:24:20.000000000 -0800 +++ linux-2.6.20-rc1/mm/slob.c 2006-12-15 17:24:32.000000000 -0800 @@ -334,6 +334,13 @@ } EXPORT_SYMBOL(kmem_cache_shrink); +int kmem_cache_defrag(struct kmem_cache *d, + int (*move_object)(struct kmem_cache *, void *)); +{ + return 0; +} +EXPORT_SYMBOL(kmem_cache_defrag); + int kmem_ptr_validate(struct kmem_cache *a, const void *b) { return 0; Index: linux-2.6.20-rc1/include/linux/slub_def.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ linux-2.6.20-rc1/include/linux/slub_def.h 2006-12-15 19:14:06.000000000 -0800 @@ -0,0 +1,183 @@ +#ifndef _LINUX_SLUB_DEF_H +#define _LINUX_SLUB_DEF_H + +/* + * Generic API to memory allocators. + * (C) 2006 Silicon Graphics, Inc, + * Christoph Lameter + */ +#include +#include +#include + +/* + * Per cpu structure to manage active slabs. + * Must be less than a cacheline for bootstrap to work. 
+ */ +struct cpu_slab { + struct page *page; + struct kmem_cache *slab; + void **freelist; + int nr_free; + int referenced; +#ifdef CONFIG_SMP + int flush_active; + struct delayed_work flush; +#endif +} ____cacheline_aligned_in_smp; + +/* + * Per node structure to manage partial slabs + * Must be less than a cacheline for bootstrap to work. + */ +struct node_slab { + spinlock_t list_lock; + struct list_head partial; + unsigned long nr_partial; + atomic_long_t nr_slabs; /* Total slabs used */ +} ____cacheline_aligned_in_smp; + +/* + * Slab cache management. + */ +struct kmem_cache { + int offset; /* Free pointer offset. */ + int size; /* Total size of an object */ + unsigned int order; /* Size of the slab page */ + int objects; /* Number of objects in a slab */ + unsigned long flags; + atomic_t refcount; /* Refcount for destroy */ + int align; + void (*ctor)(void *, struct kmem_cache *, unsigned long); + void (*dtor)(void *, struct kmem_cache *, unsigned long); + + int objsize; /* The size of an object that is in a chunk */ + int inuse; /* Used portion of the chunk */ + const char *name; /* Name (only for display!) */ + struct list_head list; /* List of slabs */ + +#ifdef CONFIG_NUMA + struct node_slab *node[MAX_NUMNODES]; +#else + struct node_slab node[MAX_NUMNODES]; +#endif + +#ifdef CONFIG_NUMA + struct cpu_slab *cpu[NR_CPUS]; +#else + struct cpu_slab cpu[NR_CPUS] ____cacheline_aligned_in_smp; +#endif + +}; + +/* + * Kmalloc subsystem. + */ +#define KMALLOC_SHIFT_LOW 3 + +#define KMALLOC_SHIFT_HIGH 18 + +#if L1_CACHE_BYTES <= 64 +#define KMALLOC_EXTRAS 2 +#define KMALLOC_EXTRA +#else +#define KMALLOC_EXTRAS 0 +#endif + +#define KMALLOC_NR_CACHES (KMALLOC_SHIFT_HIGH - KMALLOC_SHIFT_LOW \ + + 1 + KMALLOC_EXTRAS) +/* + * We keep the general caches in an array of slab caches that are used for + * 2^x bytes of allocations. For each size we generate a DMA and a + * non DMA cache (DMA simply means memory for legacy I/O. The regular + * caches can be used for devices that can DMA to all of memory). + */ +extern struct kmem_cache kmalloc_caches[KMALLOC_NR_CACHES]; + +/* + * Sorry that the following has to be that ugly but GCC has trouble + * with constant propagation and loops. + */ +static inline int kmalloc_index(int size) +{ + if (size <= 8) return 3; + if (size <= 16) return 4; + if (size <= 32) return 5; + if (size <= 64) return 6; +#ifdef KMALLOC_EXTRA + if (size <= 96) return KMALLOC_SHIFT_HIGH + 1; +#endif + if (size <= 128) return 7; +#ifdef KMALLOC_EXTRA + if (size <= 192) return KMALLOC_SHIFT_HIGH + 2; +#endif + if (size <= 256) return 8; + if (size <= 512) return 9; + if (size <= 1024) return 10; + if (size <= 2048) return 11; + if (size <= 4096) return 12; + if (size <= 8 * 1024) return 13; + if (size <= 16 * 1024) return 14; + if (size <= 32 * 1024) return 15; + if (size <= 64 * 1024) return 16; + if (size <= 128 * 1024) return 17; + if (size <= 256 * 1024) return 18; + return -1; +} + +/* + * Find the slab cache for a given combination of allocation flags and size. + * + * This ought to end up with a global pointer to the right cache + * in kmalloc_caches. + */ +static inline struct kmem_cache *kmalloc_slab(size_t size) +{ + int index = kmalloc_index(size) - KMALLOC_SHIFT_LOW; + + if (index < 0) { + /* + * Generate a link failure. Would be great if we could + * do something to stop the compile here. 
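Because kmalloc_index() collapses to a constant whenever the size is a compile-time constant, the inline kmalloc() that follows resolves in one of three ways; the calls below are examples only:

    /*
     * Illustrative outcomes of the constant folding:
     *
     *   kmalloc(100, GFP_KERNEL)         constant -> direct kmem_cache_alloc()
     *                                    from the 128-byte general cache
     *   kmalloc(512 * 1024, GFP_KERNEL)  constant but too large -> call to the
     *                                    undefined __kmalloc_size_too_large(),
     *                                    i.e. the build breaks at link time
     *   kmalloc(len, GFP_KERNEL)         variable -> __kmalloc(), which picks
     *                                    the cache at run time
     */

The link-failure trick is the design choice here: an oversized constant allocation is caught when the kernel is built rather than by a run-time check.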
+ */ + extern void __kmalloc_size_too_large(void); + __kmalloc_size_too_large(); + } + return &kmalloc_caches[index]; +} + +static inline void *kmalloc(size_t size, gfp_t flags) +{ + if (__builtin_constant_p(size) && !(flags & __GFP_DMA)) { + struct kmem_cache *s = kmalloc_slab(size); + + return kmem_cache_alloc(s, flags); + } else + return __kmalloc(size, flags); +} + +static inline void *kzalloc(size_t size, gfp_t flags) +{ + if (__builtin_constant_p(size) && !(flags & __GFP_DMA)) { + struct kmem_cache *s = kmalloc_slab(size); + + return kmem_cache_zalloc(s, flags); + } else + return __kzalloc(size, flags); +} + +#ifdef CONFIG_NUMA +extern void *__kmalloc_node(size_t size, gfp_t flags, int node); + +static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +{ + if (__builtin_constant_p(size) && !(flags & __GFP_DMA)) { + struct kmem_cache *s = kmalloc_slab(size); + + return kmem_cache_alloc_node(s, flags, node); + } else + return __kmalloc_node(size, flags, node); +} +#endif + +#endif /* _LINUX_SLUB_DEF_H */ Index: linux-2.6.20-rc1/fs/proc/proc_misc.c =================================================================== --- linux-2.6.20-rc1.orig/fs/proc/proc_misc.c 2006-12-13 17:14:23.000000000 -0800 +++ linux-2.6.20-rc1/fs/proc/proc_misc.c 2006-12-15 17:24:32.000000000 -0800 @@ -408,17 +408,23 @@ }; #endif -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) extern struct seq_operations slabinfo_op; +#ifdef CONFIG_SLAB extern ssize_t slabinfo_write(struct file *, const char __user *, size_t, loff_t *); +#endif + static int slabinfo_open(struct inode *inode, struct file *file) { return seq_open(file, &slabinfo_op); } + static struct file_operations proc_slabinfo_operations = { .open = slabinfo_open, .read = seq_read, +#ifdef CONFIG_SLAB .write = slabinfo_write, +#endif .llseek = seq_lseek, .release = seq_release, }; @@ -717,7 +723,7 @@ #endif create_seq_entry("stat", 0, &proc_stat_operations); create_seq_entry("interrupts", 0, &proc_interrupts_operations); -#ifdef CONFIG_SLAB +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);