--- mm/slub.c | 4175 +++++++++++++++++++++++++++++++------------------------------- 1 file changed, 2096 insertions(+), 2079 deletions(-) Index: slub/mm/slub.c =================================================================== --- slub.orig/mm/slub.c 2007-05-20 20:20:47.000000000 -0700 +++ slub/mm/slub.c 2007-05-20 20:27:52.000000000 -0700 @@ -329,2614 +329,2631 @@ static inline int slab_index(void *p, st return (p - addr) / s->size; } -#ifdef CONFIG_SLUB_DEBUG -/* - * Debug settings: - */ -static int slub_debug; +static void setup_object_debug(struct kmem_cache *s, + struct page *page, void *object); -static char *slub_debug_slabs; +static int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr); + +static int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, void *addr); + +static int slab_pad_check(struct kmem_cache *s, struct page *page); +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active); +static void add_full(struct kmem_cache_node *n, struct page *page); +static void remove_full(struct kmem_cache *s, struct page *page); +static void kmem_cache_open_debug_check(struct kmem_cache *s); /* - * Object debugging + * Slab allocation and freeing */ -static void print_section(char *text, u8 *addr, unsigned int length) +static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { - int i, offset; - int newline = 1; - char ascii[17]; + struct page * page; + int pages = 1 << s->order; - ascii[16] = 0; + if (s->order) + flags |= __GFP_COMP; - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%10s 0x%p: ", text, addr + i); - newline = 0; - } - printk(" %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(" %s\n",ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(" "); - ascii[i] = ' '; - i++; - } - printk(" %s\n", ascii); - } -} + if (s->flags & SLAB_CACHE_DMA) + flags |= SLUB_DMA; -static struct track *get_track(struct kmem_cache *s, void *object, - enum track_item alloc) -{ - struct track *p; + if (s->flags & SLAB_RECLAIM_ACCOUNT) + flags |= __GFP_RECLAIMABLE; - if (s->offset) - p = object + s->offset + sizeof(void *); + if (node == -1) + page = alloc_pages(flags, s->order); else - p = object + s->inuse; - - return p + alloc; -} + page = alloc_pages_node(node, flags, s->order); -static void set_track(struct kmem_cache *s, void *object, - enum track_item alloc, void *addr) -{ - struct track *p; + if (!page) + return NULL; - if (s->offset) - p = object + s->offset + sizeof(void *); - else - p = object + s->inuse; + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? + NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + pages); - p += alloc; - if (addr) { - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current ? 
current->pid : -1; - p->when = jiffies; - } else - memset(p, 0, sizeof(struct track)); + return page; } -static void init_tracking(struct kmem_cache *s, void *object) +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) { - if (s->flags & SLAB_STORE_USER) { - set_track(s, object, TRACK_FREE, NULL); - set_track(s, object, TRACK_ALLOC, NULL); - } + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) + s->ctor(object, s, 0); } -static void print_track(const char *s, struct track *t) +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { - if (!t->addr) - return; + struct page *page; + struct kmem_cache_node *n; + void *start; + void *end; + void *last; + void *p; - printk(KERN_ERR "%s: ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); -} + BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); -static void print_trailer(struct kmem_cache *s, u8 *p) -{ - unsigned int off; /* Offset of last byte */ + if (flags & __GFP_WAIT) + local_irq_enable(); - if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, - s->inuse - s->objsize); + page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); + if (!page) + goto out; - printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", - p + s->offset, - get_freepointer(s, p)); + n = get_node(s, page_to_nid(page)); + if (n) + atomic_long_inc(&n->nr_slabs); - if (s->offset) - off = s->offset + sizeof(void *); - else - off = s->inuse; + page->inuse = 0; + page->lockless_freelist = NULL; + page->offset = s->offset / sizeof(void *); + page->slab = s; - if (s->flags & SLAB_STORE_USER) { - print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); - print_track("Last free ", get_track(s, p, TRACK_FREE)); - off += 2 * sizeof(struct track); + start = page_address(page); + end = start + s->objects * s->size; + + if (unlikely(s->flags & SLAB_POISON)) + memset(start, POISON_INUSE, PAGE_SIZE << s->order); + + last = start; + for_each_object(p, s, start) { + setup_object(s, page, last); + set_freepointer(s, last, p); + last = p; } + setup_object(s, page, last); + set_freepointer(s, last, NULL); - if (off != s->size) - /* Beginning of the filler is the free pointer */ - print_section("Filler", p + off, s->size - off); + page->freelist = start; + + /* + * page->inuse must be 0 when PageSlab(page) becomes + * true so that defrag knows that this slab is not in use. + */ + smp_wmb(); + __SetPageSlab(page); + if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | + SLAB_STORE_USER | SLAB_TRACE)) + SetSlabDebug(page); + + out: + if (flags & __GFP_WAIT) + local_irq_disable(); + return page; } -static void object_err(struct kmem_cache *s, struct page *page, - u8 *object, char *reason) +static void __free_slab(struct kmem_cache *s, struct page *page) { - u8 *addr = page_address(page); + int pages = 1 << s->order; - printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", - s->name, reason, object, page); - printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", - object - addr, page->flags, page->inuse, page->freelist); - if (object > addr + 16) - print_section("Bytes b4", object - 16, 16); - print_section("Object", object, min(s->objsize, 128)); - print_trailer(s, object); - dump_stack(); + if (unlikely(SlabDebug(page))) { + void *p; + + slab_pad_check(s, page); + for_each_object(p, s, page_address(page)) + check_object(s, page, p, 0); + } + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + - pages); + + page->mapping = NULL; + __free_pages(page, s->order); } -static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +static void rcu_free_slab(struct rcu_head *h) { - va_list args; - char buf[100]; + struct page *page; - va_start(args, reason); - vsnprintf(buf, sizeof(buf), reason, args); - va_end(args); - printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, - page); - dump_stack(); + page = container_of((struct list_head *)h, struct page, lru); + __free_slab(page->slab, page); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void free_slab(struct kmem_cache *s, struct page *page) { - u8 *p = object; - - if (s->flags & __OBJECT_POISON) { - memset(p, POISON_FREE, s->objsize - 1); - p[s->objsize -1] = POISON_END; - } + if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + /* + * RCU free overloads the RCU head over the LRU + */ + struct rcu_head *head = (void *)&page->lru; - if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); + call_rcu(head, rcu_free_slab); + } else + __free_slab(s, page); } -static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +static void discard_slab(struct kmem_cache *s, struct page *page) { - while (bytes) { - if (*start != (u8)value) - return 0; - start++; - bytes--; - } - return 1; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + + atomic_long_dec(&n->nr_slabs); + reset_page_mapcount(page); + ClearSlabDebug(page); + __ClearPageSlab(page); + free_slab(s, page); } /* - * Object layout: - * - * object address - * Bytes of the object to be managed. - * If the freepointer may overlay the object then the free - * pointer is the first word of the object. - * - * Poisoning uses 0x6b (POISON_FREE) and the last byte is - * 0xa5 (POISON_END) - * - * object + s->objsize - * Padding to reach word boundary. This is also used for Redzoning. - * Padding is extended by another word if Redzoning is enabled and - * objsize == inuse. - * - * We fill with 0xbb (RED_INACTIVE) for inactive objects and with - * 0xcc (RED_ACTIVE) for objects in use. - * - * object + s->inuse - * Meta data starts here. - * - * A. Free pointer (if we cannot overwrite object on free) - * B. Tracking data for SLAB_STORE_USER - * C. Padding to reach required alignment boundary or at mininum - * one word if debuggin is on to be able to detect writes - * before the word boundary. - * - * Padding is done using 0x5a (POISON_INUSE) - * - * object + s->size - * Nothing is used beyond s->size. - * - * If slabcaches are merged then the objsize and inuse boundaries are mostly - * ignored. And therefore no slab options that rely on these boundaries - * may be used with merged slabcaches. 
+ * Per slab locking using the pagelock */ +static __always_inline void slab_lock(struct page *page) +{ + bit_spin_lock(PG_locked, &page->flags); +} -static void restore_bytes(struct kmem_cache *s, char *message, u8 data, - void *from, void *to) +static __always_inline void slab_unlock(struct page *page) { - printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", - s->name, message, data, from, to - 1); - memset(from, data, to - from); + bit_spin_unlock(PG_locked, &page->flags); } -static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +static __always_inline int slab_trylock(struct page *page) { - unsigned long off = s->inuse; /* The end of info */ + int rc = 1; - if (s->offset) - /* Freepointer is placed after the object. */ - off += sizeof(void *); + rc = bit_spin_trylock(PG_locked, &page->flags); + return rc; +} - if (s->flags & SLAB_STORE_USER) - /* We also have user information there */ - off += 2 * sizeof(struct track); +/* + * Management of partially allocated slabs + */ +static void add_partial_tail(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add_tail(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} - if (s->size == off) - return 1; +static void add_partial(struct kmem_cache_node *n, struct page *page) +{ + spin_lock(&n->list_lock); + n->nr_partial++; + list_add(&page->lru, &n->partial); + spin_unlock(&n->list_lock); +} - if (check_bytes(p + off, POISON_INUSE, s->size - off)) - return 1; +static void remove_partial(struct kmem_cache *s, + struct page *page) +{ + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - object_err(s, page, p, "Object padding check fails"); + spin_lock(&n->list_lock); + list_del(&page->lru); + n->nr_partial--; + spin_unlock(&n->list_lock); +} - /* - * Restore padding - */ - restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); +/* + * Lock slab and remove from the partial list. + * + * Must hold list_lock. + */ +static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) +{ + if (slab_trylock(page)) { + list_del(&page->lru); + n->nr_partial--; + SetSlabFrozen(page); + return 1; + } return 0; } -static int slab_pad_check(struct kmem_cache *s, struct page *page) +/* + * Try to allocate a partial slab from a specific node. + */ +static struct page *get_partial_node(struct kmem_cache_node *n) { - u8 *p; - int length, remainder; - - if (!(s->flags & SLAB_POISON)) - return 1; + struct page *page; - p = page_address(page); - length = s->objects * s->size; - remainder = (PAGE_SIZE << s->order) - length; - if (!remainder) - return 1; + /* + * Racy check. If we mistakenly see no partial slabs then we + * just allocate an empty slab. If we mistakenly try to get a + * partial slab and there is none available then get_partials() + * will return NULL. + */ + if (!n || !n->nr_partial) + return NULL; - if (!check_bytes(p + length, POISON_INUSE, remainder)) { - slab_err(s, page, "Padding check failed"); - restore_bytes(s, "slab padding", POISON_INUSE, p + length, - p + length + remainder); - return 0; - } - return 1; + spin_lock(&n->list_lock); + list_for_each_entry(page, &n->partial, lru) + if (lock_and_freeze_slab(n, page)) + goto out; + page = NULL; +out: + spin_unlock(&n->list_lock); + return page; } -static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) +/* + * Get a page from somewhere. Search in increasing NUMA distances. 
+ */ +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) { - u8 *p = object; - u8 *endobject = object + s->objsize; +#ifdef CONFIG_NUMA + struct zonelist *zonelist; + struct zone **z; + struct page *page; - if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + /* + * The defrag ratio allows a configuration of the tradeoffs between + * inter node defragmentation and node local allocations. A lower + * defrag_ratio increases the tendency to do local allocations + * instead of attempting to obtain partial slabs from other nodes. + * + * If the defrag_ratio is set to 0 then kmalloc() always + * returns node local objects. If the ratio is higher then kmalloc() + * may return off node objects because partial slabs are obtained + * from other nodes and filled up. + * + * If /sys/slab/xx/defrag_ratio is set to 100 (which makes + * defrag_ratio = 1000) then every (well almost) allocation will + * first attempt to defrag slab caches on other nodes. This means + * scanning over all nodes to look for partial slabs which may be + * expensive if we do it every time we are trying to find a slab + * with available objects. + */ + if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) + return NULL; - if (!check_bytes(endobject, red, s->inuse - s->objsize)) { - object_err(s, page, object, - active ? "Redzone Active" : "Redzone Inactive"); - restore_bytes(s, "redzone", red, - endobject, object + s->inuse); - return 0; - } - } else { - if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && - !check_bytes(endobject, POISON_INUSE, - s->inuse - s->objsize)) { - object_err(s, page, p, "Alignment padding check fails"); - /* - * Fix it so that there will not be another report. - * - * Hmmm... We may be corrupting an object that now expects - * to be longer than allowed. - */ - restore_bytes(s, "alignment padding", POISON_INUSE, - endobject, object + s->inuse); - } - } + zonelist = &NODE_DATA(slab_node(current->mempolicy)) + ->node_zonelists[gfp_zone(flags)]; + for (z = zonelist->zones; *z; z++) { + struct kmem_cache_node *n; - if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && - (!check_bytes(p, POISON_FREE, s->objsize - 1) || - p[s->objsize - 1] != POISON_END)) { + n = get_node(s, zone_to_nid(*z)); - object_err(s, page, p, "Poison check failed"); - restore_bytes(s, "Poison", POISON_FREE, - p, p + s->objsize -1); - restore_bytes(s, "Poison", POISON_END, - p + s->objsize - 1, p + s->objsize); - return 0; + if (n && cpuset_zone_allowed_hardwall(*z, flags) && + n->nr_partial > MIN_PARTIAL) { + page = get_partial_node(n); + if (page) + return page; } - /* - * check_pad_bytes cleans up on its own. - */ - check_pad_bytes(s, page, p); - } - - if (!s->offset && active) - /* - * Object and freepointer overlap. Cannot check - * freepointer while object is allocated. - */ - return 1; - - /* Check free pointer validity */ - if (!check_valid_pointer(s, page, get_freepointer(s, p))) { - object_err(s, page, p, "Freepointer corrupt"); - /* - * No choice but to zap it and thus loose the remainder - * of the free objects in this slab. May cause - * another error because the object count is now wrong. - */ - set_freepointer(s, p, NULL); - return 0; } - return 1; +#endif + return NULL; } -static int check_slab(struct kmem_cache *s, struct page *page) +/* + * Get a partial page, lock it and return it. 
+ */ +static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) { - VM_BUG_ON(!irqs_disabled()); + struct page *page; + int searchnode = (node == -1) ? numa_node_id() : node; - if (!PageSlab(page)) { - slab_err(s, page, "Not a valid slab page flags=%lx " - "mapping=0x%p count=%d", page->flags, page->mapping, - page_count(page)); - return 0; - } - if (page->offset * sizeof(void *) != s->offset) { - slab_err(s, page, "Corrupted offset %lu flags=0x%lx " - "mapping=0x%p count=%d", - (unsigned long)(page->offset * sizeof(void *)), - page->flags, - page->mapping, - page_count(page)); - return 0; - } - if (page->inuse > s->objects) { - slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " - "mapping=0x%p count=%d", - s->name, page->inuse, s->objects, page->flags, - page->mapping, page_count(page)); - return 0; - } - /* Slab_pad_check fixes things up after itself */ - slab_pad_check(s, page); - return 1; + page = get_partial_node(get_node(s, searchnode)); + if (page || (flags & __GFP_THISNODE)) + return page; + + return get_any_partial(s, flags); } /* - * Determine if a certain object on a page is on the freelist. Must hold the - * slab lock to guarantee that the chains are in a consistent state. + * Move a page back to the lists. + * + * Must be called with the slab lock held. + * + * On exit the slab lock will have been dropped. */ -static int on_freelist(struct kmem_cache *s, struct page *page, void *search) +static void unfreeze_slab(struct kmem_cache *s, struct page *page) { - int nr = 0; - void *fp = page->freelist; - void *object = NULL; + struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - while (fp && nr <= s->objects) { - if (fp == search) - return 1; - if (!check_valid_pointer(s, page, fp)) { - if (object) { - object_err(s, page, object, - "Freechain corrupt"); - set_freepointer(s, object, NULL); - break; - } else { - slab_err(s, page, "Freepointer 0x%p corrupt", - fp); - page->freelist = NULL; - page->inuse = s->objects; - printk(KERN_ERR "@@@ SLUB %s: Freelist " - "cleared. Slab 0x%p\n", - s->name, page); - return 0; - } - break; - } - object = fp; - fp = get_freepointer(s, object); - nr++; - } + ClearSlabFrozen(page); + if (page->inuse) { - if (page->inuse != s->objects - nr) { - slab_err(s, page, "Wrong object count. Counter is %d but " - "counted were %d", s, page, page->inuse, - s->objects - nr); - page->inuse = s->objects - nr; - printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " - "Slab @0x%p\n", s->name, page); + if (page->freelist) + add_partial(n, page); + else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) + add_full(n, page); + slab_unlock(page); + + } else { + if (n->nr_partial < MIN_PARTIAL) { + /* + * Adding an empty slab to the partial slabs in order + * to avoid page allocator overhead. This slab needs + * to come after the other slabs with objects in + * order to fill them up. That way the size of the + * partial list stays small. kmem_cache_shrink can + * reclaim empty slabs from the partial list. + */ + add_partial_tail(n, page); + slab_unlock(page); + } else { + slab_unlock(page); + discard_slab(s, page); + } } - return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) +/* + * Remove the cpu slab + */ +static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) { - if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", - s->name, - alloc ? 
"alloc" : "free", - object, page->inuse, - page->freelist); + /* + * Merge cpu freelist into freelist. Typically we get here + * because both freelists are empty. So this is unlikely + * to occur. + */ + while (unlikely(page->lockless_freelist)) { + void **object; - if (!alloc) - print_section("Object", (void *)object, s->objsize); + /* Retrieve object from cpu_freelist */ + object = page->lockless_freelist; + page->lockless_freelist = page->lockless_freelist[page->offset]; - dump_stack(); + /* And put onto the regular freelist */ + object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; } + s->cpu_slab[cpu] = NULL; + unfreeze_slab(s, page); } -/* - * Tracking of fully allocated slabs for debugging purposes. - */ -static void add_full(struct kmem_cache_node *n, struct page *page) +static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) { - spin_lock(&n->list_lock); - list_add(&page->lru, &n->full); - spin_unlock(&n->list_lock); + slab_lock(page); + deactivate_slab(s, page, cpu); } -static void remove_full(struct kmem_cache *s, struct page *page) +/* + * Flush cpu slab. + * Called from IPI handler with interrupts disabled. + */ +static void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_node *n; + struct page *page = s->cpu_slab[cpu]; - if (!(s->flags & SLAB_STORE_USER)) - return; + if (likely(page)) + flush_slab(s, page, cpu); +} - n = get_node(s, page_to_nid(page)); +static void flush_cpu_slab(void *d) +{ + struct kmem_cache *s = d; + int cpu = smp_processor_id(); - spin_lock(&n->list_lock); - list_del(&page->lru); - spin_unlock(&n->list_lock); + __flush_cpu_slab(s, cpu); } -static void setup_object_debug(struct kmem_cache *s, struct page *page, - void *object) +static void flush_all(struct kmem_cache *s) { - if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) - return; +#ifdef CONFIG_SMP + on_each_cpu(flush_cpu_slab, s, 1, 1); +#else + unsigned long flags; - init_object(s, object, 0); - init_tracking(s, object); + local_irq_save(flags); + flush_cpu_slab(s); + local_irq_restore(flags); +#endif } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) +/* + * Slow path. The lockless freelist is empty or we need to perform + * debugging duties. + * + * Interrupts are disabled. + * + * Processing is still very fast if new objects have been freed to the + * regular freelist. In that case we simply take over the regular freelist + * as the lockless freelist and zap the regular freelist. + * + * If that is not working then we fall back to the partial lists. We take the + * first element of the freelist as the object to allocate now and move the + * rest of the freelist to the lockless freelist. + * + * And if we were unable to get a new slab from the partial slab lists then + * we need to allocate a new slab. This is slowest path since we may sleep. 
+ */ +static void *__slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr, struct page *page) { - if (!check_slab(s, page)) - goto bad; + void **object; + int cpu = smp_processor_id(); - if (object && !on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already allocated", object); - goto bad; - } + if (!page) + goto new_slab; - if (!check_valid_pointer(s, page, object)) { - object_err(s, page, object, "Freelist Pointer check fails"); - goto bad; - } + slab_lock(page); + if (unlikely(node != -1 && page_to_nid(page) != node)) + goto another_slab; +load_freelist: + object = page->freelist; + if (unlikely(!object)) + goto another_slab; + if (unlikely(SlabDebug(page))) + goto debug; - if (object && !check_object(s, page, object, 0)) - goto bad; + object = page->freelist; + page->lockless_freelist = object[page->offset]; + page->inuse = s->objects; + page->freelist = NULL; + slab_unlock(page); + return object; - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); - trace(s, page, object, 1); - init_object(s, object, 1); - return 1; +another_slab: + deactivate_slab(s, page, cpu); -bad: - if (PageSlab(page)) { - /* - * If this is a slab page then lets do the best we can - * to avoid issues in the future. Marking all objects - * as used avoids touching the remaining objects. - */ - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n", - s->name, page); - page->inuse = s->objects; - page->freelist = NULL; - /* Fix up fields that may be corrupted */ - page->offset = s->offset / sizeof(void *); +new_slab: + page = get_partial(s, gfpflags, node); + if (page) { + s->cpu_slab[cpu] = page; + goto load_freelist; } - return 0; -} -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, void *addr) -{ - if (!check_slab(s, page)) - goto fail; - - if (!check_valid_pointer(s, page, object)) { - slab_err(s, page, "Invalid object pointer 0x%p", object); - goto fail; - } - - if (on_freelist(s, page, object)) { - slab_err(s, page, "Object 0x%p already free", object); - goto fail; - } - - if (!check_object(s, page, object, 1)) - return 0; - - if (unlikely(s != page->slab)) { - if (!PageSlab(page)) - slab_err(s, page, "Attempt to free object(0x%p) " - "outside of slab", object); - else - if (!page->slab) { - printk(KERN_ERR - "SLUB : no slab for object 0x%p.\n", - object); - dump_stack(); + page = new_slab(s, gfpflags, node); + if (page) { + cpu = smp_processor_id(); + if (s->cpu_slab[cpu]) { + /* + * Someone else populated the cpu_slab while we + * enabled interrupts, or we have gotten scheduled + * on another cpu. The page may not be on the + * requested node even if __GFP_THISNODE was + * specified. So we need to recheck. 
+ */ + if (node == -1 || + page_to_nid(s->cpu_slab[cpu]) == node) { + /* + * Current cpuslab is acceptable and we + * want the current one since its cache hot + */ + discard_slab(s, page); + page = s->cpu_slab[cpu]; + slab_lock(page); + goto load_freelist; + } + /* New slab does not fit our expectations */ + flush_slab(s, s->cpu_slab[cpu], cpu); } - else - slab_err(s, page, "object at 0x%p belongs " - "to slab %s", object, page->slab->name); - goto fail; + slab_lock(page); + SetSlabFrozen(page); + s->cpu_slab[cpu] = page; + goto load_freelist; } + return NULL; +debug: + object = page->freelist; + if (!alloc_debug_processing(s, page, object, addr)) + goto another_slab; - /* Special debug activities for freeing objects */ - if (!SlabFrozen(page) && !page->freelist) - remove_full(s, page); - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_FREE, addr); - trace(s, page, object, 0); - init_object(s, object, 0); - return 1; - -fail: - printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n", - s->name, page, object); - return 0; + page->inuse++; + page->freelist = object[page->offset]; + slab_unlock(page); + return object; } -static int __init setup_slub_debug(char *str) +/* + * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) + * have the fastpath folded into their functions. So no function call + * overhead for requests that can be satisfied on the fastpath. + * + * The fastpath works by first checking if the lockless freelist can be used. + * If not then __slab_alloc is called for slow processing. + * + * Otherwise we can simply pick the next object from the lockless free list. + */ +static void __always_inline *slab_alloc(struct kmem_cache *s, + gfp_t gfpflags, int node, void *addr) { - if (!str || *str != '=') - slub_debug = DEBUG_DEFAULT_FLAGS; - else { - str++; - if (*str == 0 || *str == ',') - slub_debug = DEBUG_DEFAULT_FLAGS; - else - for( ;*str && *str != ','; str++) - switch (*str) { - case 'f' : case 'F' : - slub_debug |= SLAB_DEBUG_FREE; - break; - case 'z' : case 'Z' : - slub_debug |= SLAB_RED_ZONE; - break; - case 'p' : case 'P' : - slub_debug |= SLAB_POISON; - break; - case 'u' : case 'U' : - slub_debug |= SLAB_STORE_USER; - break; - case 't' : case 'T' : - slub_debug |= SLAB_TRACE; - break; - default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n",*str); - } - } + struct page *page; + void **object; + unsigned long flags; - if (*str == ',') - slub_debug_slabs = str + 1; - return 1; -} + local_irq_save(flags); + page = s->cpu_slab[smp_processor_id()]; + if (unlikely(!page || !page->lockless_freelist || + (node != -1 && page_to_nid(page) != node))) -__setup("slub_debug", setup_slub_debug); + object = __slab_alloc(s, gfpflags, node, addr, page); -static void kmem_cache_open_debug_check(struct kmem_cache *s) -{ - /* - * The page->offset field is only 16 bit wide. This is an offset - * in units of words from the beginning of an object. If the slab - * size is bigger then we cannot move the free pointer behind the - * object anymore. - * - * On 32 bit platforms the limit is 256k. On 64bit platforms - * the limit is 512k. - * - * Debugging or ctor may create a need to move the free - * pointer. Fail if this happens. 
- */ - if (s->objsize >= 65535 * sizeof(void *)) { - BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_DESTROY_BY_RCU)); - BUG_ON(s->ctor); + else { + object = page->lockless_freelist; + page->lockless_freelist = object[page->offset]; } - else - /* - * Enable debugging if selected on the kernel commandline. - */ - if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, s->name, - strlen(slub_debug_slabs)) == 0)) - s->flags |= slub_debug; + local_irq_restore(flags); + return object; } -#else -static inline void setup_object_debug(struct kmem_cache *s, - struct page *page, void *object) {} -static inline int alloc_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } - -static inline int free_debug_processing(struct kmem_cache *s, - struct page *page, void *object, void *addr) { return 0; } - -static inline int slab_pad_check(struct kmem_cache *s, struct page *page) - { return 1; } -static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } -static inline void add_full(struct kmem_cache_node *n, struct page *page) {} -static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {} -#define slub_debug 0 -#endif -/* - * Slab allocation and freeing - */ -static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) +void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - struct page * page; - int pages = 1 << s->order; - - if (s->order) - flags |= __GFP_COMP; - - if (s->flags & SLAB_CACHE_DMA) - flags |= SLUB_DMA; - - if (s->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; - - if (node == -1) - page = alloc_pages(flags, s->order); - else - page = alloc_pages_node(node, flags, s->order); - - if (!page) - return NULL; - - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - pages); - - return page; + return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); } +EXPORT_SYMBOL(kmem_cache_alloc); -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) +#ifdef CONFIG_NUMA +void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) { - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) - s->ctor(object, s, 0); + return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); } +EXPORT_SYMBOL(kmem_cache_alloc_node); +#endif -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +/* + * Slow patch handling. This may still be called frequently since objects + * have a longer lifetime than the cpu slabs in most processing loads. + * + * So we still attempt to reduce cache line usage. Just take the slab + * lock and free the item. If there is no additional partial page + * handling required then we can return immediately. 
+ */ +static void __slab_free(struct kmem_cache *s, struct page *page, + void *x, void *addr) { - struct page *page; - struct kmem_cache_node *n; - void *start; - void *end; - void *last; - void *p; - - BUG_ON(flags & ~(GFP_DMA | GFP_LEVEL_MASK)); - - if (flags & __GFP_WAIT) - local_irq_enable(); - - page = allocate_slab(s, flags & GFP_LEVEL_MASK, node); - if (!page) - goto out; - - n = get_node(s, page_to_nid(page)); - if (n) - atomic_long_inc(&n->nr_slabs); - - page->inuse = 0; - page->lockless_freelist = NULL; - page->offset = s->offset / sizeof(void *); - page->slab = s; + void *prior; + void **object = (void *)x; - start = page_address(page); - end = start + s->objects * s->size; + slab_lock(page); - if (unlikely(s->flags & SLAB_POISON)) - memset(start, POISON_INUSE, PAGE_SIZE << s->order); + if (unlikely(SlabDebug(page))) + goto debug; +checks_ok: + prior = object[page->offset] = page->freelist; + page->freelist = object; + page->inuse--; - last = start; - for_each_object(p, s, start) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; - } - setup_object(s, page, last); - set_freepointer(s, last, NULL); + if (unlikely(SlabFrozen(page))) + goto out_unlock; - page->freelist = start; + if (unlikely(!page->inuse)) + goto slab_empty; /* - * page->inuse must be 0 when PageSlab(page) becomes - * true so that defrag knows that this slab is not in use. + * Objects left in the slab. If it + * was not on the partial list before + * then add it. */ - smp_wmb(); - __SetPageSlab(page); - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - SetSlabDebug(page); - - out: - if (flags & __GFP_WAIT) - local_irq_disable(); - return page; -} - -static void __free_slab(struct kmem_cache *s, struct page *page) -{ - int pages = 1 << s->order; + if (unlikely(!prior)) + add_partial(get_node(s, page_to_nid(page)), page); - if (unlikely(SlabDebug(page))) { - void *p; +out_unlock: + slab_unlock(page); + return; - slab_pad_check(s, page); - for_each_object(p, s, page_address(page)) - check_object(s, page, p, 0); - } +slab_empty: + if (prior) + /* + * Slab still on the partial list. + */ + remove_partial(s, page); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - - pages); + slab_unlock(page); + discard_slab(s, page); + return; - page->mapping = NULL; - __free_pages(page, s->order); +debug: + if (!free_debug_processing(s, page, x, addr)) + goto out_unlock; + goto checks_ok; } -static void rcu_free_slab(struct rcu_head *h) +/* + * Fastpath with forced inlining to produce a kfree and kmem_cache_free that + * can perform fastpath freeing without additional function calls. + * + * The fastpath is only possible if we are freeing to the current cpu slab + * of this processor. This typically the case if we have just allocated + * the item before. + * + * If fastpath is not possible then fall back to __slab_free where we deal + * with all sorts of special processing. 
+ */ +static void __always_inline slab_free(struct kmem_cache *s, + struct page *page, void *x, void *addr) { - struct page *page; + void **object = (void *)x; + unsigned long flags; - page = container_of((struct list_head *)h, struct page, lru); - __free_slab(page->slab, page); + local_irq_save(flags); + if (likely(page == s->cpu_slab[smp_processor_id()] && + !SlabDebug(page))) { + object[page->offset] = page->lockless_freelist; + page->lockless_freelist = object; + } else + __slab_free(s, page, x, addr); + + local_irq_restore(flags); } -static void free_slab(struct kmem_cache *s, struct page *page) +void kmem_cache_free(struct kmem_cache *s, void *x) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { - /* - * RCU free overloads the RCU head over the LRU - */ - struct rcu_head *head = (void *)&page->lru; + struct page *page; - call_rcu(head, rcu_free_slab); - } else - __free_slab(s, page); + page = virt_to_head_page(x); + + slab_free(s, page, x, __builtin_return_address(0)); } +EXPORT_SYMBOL(kmem_cache_free); -static void discard_slab(struct kmem_cache *s, struct page *page) +/* Figure out on which slab object the object resides */ +static struct page *get_object_page(const void *x) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + struct page *page = virt_to_head_page(x); - atomic_long_dec(&n->nr_slabs); - reset_page_mapcount(page); - ClearSlabDebug(page); - __ClearPageSlab(page); - free_slab(s, page); + if (!PageSlab(page)) + return NULL; + + return page; } /* - * Per slab locking using the pagelock + * Object placement in a slab is made very easy because we always start at + * offset 0. If we tune the size of the object to the alignment then we can + * get the required alignment by putting one properly sized object after + * another. + * + * Notice that the allocation order determines the sizes of the per cpu + * caches. Each processor has always one slab available for allocations. + * Increasing the allocation order reduces the number of times that slabs + * must be moved on and off the partial lists and is therefore a factor in + * locking overhead. */ -static __always_inline void slab_lock(struct page *page) -{ - bit_spin_lock(PG_locked, &page->flags); -} -static __always_inline void slab_unlock(struct page *page) -{ - bit_spin_unlock(PG_locked, &page->flags); -} +/* + * Set if the user has overridden any of the order related defaults. + */ +static int user_override; -static __always_inline int slab_trylock(struct page *page) -{ - int rc = 1; +/* + * Mininum / Maximum order of slab pages. This influences locking overhead + * and slab fragmentation. A higher order reduces the number of partial slabs + * and increases the number of allocations possible without having to + * take the list_lock. + */ +static int slub_min_order; +static int slub_max_order = DEFAULT_MAX_ORDER; +static int slub_min_objects = DEFAULT_MIN_OBJECTS; - rc = bit_spin_trylock(PG_locked, &page->flags); - return rc; -} +/* + * Merge control. If this is set then no merging of slab caches will occur. + * (Could be removed. This was introduced to pacify the merge skeptics.) + */ +static int slub_nomerge; /* - * Management of partially allocated slabs + * Calculate the order of allocation given an slab object size. + * + * The order of allocation has significant impact on performance and other + * system components. Generally order 0 allocations should be preferred since + * order 0 does not cause fragmentation in the page allocator. 
Larger objects + * be problematic to put into order 0 slabs because there may be too much + * unused space left. We go to a higher order if more than 1/8th of the slab + * would be wasted. + * + * In order to reach satisfactory performance we must ensure that a minimum + * number of objects is in one slab. Otherwise we may generate too much + * activity on the partial lists which requires taking the list_lock. This is + * less a concern for large slabs though which are rarely used. + * + * slub_max_order specifies the order where we begin to stop considering the + * number of objects in a slab as critical. If we reach slub_max_order then + * we try to keep the page order as low as possible. So we accept more waste + * of space in favor of a small page order. + * + * Higher order allocations also allow the placement of more objects in a + * slab and thereby reduce object handling overhead. If the user has + * requested a higher mininum order then we start with that one instead of + * the smallest order which will fit the object. */ -static void add_partial_tail(struct kmem_cache_node *n, struct page *page) +static inline int slab_order(int size, int min_objects, + int max_order, int fract_leftover) { - spin_lock(&n->list_lock); - n->nr_partial++; - list_add_tail(&page->lru, &n->partial); - spin_unlock(&n->list_lock); -} + int order; + int rem; -static void add_partial(struct kmem_cache_node *n, struct page *page) -{ - spin_lock(&n->list_lock); - n->nr_partial++; - list_add(&page->lru, &n->partial); - spin_unlock(&n->list_lock); + for (order = max(slub_min_order, + fls(min_objects * size - 1) - PAGE_SHIFT); + order <= max_order; order++) { + + unsigned long slab_size = PAGE_SIZE << order; + + if (slab_size < min_objects * size) + continue; + + rem = slab_size % size; + + if (rem <= slab_size / fract_leftover) + break; + + } + + return order; } -static void remove_partial(struct kmem_cache *s, - struct page *page) +static inline int calculate_order(int size) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + int order; + int min_objects; + int fraction; - spin_lock(&n->list_lock); - list_del(&page->lru); - n->nr_partial--; - spin_unlock(&n->list_lock); + /* + * Attempt to find best configuration for a slab. This + * works by first attempting to generate a layout with + * the best configuration and backing off gradually. + * + * First we reduce the acceptable waste in a slab. Then + * we reduce the minimum objects required in a slab. + */ + min_objects = slub_min_objects; + while (min_objects > 1) { + fraction = 8; + while (fraction >= 4) { + order = slab_order(size, min_objects, + slub_max_order, fraction); + if (order <= slub_max_order) + return order; + fraction /= 2; + } + min_objects /= 2; + } + + /* + * We were unable to place multiple objects in a slab. Now + * lets see if we can place a single object there. + */ + order = slab_order(size, 1, slub_max_order, 1); + if (order <= slub_max_order) + return order; + + /* + * Doh this slab cannot be placed using slub_max_order. + */ + order = slab_order(size, 1, MAX_ORDER, 1); + if (order <= MAX_ORDER) + return order; + return -ENOSYS; } /* - * Lock slab and remove from the partial list. - * - * Must hold list_lock. + * Figure out what the alignment of the objects will be. 
*/ -static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) { - if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; - SetSlabFrozen(page); - return 1; - } - return 0; + /* + * If the user wants hardware cache aligned objects then + * follow that suggestion if the object is sufficiently + * large. + * + * The hardware cache alignment cannot override the + * specified alignment though. If that is greater + * then use it. + */ + if ((flags & SLAB_HWCACHE_ALIGN) && + size > cache_line_size() / 2) + return max_t(unsigned long, align, cache_line_size()); + + if (align < ARCH_SLAB_MINALIGN) + return ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + +static void init_kmem_cache_node(struct kmem_cache_node *n) +{ + n->nr_partial = 0; + atomic_long_set(&n->nr_slabs, 0); + spin_lock_init(&n->list_lock); + INIT_LIST_HEAD(&n->partial); + INIT_LIST_HEAD(&n->full); } +#ifdef CONFIG_NUMA /* - * Try to allocate a partial slab from a specific node. + * No kmalloc_node yet so do it by hand. We know that this is the first + * slab on the node for this slabcache. There are no concurrent accesses + * possible. + * + * Note that this function only works on the kmalloc_node_cache + * when allocating for the kmalloc_node_cache. */ -static struct page *get_partial_node(struct kmem_cache_node *n) +static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, + int node) { struct page *page; + struct kmem_cache_node *n; - /* - * Racy check. If we mistakenly see no partial slabs then we - * just allocate an empty slab. If we mistakenly try to get a - * partial slab and there is none available then get_partials() - * will return NULL. - */ - if (!n || !n->nr_partial) - return NULL; + BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (lock_and_freeze_slab(n, page)) - goto out; - page = NULL; -out: - spin_unlock(&n->list_lock); - return page; + page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); + /* new_slab() disables interupts */ + local_irq_enable(); + + BUG_ON(!page); + n = page->freelist; + BUG_ON(!n); + page->freelist = get_freepointer(kmalloc_caches, n); + page->inuse++; + kmalloc_caches->node[node] = n; + setup_object_debug(kmalloc_caches, page, n); + init_kmem_cache_node(n); + atomic_long_inc(&n->nr_slabs); + add_partial(n, page); + return n; } -/* - * Get a page from somewhere. Search in increasing NUMA distances. - */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static void free_kmem_cache_nodes(struct kmem_cache *s) { -#ifdef CONFIG_NUMA - struct zonelist *zonelist; - struct zone **z; - struct page *page; + int node; - /* - * The defrag ratio allows a configuration of the tradeoffs between - * inter node defragmentation and node local allocations. A lower - * defrag_ratio increases the tendency to do local allocations - * instead of attempting to obtain partial slabs from other nodes. - * - * If the defrag_ratio is set to 0 then kmalloc() always - * returns node local objects. If the ratio is higher then kmalloc() - * may return off node objects because partial slabs are obtained - * from other nodes and filled up. 
- * - * If /sys/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab - * with available objects. - */ - if (!s->defrag_ratio || get_cycles() % 1024 > s->defrag_ratio) - return NULL; + for_each_online_node(node) { + struct kmem_cache_node *n = s->node[node]; + if (n && n != &s->local_node) + kmem_cache_free(kmalloc_caches, n); + s->node[node] = NULL; + } +} - zonelist = &NODE_DATA(slab_node(current->mempolicy)) - ->node_zonelists[gfp_zone(flags)]; - for (z = zonelist->zones; *z; z++) { +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + int node; + int local_node; + + if (slab_state >= UP) + local_node = page_to_nid(virt_to_page(s)); + else + local_node = 0; + + for_each_online_node(node) { struct kmem_cache_node *n; - n = get_node(s, zone_to_nid(*z)); + if (local_node == node) + n = &s->local_node; + else { + if (slab_state == DOWN) { + n = early_kmem_cache_node_alloc(gfpflags, + node); + continue; + } + n = kmem_cache_alloc_node(kmalloc_caches, + gfpflags, node); + + if (!n) { + free_kmem_cache_nodes(s); + return 0; + } - if (n && cpuset_zone_allowed_hardwall(*z, flags) && - n->nr_partial > MIN_PARTIAL) { - page = get_partial_node(n); - if (page) - return page; } + s->node[node] = n; + init_kmem_cache_node(n); } -#endif - return NULL; + return 1; } - -/* - * Get a partial page, lock it and return it. - */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +#else +static void free_kmem_cache_nodes(struct kmem_cache *s) { - struct page *page; - int searchnode = (node == -1) ? numa_node_id() : node; - - page = get_partial_node(get_node(s, searchnode)); - if (page || (flags & __GFP_THISNODE)) - return page; +} - return get_any_partial(s, flags); +static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +{ + init_kmem_cache_node(&s->local_node); + return 1; } +#endif /* - * Move a page back to the lists. - * - * Must be called with the slab lock held. - * - * On exit the slab lock will have been dropped. + * calculate_sizes() determines the order and the distribution of data within + * a slab object. */ -static void unfreeze_slab(struct kmem_cache *s, struct page *page) +static int calculate_sizes(struct kmem_cache *s) { - struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + unsigned long flags = s->flags; + unsigned long size = s->objsize; + unsigned long align = s->align; - ClearSlabFrozen(page); - if (page->inuse) { + /* + * Determine if we can poison the object itself. If the user of + * the slab may touch the object after free or before allocation + * then we should never poison the object itself. + */ + if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + !s->ctor) + s->flags |= __OBJECT_POISON; + else + s->flags &= ~__OBJECT_POISON; - if (page->freelist) - add_partial(n, page); - else if (SlabDebug(page) && (s->flags & SLAB_STORE_USER)) - add_full(n, page); - slab_unlock(page); + /* + * Round up object size to the next word boundary. We can only + * place the free pointer at word boundaries and this determines + * the possible location of the free pointer. + */ + size = ALIGN(size, sizeof(void *)); - } else { - if (n->nr_partial < MIN_PARTIAL) { - /* - * Adding an empty slab to the partial slabs in order - * to avoid page allocator overhead. 
This slab needs - * to come after the other slabs with objects in - * order to fill them up. That way the size of the - * partial list stays small. kmem_cache_shrink can - * reclaim empty slabs from the partial list. - */ - add_partial_tail(n, page); - slab_unlock(page); - } else { - slab_unlock(page); - discard_slab(s, page); - } - } -} +#ifdef CONFIG_SLUB_DEBUG + /* + * If we are Redzoning then check if there is some space between the + * end of the object and the free pointer. If not then add an + * additional word to have some bytes to store Redzone information. + */ + if ((flags & SLAB_RED_ZONE) && size == s->objsize) + size += sizeof(void *); +#endif -/* - * Remove the cpu slab - */ -static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu) -{ /* - * Merge cpu freelist into freelist. Typically we get here - * because both freelists are empty. So this is unlikely - * to occur. + * With that we have determined the number of bytes in actual use + * by the object. This is the potential offset to the free pointer. */ - while (unlikely(page->lockless_freelist)) { - void **object; + s->inuse = size; + +#ifdef CONFIG_SLUB_DEBUG + if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + s->ctor)) { + /* + * Relocate free pointer after the object if it is not + * permitted to overwrite the first word of the object on + * kmem_cache_free. + * + * This is the case if we do RCU, have a constructor or + * destructor or are poisoning the objects. + */ + s->offset = size; + size += sizeof(void *); + } + + if (flags & SLAB_STORE_USER) + /* + * Need to store information about allocs and frees after + * the object. + */ + size += 2 * sizeof(struct track); + + if (flags & SLAB_RED_ZONE) + /* + * Add some empty padding so that we can catch + * overwrites from earlier objects rather than let + * tracking information or the free pointer be + * corrupted if an user writes before the start + * of the object. + */ + size += sizeof(void *); +#endif + + /* + * Determine the alignment based on various parameters that the + * user specified and the dynamic determination of cache line size + * on bootup. + */ + align = calculate_alignment(flags, align, s->objsize); - /* Retrieve object from cpu_freelist */ - object = page->lockless_freelist; - page->lockless_freelist = page->lockless_freelist[page->offset]; + /* + * SLUB stores one object immediately after another beginning from + * offset 0. In order to align the objects we have to simply size + * each object to conform to the alignment. + */ + size = ALIGN(size, align); + s->size = size; - /* And put onto the regular freelist */ - object[page->offset] = page->freelist; - page->freelist = object; - page->inuse--; - } - s->cpu_slab[cpu] = NULL; - unfreeze_slab(s, page); -} + s->order = calculate_order(size); + if (s->order < 0) + return 0; -static void flush_slab(struct kmem_cache *s, struct page *page, int cpu) -{ - slab_lock(page); - deactivate_slab(s, page, cpu); -} + /* + * Determine the number of objects per slab + */ + s->objects = (PAGE_SIZE << s->order) / size; -/* - * Flush cpu slab. - * Called from IPI handler with interrupts disabled. - */ -static void __flush_cpu_slab(struct kmem_cache *s, int cpu) -{ - struct page *page = s->cpu_slab[cpu]; + /* + * Verify that the number of objects is within permitted limits. + * The page->inuse field is only 16 bit wide! So we cannot have + * more than 64k objects per slab. 
+ */ + if (!s->objects || s->objects > 65535) + return 0; + return 1; - if (likely(page)) - flush_slab(s, page, cpu); } -static void flush_cpu_slab(void *d) +static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, + const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + const struct kmem_cache_ops *ops) { - struct kmem_cache *s = d; - int cpu = smp_processor_id(); - - __flush_cpu_slab(s, cpu); -} + memset(s, 0, kmem_size); + s->name = name; + s->ctor = ctor; + s->ops = ops; + s->objsize = size; + s->flags = flags; + s->align = align; + kmem_cache_open_debug_check(s); -static void flush_all(struct kmem_cache *s) -{ -#ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); -#else - unsigned long flags; + if (!calculate_sizes(s)) + goto error; - local_irq_save(flags); - flush_cpu_slab(s); - local_irq_restore(flags); + s->refcount = 1; +#ifdef CONFIG_NUMA + s->defrag_ratio = 100; #endif + if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + return 1; +error: + if (flags & SLAB_PANIC) + panic("Cannot create slab %s size=%lu realsize=%u " + "order=%u offset=%u flags=%lx\n", + s->name, (unsigned long)size, s->size, s->order, + s->offset, flags); + return 0; } +EXPORT_SYMBOL(kmem_cache_open); /* - * Slow path. The lockless freelist is empty or we need to perform - * debugging duties. - * - * Interrupts are disabled. - * - * Processing is still very fast if new objects have been freed to the - * regular freelist. In that case we simply take over the regular freelist - * as the lockless freelist and zap the regular freelist. - * - * If that is not working then we fall back to the partial lists. We take the - * first element of the freelist as the object to allocate now and move the - * rest of the freelist to the lockless freelist. - * - * And if we were unable to get a new slab from the partial slab lists then - * we need to allocate a new slab. This is slowest path since we may sleep. + * Check if a given pointer is valid */ -static void *__slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr, struct page *page) +int kmem_ptr_validate(struct kmem_cache *s, const void *object) { - void **object; - int cpu = smp_processor_id(); - - if (!page) - goto new_slab; - - slab_lock(page); - if (unlikely(node != -1 && page_to_nid(page) != node)) - goto another_slab; -load_freelist: - object = page->freelist; - if (unlikely(!object)) - goto another_slab; - if (unlikely(SlabDebug(page))) - goto debug; - - object = page->freelist; - page->lockless_freelist = object[page->offset]; - page->inuse = s->objects; - page->freelist = NULL; - slab_unlock(page); - return object; + struct page * page; -another_slab: - deactivate_slab(s, page, cpu); + page = get_object_page(object); -new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - s->cpu_slab[cpu] = page; - goto load_freelist; - } + if (!page || s != page->slab) + /* No slab or wrong slab */ + return 0; - page = new_slab(s, gfpflags, node); - if (page) { - cpu = smp_processor_id(); - if (s->cpu_slab[cpu]) { - /* - * Someone else populated the cpu_slab while we - * enabled interrupts, or we have gotten scheduled - * on another cpu. The page may not be on the - * requested node even if __GFP_THISNODE was - * specified. So we need to recheck. 
- */ - if (node == -1 || - page_to_nid(s->cpu_slab[cpu]) == node) { - /* - * Current cpuslab is acceptable and we - * want the current one since its cache hot - */ - discard_slab(s, page); - page = s->cpu_slab[cpu]; - slab_lock(page); - goto load_freelist; - } - /* New slab does not fit our expectations */ - flush_slab(s, s->cpu_slab[cpu], cpu); - } - slab_lock(page); - SetSlabFrozen(page); - s->cpu_slab[cpu] = page; - goto load_freelist; - } - return NULL; -debug: - object = page->freelist; - if (!alloc_debug_processing(s, page, object, addr)) - goto another_slab; + if (!check_valid_pointer(s, page, object)) + return 0; - page->inuse++; - page->freelist = object[page->offset]; - slab_unlock(page); - return object; + /* + * We could also check if the object is on the slabs freelist. + * But this would be too expensive and it seems that the main + * purpose of kmem_ptr_valid is to check if the object belongs + * to a certain slab. + */ + return 1; } +EXPORT_SYMBOL(kmem_ptr_validate); /* - * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) - * have the fastpath folded into their functions. So no function call - * overhead for requests that can be satisfied on the fastpath. - * - * The fastpath works by first checking if the lockless freelist can be used. - * If not then __slab_alloc is called for slow processing. - * - * Otherwise we can simply pick the next object from the lockless free list. + * Determine the size of a slab object */ -static void __always_inline *slab_alloc(struct kmem_cache *s, - gfp_t gfpflags, int node, void *addr) +unsigned int kmem_cache_size(struct kmem_cache *s) { - struct page *page; - void **object; - unsigned long flags; - - local_irq_save(flags); - page = s->cpu_slab[smp_processor_id()]; - if (unlikely(!page || !page->lockless_freelist || - (node != -1 && page_to_nid(page) != node))) - - object = __slab_alloc(s, gfpflags, node, addr, page); - - else { - object = page->lockless_freelist; - page->lockless_freelist = object[page->offset]; - } - local_irq_restore(flags); - return object; + return s->objsize; } +EXPORT_SYMBOL(kmem_cache_size); -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +const char *kmem_cache_name(struct kmem_cache *s) { - return slab_alloc(s, gfpflags, -1, __builtin_return_address(0)); + return s->name; } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_name); -#ifdef CONFIG_NUMA -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +/* + * Attempt to free all slabs on a node. Return the number of slabs we + * were unable to free. + */ +static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, + struct list_head *list) { - return slab_alloc(s, gfpflags, node, __builtin_return_address(0)); + int slabs_inuse = 0; + unsigned long flags; + struct page *page, *h; + + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry_safe(page, h, list, lru) + if (!page->inuse) { + list_del(&page->lru); + discard_slab(s, page); + } else + slabs_inuse++; + spin_unlock_irqrestore(&n->list_lock, flags); + return slabs_inuse; } -EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif /* - * Slow patch handling. This may still be called frequently since objects - * have a longer lifetime than the cpu slabs in most processing loads. - * - * So we still attempt to reduce cache line usage. Just take the slab - * lock and free the item. If there is no additional partial page - * handling required then we can return immediately. + * Release all resources used by a slab cache. 
*/ -static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, void *addr) +static int kmem_cache_close(struct kmem_cache *s) { - void *prior; - void **object = (void *)x; - - slab_lock(page); - - if (unlikely(SlabDebug(page))) - goto debug; -checks_ok: - prior = object[page->offset] = page->freelist; - page->freelist = object; - page->inuse--; + int node; - if (unlikely(SlabFrozen(page))) - goto out_unlock; + flush_all(s); - if (unlikely(!page->inuse)) - goto slab_empty; + /* Attempt to free all objects */ + for_each_online_node(node) { + struct kmem_cache_node *n = get_node(s, node); - /* - * Objects left in the slab. If it - * was not on the partial list before - * then add it. - */ - if (unlikely(!prior)) - add_partial(get_node(s, page_to_nid(page)), page); + n->nr_partial -= free_list(s, n, &n->partial); + if (atomic_long_read(&n->nr_slabs)) + return 1; + } + free_kmem_cache_nodes(s); + return 0; +} -out_unlock: - slab_unlock(page); - return; +/* + * Close a cache and release the kmem_cache structure + * (must be used for caches created using kmem_cache_create) + */ +void kmem_cache_destroy(struct kmem_cache *s) +{ + down_write(&slub_lock); + s->refcount--; + if (!s->refcount) { + list_del(&s->list); + if (kmem_cache_close(s)) + WARN_ON(1); + sysfs_slab_remove(s); + kfree(s); + } + up_write(&slub_lock); +} +EXPORT_SYMBOL(kmem_cache_destroy); -slab_empty: - if (prior) - /* - * Slab still on the partial list. - */ - remove_partial(s, page); +/******************************************************************** + * Kmalloc subsystem + *******************************************************************/ - slab_unlock(page); - discard_slab(s, page); - return; +struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; +EXPORT_SYMBOL(kmalloc_caches); -debug: - if (!free_debug_processing(s, page, x, addr)) - goto out_unlock; - goto checks_ok; -} +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; +#endif -/* - * Fastpath with forced inlining to produce a kfree and kmem_cache_free that - * can perform fastpath freeing without additional function calls. - * - * The fastpath is only possible if we are freeing to the current cpu slab - * of this processor. This typically the case if we have just allocated - * the item before. - * - * If fastpath is not possible then fall back to __slab_free where we deal - * with all sorts of special processing. 
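/*
 * Editor's aside (illustrative only, not part of this patch): a typical
 * cache lifecycle against the API as modified here. kmem_cache_create()
 * may hand back an existing mergeable cache with its refcount bumped, so
 * kmem_cache_destroy() only tears the cache down once the last user is
 * gone. "struct foo" and the cache name are made up; a NULL ops pointer
 * selects slub_default_ops.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>

struct foo {                            /* made-up payload */
        int a, b;
};

static struct kmem_cache *foo_cache;

static int __init foo_cache_init(void)
{
        /* NULL ctor, NULL ops: plain objects, default slab operations */
        foo_cache = kmem_cache_create("foo_cache", sizeof(struct foo),
                                      0, 0, NULL, NULL);
        return foo_cache ? 0 : -ENOMEM;
}

static struct foo *foo_get(void)
{
        return kmem_cache_zalloc(foo_cache, GFP_KERNEL);
}

static void foo_put(struct foo *f)
{
        kmem_cache_free(foo_cache, f);
}

static void foo_cache_exit(void)
{
        /* Drops the refcount; freed only when the last alias goes away */
        kmem_cache_destroy(foo_cache);
}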
- */ -static void __always_inline slab_free(struct kmem_cache *s, - struct page *page, void *x, void *addr) +static int __init setup_slub_min_order(char *str) { - void **object = (void *)x; - unsigned long flags; + get_option (&str, &slub_min_order); + user_override = 1; + return 1; +} - local_irq_save(flags); - if (likely(page == s->cpu_slab[smp_processor_id()] && - !SlabDebug(page))) { - object[page->offset] = page->lockless_freelist; - page->lockless_freelist = object; - } else - __slab_free(s, page, x, addr); +__setup("slub_min_order=", setup_slub_min_order); - local_irq_restore(flags); +static int __init setup_slub_max_order(char *str) +{ + get_option (&str, &slub_max_order); + user_override = 1; + return 1; } -void kmem_cache_free(struct kmem_cache *s, void *x) +__setup("slub_max_order=", setup_slub_max_order); + +static int __init setup_slub_min_objects(char *str) { - struct page *page; + get_option (&str, &slub_min_objects); + user_override = 1; + return 1; +} - page = virt_to_head_page(x); +__setup("slub_min_objects=", setup_slub_min_objects); - slab_free(s, page, x, __builtin_return_address(0)); +static int __init setup_slub_nomerge(char *str) +{ + slub_nomerge = 1; + return 1; } -EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab object the object resides */ -static struct page *get_object_page(const void *x) +__setup("slub_nomerge", setup_slub_nomerge); + +static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, + const char *name, int size, gfp_t gfp_flags) { - struct page *page = virt_to_head_page(x); + unsigned int flags = 0; - if (!PageSlab(page)) - return NULL; + if (gfp_flags & SLUB_DMA) + flags = SLAB_CACHE_DMA; - return page; + down_write(&slub_lock); + if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + flags, NULL, &slub_default_ops)) + goto panic; + + list_add(&s->list, &slab_caches); + up_write(&slub_lock); + if (sysfs_slab_add(s)) + goto panic; + return s; + +panic: + panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); } -/* - * Object placement in a slab is made very easy because we always start at - * offset 0. If we tune the size of the object to the alignment then we can - * get the required alignment by putting one properly sized object after - * another. - * - * Notice that the allocation order determines the sizes of the per cpu - * caches. Each processor has always one slab available for allocations. - * Increasing the allocation order reduces the number of times that slabs - * must be moved on and off the partial lists and is therefore a factor in - * locking overhead. - */ +static struct kmem_cache *get_slab(size_t size, gfp_t flags) +{ + int index = kmalloc_index(size); -/* - * Set if the user has overridden any of the order related defaults. - */ -static int user_override; + if (!index) + return NULL; -/* - * Mininum / Maximum order of slab pages. This influences locking overhead - * and slab fragmentation. A higher order reduces the number of partial slabs - * and increases the number of allocations possible without having to - * take the list_lock. - */ -static int slub_min_order; -static int slub_max_order = DEFAULT_MAX_ORDER; -static int slub_min_objects = DEFAULT_MIN_OBJECTS; + /* Allocation too large? */ + BUG_ON(index < 0); -/* - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) 
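/*
 * Editor's aside (not part of the patch): the __setup() handlers above
 * correspond to kernel command line options, e.g.
 *
 *      slub_min_order=1 slub_max_order=4 slub_min_objects=16 slub_nomerge
 *
 * Any of the three order/object options sets user_override, which (see
 * kmem_cache_init() later in this patch) suppresses the larger
 * antifragmentation defaults that are otherwise chosen when page grouping
 * by mobility is available. slub_nomerge only disables cache merging and
 * does not touch user_override.
 */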
- */ -static int slub_nomerge; +#ifdef CONFIG_ZONE_DMA + if ((flags & SLUB_DMA)) { + struct kmem_cache *s; + struct kmem_cache *x; + char *text; + size_t realsize; -/* - * Calculate the order of allocation given an slab object size. - * - * The order of allocation has significant impact on performance and other - * system components. Generally order 0 allocations should be preferred since - * order 0 does not cause fragmentation in the page allocator. Larger objects - * be problematic to put into order 0 slabs because there may be too much - * unused space left. We go to a higher order if more than 1/8th of the slab - * would be wasted. - * - * In order to reach satisfactory performance we must ensure that a minimum - * number of objects is in one slab. Otherwise we may generate too much - * activity on the partial lists which requires taking the list_lock. This is - * less a concern for large slabs though which are rarely used. - * - * slub_max_order specifies the order where we begin to stop considering the - * number of objects in a slab as critical. If we reach slub_max_order then - * we try to keep the page order as low as possible. So we accept more waste - * of space in favor of a small page order. - * - * Higher order allocations also allow the placement of more objects in a - * slab and thereby reduce object handling overhead. If the user has - * requested a higher mininum order then we start with that one instead of - * the smallest order which will fit the object. - */ -static inline int slab_order(int size, int min_objects, - int max_order, int fract_leftover) -{ - int order; - int rem; + s = kmalloc_caches_dma[index]; + if (s) + return s; - for (order = max(slub_min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); - order <= max_order; order++) { + /* Dynamically create dma cache */ + x = kmalloc(kmem_size, flags & ~SLUB_DMA); + if (!x) + panic("Unable to allocate memory for dma cache\n"); - unsigned long slab_size = PAGE_SIZE << order; + if (index <= KMALLOC_SHIFT_HIGH) + realsize = 1 << index; + else { + if (index == 1) + realsize = 96; + else + realsize = 192; + } - if (slab_size < min_objects * size) - continue; + text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", + (unsigned int)realsize); + s = create_kmalloc_cache(x, text, realsize, flags); + kmalloc_caches_dma[index] = s; + return s; + } +#endif + return &kmalloc_caches[index]; +} - rem = slab_size % size; +void *__kmalloc(size_t size, gfp_t flags) +{ + struct kmem_cache *s = get_slab(size, flags); - if (rem <= slab_size / fract_leftover) - break; + if (s) + return slab_alloc(s, flags, -1, __builtin_return_address(0)); + return NULL; +} +EXPORT_SYMBOL(__kmalloc); - } +#ifdef CONFIG_NUMA +void *__kmalloc_node(size_t size, gfp_t flags, int node) +{ + struct kmem_cache *s = get_slab(size, flags); - return order; + if (s) + return slab_alloc(s, flags, node, __builtin_return_address(0)); + return NULL; } +EXPORT_SYMBOL(__kmalloc_node); +#endif -static inline int calculate_order(int size) +size_t ksize(const void *object) { - int order; - int min_objects; - int fraction; + struct page *page = get_object_page(object); + struct kmem_cache *s; - /* - * Attempt to find best configuration for a slab. This - * works by first attempting to generate a layout with - * the best configuration and backing off gradually. - * - * First we reduce the acceptable waste in a slab. Then - * we reduce the minimum objects required in a slab. 
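/*
 * Editor's aside (illustrative only, not part of this patch): a userspace
 * rendering of the slab_order() heuristic shown above, assuming 4K pages
 * and slub_min_order = 0. For size=1000, min_objects=4, fract_leftover=8
 * it settles on order 0: a 4096 byte slab holds four objects and wastes
 * 96 bytes, which is within 4096/8.
 */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

static int fls_ul(unsigned long x)      /* highest set bit, 1-based */
{
        int r = 0;

        while (x) {
                x >>= 1;
                r++;
        }
        return r;
}

static int slab_order_sketch(int size, int min_objects, int max_order,
                             int fract_leftover)
{
        int order;
        int min_order = fls_ul((unsigned long)min_objects * size - 1) - PAGE_SHIFT;

        if (min_order < 0)
                min_order = 0;          /* stands in for slub_min_order */

        for (order = min_order; order <= max_order; order++) {
                unsigned long slab_size = PAGE_SIZE << order;
                unsigned long rem;

                if (slab_size < (unsigned long)min_objects * size)
                        continue;
                rem = slab_size % size;
                /* accept once no more than 1/fract_leftover is wasted */
                if (rem <= slab_size / fract_leftover)
                        break;
        }
        return order;
}

int main(void)
{
        printf("order for size=1000: %d\n", slab_order_sketch(1000, 4, 3, 8));
        return 0;
}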
- */ - min_objects = slub_min_objects; - while (min_objects > 1) { - fraction = 8; - while (fraction >= 4) { - order = slab_order(size, min_objects, - slub_max_order, fraction); - if (order <= slub_max_order) - return order; - fraction /= 2; - } - min_objects /= 2; - } + BUG_ON(!page); + s = page->slab; + BUG_ON(!s); /* - * We were unable to place multiple objects in a slab. Now - * lets see if we can place a single object there. + * Debugging requires use of the padding between object + * and whatever may come after it. */ - order = slab_order(size, 1, slub_max_order, 1); - if (order <= slub_max_order) - return order; + if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) + return s->objsize; /* - * Doh this slab cannot be placed using slub_max_order. + * If we have the need to store the freelist pointer + * back there or track user information then we can + * only use the space before that information. */ - order = slab_order(size, 1, MAX_ORDER, 1); - if (order <= MAX_ORDER) - return order; - return -ENOSYS; -} + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + return s->inuse; -/* - * Figure out what the alignment of the objects will be. - */ -static unsigned long calculate_alignment(unsigned long flags, - unsigned long align, unsigned long size) -{ /* - * If the user wants hardware cache aligned objects then - * follow that suggestion if the object is sufficiently - * large. - * - * The hardware cache alignment cannot override the - * specified alignment though. If that is greater - * then use it. + * Else we can use all the padding etc for the allocation */ - if ((flags & SLAB_HWCACHE_ALIGN) && - size > cache_line_size() / 2) - return max_t(unsigned long, align, cache_line_size()); - - if (align < ARCH_SLAB_MINALIGN) - return ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - -static void init_kmem_cache_node(struct kmem_cache_node *n) -{ - n->nr_partial = 0; - atomic_long_set(&n->nr_slabs, 0); - spin_lock_init(&n->list_lock); - INIT_LIST_HEAD(&n->partial); - INIT_LIST_HEAD(&n->full); + return s->size; } +EXPORT_SYMBOL(ksize); -#ifdef CONFIG_NUMA -/* - * No kmalloc_node yet so do it by hand. We know that this is the first - * slab on the node for this slabcache. There are no concurrent accesses - * possible. - * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. 
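/*
 * Editor's aside (illustrative only, not part of this patch): ksize()
 * reports how many bytes are actually usable behind a kmalloc() pointer,
 * which can exceed the requested size because requests are rounded up to
 * a kmalloc cache size (and, per the function above, may be trimmed when
 * debugging metadata sits behind the object). A 100 byte request is
 * normally served from kmalloc-128, so the slack can be used without
 * reallocating.
 */
#include <linux/kernel.h>
#include <linux/slab.h>

static void *ksize_demo(void)
{
        char *buf = kmalloc(100, GFP_KERNEL);

        if (!buf)
                return NULL;
        /* Typically prints 128 here; always >= 100 */
        printk(KERN_INFO "usable size: %zu\n", ksize(buf));
        return buf;
}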
- */ -static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflags, - int node) +void kfree(const void *x) { + struct kmem_cache *s; struct page *page; - struct kmem_cache_node *n; - - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); - - page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node); - /* new_slab() disables interupts */ - local_irq_enable(); - BUG_ON(!page); - n = page->freelist; - BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); - page->inuse++; - kmalloc_caches->node[node] = n; - setup_object_debug(kmalloc_caches, page, n); - init_kmem_cache_node(n); - atomic_long_inc(&n->nr_slabs); - add_partial(n, page); - return n; -} + if (!x) + return; -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ - int node; + page = virt_to_head_page(x); + s = page->slab; - for_each_online_node(node) { - struct kmem_cache_node *n = s->node[node]; - if (n && n != &s->local_node) - kmem_cache_free(kmalloc_caches, n); - s->node[node] = NULL; - } + slab_free(s, page, (void *)x, __builtin_return_address(0)); } +EXPORT_SYMBOL(kfree); -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +/* + * Order the freelist so that addresses increase as object are allocated. + * This is useful to trigger the cpu cacheline prefetching logic. + */ +void resequence_freelist(struct kmem_cache *s, struct page *page) { - int node; - int local_node; - - if (slab_state >= UP) - local_node = page_to_nid(virt_to_page(s)); - else - local_node = 0; - - for_each_online_node(node) { - struct kmem_cache_node *n; + void *p; + void *last; + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); - if (local_node == node) - n = &s->local_node; - else { - if (slab_state == DOWN) { - n = early_kmem_cache_node_alloc(gfpflags, - node); - continue; - } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); + bitmap_zero(map, s->objects); - if (!n) { - free_kmem_cache_nodes(s); - return 0; - } + /* Figure out which objects are on the freelist */ + for_each_free_object(p, s, page->freelist) + set_bit(slab_index(p, s, addr), map); + last = NULL; + for_each_object(p, s, addr) + if (test_bit(slab_index(p, s, addr), map)) { + if (last) + set_freepointer(s, last, p); + else + page->freelist = p; + last = p; } - s->node[node] = n; - init_kmem_cache_node(n); - } - return 1; -} -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) -{ - init_kmem_cache_node(&s->local_node); - return 1; + if (last) + set_freepointer(s, last, NULL); + else + page->freelist = NULL; } -#endif /* - * calculate_sizes() determines the order and the distribution of data within - * a slab object. + * Vacate all objects in the given slab. + * + * Slab must be locked and frozen. Interrupts are disabled (flags must + * be passed). + * + * Will drop and regain and drop the slab lock. At the end the slab will + * either be freed or returned to the partial lists. 
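/*
 * Editor's aside (illustrative only, not part of this patch): the two
 * passes in resequence_freelist() above -- mark each free object's slot
 * index in a bitmap, then walk the slots in address order and rechain
 * them -- as a userspace toy. Slot count and payload size are made up.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define NR_OBJECTS 8

struct slot {
        struct slot *next;      /* freelist link stored inside the object */
        char payload[56];
};

static struct slot slab[NR_OBJECTS];

static struct slot *resequence(struct slot *freelist)
{
        bool free_map[NR_OBJECTS] = { false };
        struct slot *last = NULL, *head = NULL;
        struct slot *p;
        int i;

        /* Pass 1: which slots are on the freelist? */
        for (p = freelist; p; p = p->next)
                free_map[p - slab] = true;

        /* Pass 2: rechain them in increasing address order */
        for (i = 0; i < NR_OBJECTS; i++) {
                if (!free_map[i])
                        continue;
                if (last)
                        last->next = &slab[i];
                else
                        head = &slab[i];
                last = &slab[i];
        }
        if (last)
                last->next = NULL;
        return head;
}

int main(void)
{
        /* Build a deliberately scrambled freelist: 5 -> 1 -> 7 */
        slab[5].next = &slab[1];
        slab[1].next = &slab[7];
        slab[7].next = NULL;

        for (struct slot *p = resequence(&slab[5]); p; p = p->next)
                printf("free slot %td\n", p - slab);    /* prints 1, 5, 7 */
        return 0;
}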
+ * + * Returns the number of remaining objects */ -static int calculate_sizes(struct kmem_cache *s) +static int __kmem_cache_vacate(struct kmem_cache *s, + struct page *page, unsigned long flags, void **vector) { - unsigned long flags = s->flags; - unsigned long size = s->objsize; - unsigned long align = s->align; + void *p; + void *addr = page_address(page); + DECLARE_BITMAP(map, s->objects); + int leftover; + int objects; + void *private; + + if (!page->inuse) + goto out; + + /* Determine used objects */ + bitmap_fill(map, s->objects); + for_each_free_object(p, s, page->freelist) + __clear_bit(slab_index(p, s, addr), map); - /* - * Determine if we can poison the object itself. If the user of - * the slab may touch the object after free or before allocation - * then we should never poison the object itself. - */ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && - !s->ctor) - s->flags |= __OBJECT_POISON; - else - s->flags &= ~__OBJECT_POISON; + objects = 0; + memset(vector, 0, s->objects * sizeof(void **)); + for_each_object(p, s, addr) { + if (test_bit(slab_index(p, s, addr), map)) + vector[objects++] = p; + } + + private = s->ops->get(s, objects, vector); /* - * Round up object size to the next word boundary. We can only - * place the free pointer at word boundaries and this determines - * the possible location of the free pointer. + * Got references. Now we can drop the slab lock. The slab + * is frozen so it cannot vanish from under us nor will + * allocations be performed on the slab. However, unlocking the + * slab will allow concurrent slab_frees to proceed. */ - size = ALIGN(size, sizeof(void *)); + slab_unlock(page); + local_irq_restore(flags); -#ifdef CONFIG_SLUB_DEBUG /* - * If we are Redzoning then check if there is some space between the - * end of the object and the free pointer. If not then add an - * additional word to have some bytes to store Redzone information. + * Perform the KICK callbacks to remove the objects. */ - if ((flags & SLAB_RED_ZONE) && size == s->objsize) - size += sizeof(void *); -#endif + s->ops->kick(s, objects, vector, private); + local_irq_save(flags); + slab_lock(page); +out: /* - * With that we have determined the number of bytes in actual use - * by the object. This is the potential offset to the free pointer. + * Check the result and unfreeze the slab */ - s->inuse = size; - -#ifdef CONFIG_SLUB_DEBUG - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || - s->ctor)) { + leftover = page->inuse; + if (leftover > 0) /* - * Relocate free pointer after the object if it is not - * permitted to overwrite the first word of the object on - * kmem_cache_free. - * - * This is the case if we do RCU, have a constructor or - * destructor or are poisoning the objects. + * Cannot free. Lets at least optimize the freelist. We have + * likely touched all the cachelines with the free pointers + * already so it is cheap to do here. */ - s->offset = size; - size += sizeof(void *); - } + resequence_freelist(s, page); + unfreeze_slab(s, page); + local_irq_restore(flags); + return leftover; +} - if (flags & SLAB_STORE_USER) - /* - * Need to store information about allocs and frees after - * the object. - */ - size += 2 * sizeof(struct track); +/* + * Get a page off a list and freeze it. Must be holding slab lock. 
+ */ +static void freeze_from_list(struct kmem_cache *s, struct page *page) +{ + if (page->inuse < s->objects) + remove_partial(s, page); + else if (s->flags & SLAB_STORE_USER) + remove_full(s, page); + SetSlabFrozen(page); +} - if (flags & SLAB_RED_ZONE) - /* - * Add some empty padding so that we can catch - * overwrites from earlier objects rather than let - * tracking information or the free pointer be - * corrupted if an user writes before the start - * of the object. - */ - size += sizeof(void *); -#endif +/* + * Attempt to free objects in a page. Return 1 if succesful. + */ +int kmem_cache_vacate(struct page *page) +{ + unsigned long flags; + struct kmem_cache *s; + int vacated = 0; + void **vector = NULL; /* - * Determine the alignment based on various parameters that the - * user specified and the dynamic determination of cache line size - * on bootup. + * Get a reference to the page. Return if its freed or being freed. + * This is necessary to make sure that the page does not vanish + * from under us before we are able to check the result. */ - align = calculate_alignment(flags, align, s->objsize); + if (!get_page_unless_zero(page)) + return 0; - /* - * SLUB stores one object immediately after another beginning from - * offset 0. In order to align the objects we have to simply size - * each object to conform to the alignment. - */ - size = ALIGN(size, align); - s->size = size; + if (!PageSlab(page)) + goto out; - s->order = calculate_order(size); - if (s->order < 0) + s = page->slab; + if (!s) + goto out; + + vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL); + if (!vector) return 0; + local_irq_save(flags); /* - * Determine the number of objects per slab + * The implicit memory barrier in slab_lock guarantees that page->inuse + * is loaded after PageSlab(page) has been established to be true. This is + * only revelant for a newly created slab. */ - s->objects = (PAGE_SIZE << s->order) / size; + slab_lock(page); /* - * Verify that the number of objects is within permitted limits. - * The page->inuse field is only 16 bit wide! So we cannot have - * more than 64k objects per slab. + * We may now have locked a page that may be in various stages of + * being freed. If the PageSlab bit is off then we have already + * reached the page allocator. If page->inuse is zero then we are + * in SLUB but freeing or allocating the page. + * page->inuse is never modified without the slab lock held. + * + * Also abort if the page happens to be already frozen. If its + * frozen then a concurrent vacate may be in progress. */ - if (!s->objects || s->objects > 65535) - return 0; - return 1; + if (!PageSlab(page) || SlabFrozen(page) || !page->inuse) + goto out_locked; + + /* + * We are holding a lock on a slab page and all operations on the + * slab are blocking. + */ + if (!s->ops->get || !s->ops->kick) + goto out_locked; + freeze_from_list(s, page); + vacated = __kmem_cache_vacate(s, page, flags, vector) == 0; +out: + put_page(page); + kfree(vector); + return vacated; +out_locked: + slab_unlock(page); + local_irq_restore(flags); + goto out; } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, - const char *name, size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - const struct kmem_cache_ops *ops) +/* + * kmem_cache_shrink removes empty slabs from the partial lists and sorts + * the remaining slabs by the number of items in use. The slabs with the + * most items in use come first. 
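/*
 * Editor's aside (illustrative only, not part of this patch): what a
 * defragmentation-aware cache might plug into kmem_cache_ops. The member
 * signatures are inferred from the call sites in __kmem_cache_vacate()
 * above; the helper functions are entirely hypothetical. get() must pin
 * every object so it cannot disappear while the slab lock is dropped;
 * kick() must then free (or relocate) whatever objects it can.
 */
static void *example_get(struct kmem_cache *s, int nr, void **v)
{
        int i;

        for (i = 0; i < nr; i++)
                example_object_grab(v[i]);      /* hypothetical: take a ref */
        return NULL;                            /* private data for kick() */
}

static void example_kick(struct kmem_cache *s, int nr, void **v, void *private)
{
        int i;

        for (i = 0; i < nr; i++)
                example_object_evict(v[i]);     /* hypothetical: drop the ref
                                                 * and free if unused */
}

static const struct kmem_cache_ops example_ops = {
        .get  = example_get,
        .kick = example_kick,
};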
New allocations will then fill those up + * and thus they can be removed from the partial lists. + * + * The slabs with the least items are placed last. This results in them + * being allocated from last increasing the chance that the last objects + * are freed in them. + */ +int kmem_cache_shrink(struct kmem_cache *s) { - memset(s, 0, kmem_size); - s->name = name; - s->ctor = ctor; - s->ops = ops; - s->objsize = size; - s->flags = flags; - s->align = align; - kmem_cache_open_debug_check(s); + int node; + int i; + struct kmem_cache_node *n; + struct page *page, *page2; + struct page *t; + struct list_head *slabs_by_inuse = + kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); + unsigned long flags; + LIST_HEAD(zaplist); + unsigned long t0; + unsigned long t0_partial; + unsigned long discarded; + unsigned long defrag_success; + unsigned long defrag_fail; - if (!calculate_sizes(s)) - goto error; + if (!slabs_by_inuse) + return -ENOMEM; + + flush_all(s); + for_each_online_node(node) { + t0 = jiffies; + n = get_node(s, node); + + if (!n->nr_partial) + continue; + + t0_partial = n->nr_partial; + discarded = 0; + defrag_success = 0; + defrag_fail = 0; + + for (i = 0; i < s->objects; i++) + INIT_LIST_HEAD(slabs_by_inuse + i); + + spin_lock_irqsave(&n->list_lock, flags); + + /* + * Build lists indexed by the items in use in each slab. + * + * Note that concurrent frees may occur while we hold the + * list_lock. page->inuse here is the upper limit. + */ + list_for_each_entry_safe(page, t, &n->partial, lru) { + int inuse = page->inuse; + + if (!inuse && slab_trylock(page)) { + /* + * Must hold slab lock here because slab_free + * may have freed the last object and be + * waiting to release the slab. + */ + list_del(&page->lru); + n->nr_partial--; + slab_unlock(page); + discard_slab(s, page); + discarded++; + } else + if (inuse < (2 * s->objects) / 3 && + n->nr_partial > MAX_PARTIAL) { + list_move(&page->lru, + slabs_by_inuse + page->inuse); + } + } - s->refcount = 1; -#ifdef CONFIG_NUMA - s->defrag_ratio = 100; -#endif - if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) - return 1; -error: - if (flags & SLAB_PANIC) - panic("Cannot create slab %s size=%lu realsize=%u " - "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)size, s->size, s->order, - s->offset, flags); - return 0; -} -EXPORT_SYMBOL(kmem_cache_open); + if (n->nr_partial <= MAX_PARTIAL) + goto out; -/* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) -{ - struct page * page; + /* + * Rebuild the partial list with the slabs filled up most + * first and the least used slabs at the end. + */ + for (i = s->objects - 1; i >= 0; i--) + list_splice(slabs_by_inuse + i, n->partial.prev); - page = get_object_page(object); + /* + * If we have no functions available to defragment the slabs + * then we are done. + */ + if (!s->ops->get || !s->ops->kick) + goto out; - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; + /* Take objects with just a few objects off the tail */ + while (n->nr_partial > MAX_PARTIAL) { + page = container_of(n->partial.prev, struct page, lru); - if (!check_valid_pointer(s, page, object)) - return 0; + /* + * We are holding the list_lock so we can only + * trylock the slab + */ + if (page->inuse > s->objects / 3) + break; - /* - * We could also check if the object is on the slabs freelist. 
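/*
 * Editor's aside (not part of the patch), reading the thresholds above
 * with an example slab size of s->objects = 30: empty partial slabs are
 * always discarded; a partial slab is only queued for sorting when it is
 * less than two thirds full (inuse < 20) and the node still has more than
 * MAX_PARTIAL partial slabs; and the tail-end vacate loop gives up on any
 * slab that is more than one third full (inuse > 10), since moving that
 * many live objects is unlikely to pay off.
 */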
- * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); + if (!slab_trylock(page)) + break; -/* - * Determine the size of a slab object - */ -unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->objsize; -} -EXPORT_SYMBOL(kmem_cache_size); + list_move_tail(&page->lru, &zaplist); + n->nr_partial--; + SetSlabFrozen(page); + slab_unlock(page); + } + out: + spin_unlock_irqrestore(&n->list_lock, flags); -const char *kmem_cache_name(struct kmem_cache *s) -{ - return s->name; -} -EXPORT_SYMBOL(kmem_cache_name); + /* Now we can free objects in the slabs on the zaplist */ + list_for_each_entry_safe(page, page2, &zaplist, lru) { + unsigned long flags; -/* - * Attempt to free all slabs on a node. Return the number of slabs we - * were unable to free. - */ -static int free_list(struct kmem_cache *s, struct kmem_cache_node *n, - struct list_head *list) -{ - int slabs_inuse = 0; - unsigned long flags; - struct page *page, *h; + local_irq_save(flags); + slab_lock(page); + if (__kmem_cache_vacate(s, page, flags, + (void **)slabs_by_inuse)) + defrag_fail++; + else + defrag_success++; + } + printk(KERN_INFO "Defrag %s: node=%d jiff=%lu, partial=%lu, " + "discard=%lu, defrag_success=%lu defrag_fail=%lu Pafter=%lu\n", + s->name, node, jiffies - t0, t0_partial, + discarded, defrag_success, defrag_fail, n->nr_partial); + } - spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry_safe(page, h, list, lru) - if (!page->inuse) { - list_del(&page->lru); - discard_slab(s, page); - } else - slabs_inuse++; - spin_unlock_irqrestore(&n->list_lock, flags); - return slabs_inuse; + kfree(slabs_by_inuse); + return 0; } +EXPORT_SYMBOL(kmem_cache_shrink); -/* - * Release all resources used by a slab cache. +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. 
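/*
 * Editor's aside (illustrative only, not part of this patch): the usual
 * calling pattern for krealloc(). Keep the old pointer until the call
 * succeeds -- on failure the original block is left untouched -- and
 * remember that krealloc(p, 0, ...) frees p and returns NULL. The helper
 * below is a made-up example.
 */
#include <linux/errno.h>
#include <linux/slab.h>

static int append_byte(char **bufp, size_t *lenp, char c)
{
        char *new = krealloc(*bufp, *lenp + 1, GFP_KERNEL);

        if (!new)
                return -ENOMEM;         /* *bufp is still valid here */
        new[*lenp] = c;
        *bufp = new;
        *lenp += 1;
        return 0;
}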
*/ -static int kmem_cache_close(struct kmem_cache *s) +void *krealloc(const void *p, size_t new_size, gfp_t flags) { - int node; - - flush_all(s); + void *ret; + size_t ks; - /* Attempt to free all objects */ - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + if (unlikely(!p)) + return kmalloc(new_size, flags); - n->nr_partial -= free_list(s, n, &n->partial); - if (atomic_long_read(&n->nr_slabs)) - return 1; + if (unlikely(!new_size)) { + kfree(p); + return NULL; } - free_kmem_cache_nodes(s); - return 0; -} -/* - * Close a cache and release the kmem_cache structure - * (must be used for caches created using kmem_cache_create) - */ -void kmem_cache_destroy(struct kmem_cache *s) -{ - down_write(&slub_lock); - s->refcount--; - if (!s->refcount) { - list_del(&s->list); - if (kmem_cache_close(s)) - WARN_ON(1); - sysfs_slab_remove(s); - kfree(s); + ks = ksize(p); + if (ks >= new_size) + return (void *)p; + + ret = kmalloc(new_size, flags); + if (ret) { + memcpy(ret, p, min(new_size, ks)); + kfree(p); } - up_write(&slub_lock); + return ret; } -EXPORT_SYMBOL(kmem_cache_destroy); +EXPORT_SYMBOL(krealloc); /******************************************************************** - * Kmalloc subsystem + * Basic setup of slabs *******************************************************************/ -struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __cacheline_aligned; -EXPORT_SYMBOL(kmalloc_caches); +void __init kmem_cache_init(void) +{ + int i; -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[KMALLOC_SHIFT_HIGH + 1]; + if (!page_group_by_mobility_disabled && !user_override) { + /* + * Antifrag support available. Increase usable + * page order and generate slabs with more objects. + */ + slub_max_order = DEFAULT_ANTIFRAG_MAX_ORDER; + slub_min_objects = DEFAULT_ANTIFRAG_MIN_OBJECTS; + } + +#ifdef CONFIG_NUMA + /* + * Must first have the slab cache available for the allocations of the + * struct kmem_cache_node's. There is special bootstrap code in + * kmem_cache_open for slab_state == DOWN. + */ + create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", + sizeof(struct kmem_cache_node), GFP_KERNEL); #endif -static int __init setup_slub_min_order(char *str) -{ - get_option (&str, &slub_min_order); - user_override = 1; - return 1; -} + /* Able to allocate the per node structures */ + slab_state = PARTIAL; -__setup("slub_min_order=", setup_slub_min_order); + /* Caches that are not of the two-to-the-power-of size */ + create_kmalloc_cache(&kmalloc_caches[1], + "kmalloc-96", 96, GFP_KERNEL); + create_kmalloc_cache(&kmalloc_caches[2], + "kmalloc-192", 192, GFP_KERNEL); -static int __init setup_slub_max_order(char *str) -{ - get_option (&str, &slub_max_order); - user_override = 1; - return 1; -} + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + create_kmalloc_cache(&kmalloc_caches[i], + "kmalloc", 1 << i, GFP_KERNEL); -__setup("slub_max_order=", setup_slub_max_order); + slab_state = UP; -static int __init setup_slub_min_objects(char *str) -{ - get_option (&str, &slub_min_objects); - user_override = 1; - return 1; -} + /* Provide the correct kmalloc names now that the caches are up */ + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) + kmalloc_caches[i]. 
name = + kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); -__setup("slub_min_objects=", setup_slub_min_objects); +#ifdef CONFIG_SMP + register_cpu_notifier(&slab_notifier); +#endif -static int __init setup_slub_nomerge(char *str) -{ - slub_nomerge = 1; - return 1; + kmem_size = offsetof(struct kmem_cache, cpu_slab) + + nr_cpu_ids * sizeof(struct page *); + + printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," + " Processors=%d, Nodes=%d\n", + KMALLOC_SHIFT_HIGH, cache_line_size(), + slub_min_order, slub_max_order, slub_min_objects, + nr_cpu_ids, nr_node_ids); } -__setup("slub_nomerge", setup_slub_nomerge); - -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) +/* + * Find a mergeable slab cache + */ +static int slab_unmergeable(struct kmem_cache *s) { - unsigned int flags = 0; - - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; + if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) + return 1; - down_write(&slub_lock); - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, - flags, NULL, &slub_default_ops)) - goto panic; + if (s->ctor) + return 1; - list_add(&s->list, &slab_caches); - up_write(&slub_lock); - if (sysfs_slab_add(s)) - goto panic; - return s; + if (s->ops != &slub_default_ops) + return 1; -panic: - panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); + return 0; } -static struct kmem_cache *get_slab(size_t size, gfp_t flags) +static struct kmem_cache *find_mergeable(size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + const struct kmem_cache_ops *ops) { - int index = kmalloc_index(size); + struct list_head *h; - if (!index) + if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) return NULL; - /* Allocation too large? */ - BUG_ON(index < 0); - -#ifdef CONFIG_ZONE_DMA - if ((flags & SLUB_DMA)) { - struct kmem_cache *s; - struct kmem_cache *x; - char *text; - size_t realsize; + if (ctor) + return NULL; - s = kmalloc_caches_dma[index]; - if (s) - return s; + if (ops != &slub_default_ops) + return NULL; - /* Dynamically create dma cache */ - x = kmalloc(kmem_size, flags & ~SLUB_DMA); - if (!x) - panic("Unable to allocate memory for dma cache\n"); + size = ALIGN(size, sizeof(void *)); + align = calculate_alignment(flags, align, size); + size = ALIGN(size, align); - if (index <= KMALLOC_SHIFT_HIGH) - realsize = 1 << index; - else { - if (index == 1) - realsize = 96; - else - realsize = 192; - } + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = create_kmalloc_cache(x, text, realsize, flags); - kmalloc_caches_dma[index] = s; - return s; - } -#endif - return &kmalloc_caches[index]; -} + if (slab_unmergeable(s)) + continue; -void *__kmalloc(size_t size, gfp_t flags) -{ - struct kmem_cache *s = get_slab(size, flags); + if (size > s->size) + continue; - if (s) - return slab_alloc(s, flags, -1, __builtin_return_address(0)); - return NULL; -} -EXPORT_SYMBOL(__kmalloc); + if ((flags & SLUB_MERGE_SAME) != + (s->flags & SLUB_MERGE_SAME)) + continue; + /* + * Check if alignment is compatible. 
+ * Courtesy of Adrian Drzewiecki + */ + if ((s->size & ~(align -1)) != s->size) + continue; -#ifdef CONFIG_NUMA -void *__kmalloc_node(size_t size, gfp_t flags, int node) -{ - struct kmem_cache *s = get_slab(size, flags); + if (s->size - size >= sizeof(void *)) + continue; - if (s) - return slab_alloc(s, flags, node, __builtin_return_address(0)); + return s; + } return NULL; } -EXPORT_SYMBOL(__kmalloc_node); -#endif -size_t ksize(const void *object) +struct kmem_cache *kmem_cache_create(const char *name, size_t size, + size_t align, unsigned long flags, + void (*ctor)(void *, struct kmem_cache *, unsigned long), + const struct kmem_cache_ops *ops) { - struct page *page = get_object_page(object); struct kmem_cache *s; - BUG_ON(!page); - s = page->slab; - BUG_ON(!s); - - /* - * Debugging requires use of the padding between object - * and whatever may come after it. - */ - if (s->flags & (SLAB_RED_ZONE | SLAB_POISON)) - return s->objsize; + if (!ops) + ops = &slub_default_ops; - /* - * If we have the need to store the freelist pointer - * back there or track user information then we can - * only use the space before that information. - */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) - return s->inuse; + down_write(&slub_lock); + s = find_mergeable(size, align, flags, ctor, ops); + if (s) { + s->refcount++; + /* + * Adjust the object sizes so that we clear + * the complete object on kzalloc. + */ + s->objsize = max(s->objsize, (int)size); + s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + if (sysfs_slab_alias(s, name)) + goto err; + } else { + s = kmalloc(kmem_size, GFP_KERNEL); + if (s && kmem_cache_open(s, GFP_KERNEL, name, + size, align, flags, ctor, ops)) { + if (sysfs_slab_add(s)) { + kfree(s); + goto err; + } + list_add(&s->list, &slab_caches); + raise_kswapd_order(s->order); + } else + kfree(s); + } + up_write(&slub_lock); + return s; - /* - * Else we can use all the padding etc for the allocation - */ - return s->size; +err: + up_write(&slub_lock); + if (flags & SLAB_PANIC) + panic("Cannot create slabcache %s\n", name); + else + s = NULL; + return s; } -EXPORT_SYMBOL(ksize); +EXPORT_SYMBOL(kmem_cache_create); -void kfree(const void *x) +void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags) { - struct kmem_cache *s; - struct page *page; + void *x; - if (!x) - return; + x = slab_alloc(s, flags, -1, __builtin_return_address(0)); + if (x) + memset(x, 0, s->objsize); + return x; +} +EXPORT_SYMBOL(kmem_cache_zalloc); - page = virt_to_head_page(x); - s = page->slab; +#ifdef CONFIG_SMP +static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu) +{ + struct list_head *h; - slab_free(s, page, (void *)x, __builtin_return_address(0)); + down_read(&slub_lock); + list_for_each(h, &slab_caches) { + struct kmem_cache *s = + container_of(h, struct kmem_cache, list); + + func(s, cpu); + } + up_read(&slub_lock); } -EXPORT_SYMBOL(kfree); /* - * Order the freelist so that addresses increase as object are allocated. - * This is useful to trigger the cpu cacheline prefetching logic. + * Use the cpu notifier to insure that the cpu slabs are flushed when + * necessary. 
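/*
 * Editor's aside (not part of the patch), a concrete reading of the merge
 * rules above, assuming an 8 byte word, ARCH_SLAB_MINALIGN of 8 and no
 * debug flags: a request for 44 byte objects with no ctor and default ops
 * is rounded up to 48 bytes. kmalloc-64 is not reused for it because
 * 64 - 48 >= sizeof(void *), but an existing 48 byte cache with the same
 * SLUB_MERGE_SAME flags is. When a merge happens the surviving cache's
 * objsize and inuse are raised to the larger request so that
 * kmem_cache_zalloc()/kzalloc() still clear the whole object.
 */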
*/ -void resequence_freelist(struct kmem_cache *s, struct page *page) +static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { - void *p; - void *last; - void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); - - bitmap_zero(map, s->objects); + long cpu = (long)hcpu; - /* Figure out which objects are on the freelist */ - for_each_free_object(p, s, page->freelist) - set_bit(slab_index(p, s, addr), map); + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + for_all_slabs(__flush_cpu_slab, cpu); + break; + default: + break; + } + return NOTIFY_OK; +} - last = NULL; - for_each_object(p, s, addr) - if (test_bit(slab_index(p, s, addr), map)) { - if (last) - set_freepointer(s, last, p); - else - page->freelist = p; - last = p; - } +static struct notifier_block __cpuinitdata slab_notifier = + { &slab_cpuup_callback, NULL, 0 }; - if (last) - set_freepointer(s, last, NULL); - else - page->freelist = NULL; -} +#endif -/* - * Vacate all objects in the given slab. - * - * Slab must be locked and frozen. Interrupts are disabled (flags must - * be passed). - * - * Will drop and regain and drop the slab lock. At the end the slab will - * either be freed or returned to the partial lists. - * - * Returns the number of remaining objects - */ -static int __kmem_cache_vacate(struct kmem_cache *s, - struct page *page, unsigned long flags, void **vector) +void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller) { - void *p; - void *addr = page_address(page); - DECLARE_BITMAP(map, s->objects); - int leftover; - int objects; - void *private; + struct kmem_cache *s = get_slab(size, gfpflags); - if (!page->inuse) - goto out; + if (!s) + return NULL; - /* Determine used objects */ - bitmap_fill(map, s->objects); - for_each_free_object(p, s, page->freelist) - __clear_bit(slab_index(p, s, addr), map); + return slab_alloc(s, gfpflags, -1, caller); +} - objects = 0; - memset(vector, 0, s->objects * sizeof(void **)); - for_each_object(p, s, addr) { - if (test_bit(slab_index(p, s, addr), map)) - vector[objects++] = p; - } +void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, + int node, void *caller) +{ + struct kmem_cache *s = get_slab(size, gfpflags); - private = s->ops->get(s, objects, vector); + if (!s) + return NULL; - /* - * Got references. Now we can drop the slab lock. The slab - * is frozen so it cannot vanish from under us nor will - * allocations be performed on the slab. However, unlocking the - * slab will allow concurrent slab_frees to proceed. - */ - slab_unlock(page); - local_irq_restore(flags); + return slab_alloc(s, gfpflags, node, caller); +} - /* - * Perform the KICK callbacks to remove the objects. - */ - s->ops->kick(s, objects, vector, private); +#ifdef CONFIG_SLUB_DEBUG +/* + * Debug settings: + */ +static int slub_debug; - local_irq_save(flags); - slab_lock(page); -out: - /* - * Check the result and unfreeze the slab - */ - leftover = page->inuse; - if (leftover > 0) - /* - * Cannot free. Lets at least optimize the freelist. We have - * likely touched all the cachelines with the free pointers - * already so it is cheap to do here. - */ - resequence_freelist(s, page); - unfreeze_slab(s, page); - local_irq_restore(flags); - return leftover; -} +static char *slub_debug_slabs; /* - * Get a page off a list and freeze it. Must be holding slab lock. 
+ * Object debugging */ -static void freeze_from_list(struct kmem_cache *s, struct page *page) +static void print_section(char *text, u8 *addr, unsigned int length) { - if (page->inuse < s->objects) - remove_partial(s, page); - else if (s->flags & SLAB_STORE_USER) - remove_full(s, page); - SetSlabFrozen(page); + int i, offset; + int newline = 1; + char ascii[17]; + + ascii[16] = 0; + + for (i = 0; i < length; i++) { + if (newline) { + printk(KERN_ERR "%10s 0x%p: ", text, addr + i); + newline = 0; + } + printk(" %02x", addr[i]); + offset = i % 16; + ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; + if (offset == 15) { + printk(" %s\n",ascii); + newline = 1; + } + } + if (!newline) { + i %= 16; + while (i < 16) { + printk(" "); + ascii[i] = ' '; + i++; + } + printk(" %s\n", ascii); + } } -/* - * Attempt to free objects in a page. Return 1 if succesful. - */ -int kmem_cache_vacate(struct page *page) +static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) { - unsigned long flags; - struct kmem_cache *s; - int vacated = 0; - void **vector = NULL; + struct track *p; - /* - * Get a reference to the page. Return if its freed or being freed. - * This is necessary to make sure that the page does not vanish - * from under us before we are able to check the result. - */ - if (!get_page_unless_zero(page)) - return 0; + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; - if (!PageSlab(page)) - goto out; + return p + alloc; +} - s = page->slab; - if (!s) - goto out; +static void set_track(struct kmem_cache *s, void *object, + enum track_item alloc, void *addr) +{ + struct track *p; - vector = kmalloc(s->objects * sizeof(void *), GFP_KERNEL); - if (!vector) - return 0; + if (s->offset) + p = object + s->offset + sizeof(void *); + else + p = object + s->inuse; - local_irq_save(flags); - /* - * The implicit memory barrier in slab_lock guarantees that page->inuse - * is loaded after PageSlab(page) has been established to be true. This is - * only revelant for a newly created slab. - */ - slab_lock(page); + p += alloc; + if (addr) { + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current ? current->pid : -1; + p->when = jiffies; + } else + memset(p, 0, sizeof(struct track)); +} - /* - * We may now have locked a page that may be in various stages of - * being freed. If the PageSlab bit is off then we have already - * reached the page allocator. If page->inuse is zero then we are - * in SLUB but freeing or allocating the page. - * page->inuse is never modified without the slab lock held. - * - * Also abort if the page happens to be already frozen. If its - * frozen then a concurrent vacate may be in progress. - */ - if (!PageSlab(page) || SlabFrozen(page) || !page->inuse) - goto out_locked; +static void init_tracking(struct kmem_cache *s, void *object) +{ + if (s->flags & SLAB_STORE_USER) { + set_track(s, object, TRACK_FREE, NULL); + set_track(s, object, TRACK_ALLOC, NULL); + } +} - /* - * We are holding a lock on a slab page and all operations on the - * slab are blocking. 
- */ - if (!s->ops->get || !s->ops->kick) - goto out_locked; - freeze_from_list(s, page); - vacated = __kmem_cache_vacate(s, page, flags, vector) == 0; -out: - put_page(page); - kfree(vector); - return vacated; -out_locked: - slab_unlock(page); - local_irq_restore(flags); - goto out; +static void print_track(const char *s, struct track *t) +{ + if (!t->addr) + return; + printk(KERN_ERR "%s: ", s); + __print_symbol("%s", (unsigned long)t->addr); + printk(" jiffies_ago=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); } -/* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. - * - * The slabs with the least items are placed last. This results in them - * being allocated from last increasing the chance that the last objects - * are freed in them. - */ -int kmem_cache_shrink(struct kmem_cache *s) +static void print_trailer(struct kmem_cache *s, u8 *p) { - int node; - int i; - struct kmem_cache_node *n; - struct page *page, *page2; - struct page *t; - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * s->objects, GFP_KERNEL); - unsigned long flags; - LIST_HEAD(zaplist); - unsigned long t0; - unsigned long t0_partial; - unsigned long discarded; - unsigned long defrag_success; - unsigned long defrag_fail; + unsigned int off; /* Offset of last byte */ + + if (s->flags & SLAB_RED_ZONE) + print_section("Redzone", p + s->objsize, + s->inuse - s->objsize); + + printk(KERN_ERR "FreePointer 0x%p -> 0x%p\n", + p + s->offset, + get_freepointer(s, p)); + + if (s->offset) + off = s->offset + sizeof(void *); + else + off = s->inuse; - if (!slabs_by_inuse) - return -ENOMEM; + if (s->flags & SLAB_STORE_USER) { + print_track("Last alloc", get_track(s, p, TRACK_ALLOC)); + print_track("Last free ", get_track(s, p, TRACK_FREE)); + off += 2 * sizeof(struct track); + } - flush_all(s); - for_each_online_node(node) { - t0 = jiffies; - n = get_node(s, node); + if (off != s->size) + /* Beginning of the filler is the free pointer */ + print_section("Filler", p + off, s->size - off); +} - if (!n->nr_partial) - continue; +static void object_err(struct kmem_cache *s, struct page *page, + u8 *object, char *reason) +{ + u8 *addr = page_address(page); - t0_partial = n->nr_partial; - discarded = 0; - defrag_success = 0; - defrag_fail = 0; + printk(KERN_ERR "*** SLUB %s: %s@0x%p slab 0x%p\n", + s->name, reason, object, page); + printk(KERN_ERR " offset=%tu flags=0x%04lx inuse=%u freelist=0x%p\n", + object - addr, page->flags, page->inuse, page->freelist); + if (object > addr + 16) + print_section("Bytes b4", object - 16, 16); + print_section("Object", object, min(s->objsize, 128)); + print_trailer(s, object); + dump_stack(); +} - for (i = 0; i < s->objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); +static void slab_err(struct kmem_cache *s, struct page *page, char *reason, ...) +{ + va_list args; + char buf[100]; - spin_lock_irqsave(&n->list_lock, flags); + va_start(args, reason); + vsnprintf(buf, sizeof(buf), reason, args); + va_end(args); + printk(KERN_ERR "*** SLUB %s: %s in slab @0x%p\n", s->name, buf, + page); + dump_stack(); +} - /* - * Build lists indexed by the items in use in each slab. - * - * Note that concurrent frees may occur while we hold the - * list_lock. page->inuse here is the upper limit. 
- */ - list_for_each_entry_safe(page, t, &n->partial, lru) { - int inuse = page->inuse; +static void init_object(struct kmem_cache *s, void *object, int active) +{ + u8 *p = object; - if (!inuse && slab_trylock(page)) { - /* - * Must hold slab lock here because slab_free - * may have freed the last object and be - * waiting to release the slab. - */ - list_del(&page->lru); - n->nr_partial--; - slab_unlock(page); - discard_slab(s, page); - discarded++; - } else - if (inuse < (2 * s->objects) / 3 && - n->nr_partial > MAX_PARTIAL) { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } - } + if (s->flags & __OBJECT_POISON) { + memset(p, POISON_FREE, s->objsize - 1); + p[s->objsize -1] = POISON_END; + } - if (n->nr_partial <= MAX_PARTIAL) - goto out; + if (s->flags & SLAB_RED_ZONE) + memset(p + s->objsize, + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, + s->inuse - s->objsize); +} - /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. - */ - for (i = s->objects - 1; i >= 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); +static int check_bytes(u8 *start, unsigned int value, unsigned int bytes) +{ + while (bytes) { + if (*start != (u8)value) + return 0; + start++; + bytes--; + } + return 1; +} - /* - * If we have no functions available to defragment the slabs - * then we are done. - */ - if (!s->ops->get || !s->ops->kick) - goto out; +/* + * Object layout: + * + * object address + * Bytes of the object to be managed. + * If the freepointer may overlay the object then the free + * pointer is the first word of the object. + * + * Poisoning uses 0x6b (POISON_FREE) and the last byte is + * 0xa5 (POISON_END) + * + * object + s->objsize + * Padding to reach word boundary. This is also used for Redzoning. + * Padding is extended by another word if Redzoning is enabled and + * objsize == inuse. + * + * We fill with 0xbb (RED_INACTIVE) for inactive objects and with + * 0xcc (RED_ACTIVE) for objects in use. + * + * object + s->inuse + * Meta data starts here. + * + * A. Free pointer (if we cannot overwrite object on free) + * B. Tracking data for SLAB_STORE_USER + * C. Padding to reach required alignment boundary or at mininum + * one word if debuggin is on to be able to detect writes + * before the word boundary. + * + * Padding is done using 0x5a (POISON_INUSE) + * + * object + s->size + * Nothing is used beyond s->size. + * + * If slabcaches are merged then the objsize and inuse boundaries are mostly + * ignored. And therefore no slab options that rely on these boundaries + * may be used with merged slabcaches. + */ - /* Take objects with just a few objects off the tail */ - while (n->nr_partial > MAX_PARTIAL) { - page = container_of(n->partial.prev, struct page, lru); +static void restore_bytes(struct kmem_cache *s, char *message, u8 data, + void *from, void *to) +{ + printk(KERN_ERR "@@@ SLUB %s: Restoring %s (0x%x) from 0x%p-0x%p\n", + s->name, message, data, from, to - 1); + memset(from, data, to - from); +} - /* - * We are holding the list_lock so we can only - * trylock the slab - */ - if (page->inuse > s->objects / 3) - break; +static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) +{ + unsigned long off = s->inuse; /* The end of info */ - if (!slab_trylock(page)) - break; + if (s->offset) + /* Freepointer is placed after the object. 
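/*
 * Editor's aside (illustrative only, not part of this patch): the layout
 * described above, expressed as arithmetic. It mirrors the decisions made
 * in calculate_sizes() earlier in the patch; the track record size is a
 * parameter because struct track's exact size is configuration dependent.
 * Assumes 8 byte words and no RCU/ctor (either of which also forces the
 * free pointer out from under the object).
 */
#include <stdio.h>

#define WORD 8UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct layout {
        unsigned long inuse;    /* end of the object proper            */
        unsigned long offset;   /* free pointer location (0 = overlay) */
        unsigned long size;     /* full per-object footprint           */
};

static struct layout layout_sketch(unsigned long objsize, int red_zone,
                                   int poison, int store_user,
                                   unsigned long track_size,
                                   unsigned long align)
{
        struct layout l = { 0, 0, 0 };
        unsigned long size = ALIGN_UP(objsize, WORD);

        /* Redzoning needs a spare word if the object fills its slot exactly */
        if (red_zone && size == objsize)
                size += WORD;
        l.inuse = size;

        /* Poisoning forces the free pointer behind the object */
        if (poison) {
                l.offset = size;
                size += WORD;
        }
        if (store_user)
                size += 2 * track_size;         /* alloc + free track */
        if (red_zone)
                size += WORD;                   /* padding before the next object */

        l.size = ALIGN_UP(size, align);
        return l;
}

int main(void)
{
        /* e.g. a 52 byte object, full debugging, 24 byte track records */
        struct layout l = layout_sketch(52, 1, 1, 1, 24, 8);

        printf("inuse=%lu offset=%lu size=%lu\n", l.inuse, l.offset, l.size);
        return 0;
}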
*/ + off += sizeof(void *); - list_move_tail(&page->lru, &zaplist); - n->nr_partial--; - SetSlabFrozen(page); - slab_unlock(page); - } - out: - spin_unlock_irqrestore(&n->list_lock, flags); + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); - /* Now we can free objects in the slabs on the zaplist */ - list_for_each_entry_safe(page, page2, &zaplist, lru) { - unsigned long flags; + if (s->size == off) + return 1; - local_irq_save(flags); - slab_lock(page); - if (__kmem_cache_vacate(s, page, flags, - (void **)slabs_by_inuse)) - defrag_fail++; - else - defrag_success++; - } - printk(KERN_INFO "Defrag %s: node=%d jiff=%lu, partial=%lu, " - "discard=%lu, defrag_success=%lu defrag_fail=%lu Pafter=%lu\n", - s->name, node, jiffies - t0, t0_partial, - discarded, defrag_success, defrag_fail, n->nr_partial); - } + if (check_bytes(p + off, POISON_INUSE, s->size - off)) + return 1; - kfree(slabs_by_inuse); + object_err(s, page, p, "Object padding check fails"); + + /* + * Restore padding + */ + restore_bytes(s, "object padding", POISON_INUSE, p + off, p + s->size); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +static int slab_pad_check(struct kmem_cache *s, struct page *page) { - void *ret; - size_t ks; - - if (unlikely(!p)) - return kmalloc(new_size, flags); + u8 *p; + int length, remainder; - if (unlikely(!new_size)) { - kfree(p); - return NULL; - } + if (!(s->flags & SLAB_POISON)) + return 1; - ks = ksize(p); - if (ks >= new_size) - return (void *)p; + p = page_address(page); + length = s->objects * s->size; + remainder = (PAGE_SIZE << s->order) - length; + if (!remainder) + return 1; - ret = kmalloc(new_size, flags); - if (ret) { - memcpy(ret, p, min(new_size, ks)); - kfree(p); + if (!check_bytes(p + length, POISON_INUSE, remainder)) { + slab_err(s, page, "Padding check failed"); + restore_bytes(s, "slab padding", POISON_INUSE, p + length, + p + length + remainder); + return 0; } - return ret; + return 1; } -EXPORT_SYMBOL(krealloc); - -/******************************************************************** - * Basic setup of slabs - *******************************************************************/ -void __init kmem_cache_init(void) +static int check_object(struct kmem_cache *s, struct page *page, + void *object, int active) { - int i; + u8 *p = object; + u8 *endobject = object + s->objsize; - if (!page_group_by_mobility_disabled && !user_override) { + if (s->flags & SLAB_RED_ZONE) { + unsigned int red = + active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; + + if (!check_bytes(endobject, red, s->inuse - s->objsize)) { + object_err(s, page, object, + active ? "Redzone Active" : "Redzone Inactive"); + restore_bytes(s, "redzone", red, + endobject, object + s->inuse); + return 0; + } + } else { + if ((s->flags & SLAB_POISON) && s->objsize < s->inuse && + !check_bytes(endobject, POISON_INUSE, + s->inuse - s->objsize)) { + object_err(s, page, p, "Alignment padding check fails"); /* - * Antifrag support available. 
Increase usable - * page order and generate slabs with more objects. - */ - slub_max_order = DEFAULT_ANTIFRAG_MAX_ORDER; - slub_min_objects = DEFAULT_ANTIFRAG_MIN_OBJECTS; + * Fix it so that there will not be another report. + * + * Hmmm... We may be corrupting an object that now expects + * to be longer than allowed. + */ + restore_bytes(s, "alignment padding", POISON_INUSE, + endobject, object + s->inuse); + } } -#ifdef CONFIG_NUMA - /* - * Must first have the slab cache available for the allocations of the - * struct kmem_cache_node's. There is special bootstrap code in - * kmem_cache_open for slab_state == DOWN. - */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_KERNEL); -#endif - - /* Able to allocate the per node structures */ - slab_state = PARTIAL; - - /* Caches that are not of the two-to-the-power-of size */ - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_KERNEL); - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_KERNEL); - - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_KERNEL); + if (s->flags & SLAB_POISON) { + if (!active && (s->flags & __OBJECT_POISON) && + (!check_bytes(p, POISON_FREE, s->objsize - 1) || + p[s->objsize - 1] != POISON_END)) { - slab_state = UP; + object_err(s, page, p, "Poison check failed"); + restore_bytes(s, "Poison", POISON_FREE, + p, p + s->objsize -1); + restore_bytes(s, "Poison", POISON_END, + p + s->objsize - 1, p + s->objsize); + return 0; + } + /* + * check_pad_bytes cleans up on its own. + */ + check_pad_bytes(s, page, p); + } - /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) - kmalloc_caches[i]. name = - kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i); + if (!s->offset && active) + /* + * Object and freepointer overlap. Cannot check + * freepointer while object is allocated. + */ + return 1; -#ifdef CONFIG_SMP - register_cpu_notifier(&slab_notifier); -#endif + /* Check free pointer validity */ + if (!check_valid_pointer(s, page, get_freepointer(s, p))) { + object_err(s, page, p, "Freepointer corrupt"); + /* + * No choice but to zap it and thus loose the remainder + * of the free objects in this slab. May cause + * another error because the object count is now wrong. 
+ */ + set_freepointer(s, p, NULL); + return 0; + } + return 1; +} - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct page *); +static int check_slab(struct kmem_cache *s, struct page *page) +{ + VM_BUG_ON(!irqs_disabled()); - printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," - " Processors=%d, Nodes=%d\n", - KMALLOC_SHIFT_HIGH, cache_line_size(), - slub_min_order, slub_max_order, slub_min_objects, - nr_cpu_ids, nr_node_ids); + if (!PageSlab(page)) { + slab_err(s, page, "Not a valid slab page flags=%lx " + "mapping=0x%p count=%d", page->flags, page->mapping, + page_count(page)); + return 0; + } + if (page->offset * sizeof(void *) != s->offset) { + slab_err(s, page, "Corrupted offset %lu flags=0x%lx " + "mapping=0x%p count=%d", + (unsigned long)(page->offset * sizeof(void *)), + page->flags, + page->mapping, + page_count(page)); + return 0; + } + if (page->inuse > s->objects) { + slab_err(s, page, "inuse %u > max %u @0x%p flags=%lx " + "mapping=0x%p count=%d", + s->name, page->inuse, s->objects, page->flags, + page->mapping, page_count(page)); + return 0; + } + /* Slab_pad_check fixes things up after itself */ + slab_pad_check(s, page); + return 1; } /* - * Find a mergeable slab cache + * Determine if a certain object on a page is on the freelist. Must hold the + * slab lock to guarantee that the chains are in a consistent state. */ -static int slab_unmergeable(struct kmem_cache *s) +static int on_freelist(struct kmem_cache *s, struct page *page, void *search) { - if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) - return 1; - - if (s->ctor) - return 1; + int nr = 0; + void *fp = page->freelist; + void *object = NULL; - if (s->ops != &slub_default_ops) - return 1; + while (fp && nr <= s->objects) { + if (fp == search) + return 1; + if (!check_valid_pointer(s, page, fp)) { + if (object) { + object_err(s, page, object, + "Freechain corrupt"); + set_freepointer(s, object, NULL); + break; + } else { + slab_err(s, page, "Freepointer 0x%p corrupt", + fp); + page->freelist = NULL; + page->inuse = s->objects; + printk(KERN_ERR "@@@ SLUB %s: Freelist " + "cleared. Slab 0x%p\n", + s->name, page); + return 0; + } + break; + } + object = fp; + fp = get_freepointer(s, object); + nr++; + } - return 0; + if (page->inuse != s->objects - nr) { + slab_err(s, page, "Wrong object count. Counter is %d but " + "counted were %d", s, page, page->inuse, + s->objects - nr); + page->inuse = s->objects - nr; + printk(KERN_ERR "@@@ SLUB %s: Object count adjusted. " + "Slab @0x%p\n", s->name, page); + } + return search == NULL; } -static struct kmem_cache *find_mergeable(size_t size, - size_t align, unsigned long flags, - void (*ctor)(void *, struct kmem_cache *, unsigned long), - const struct kmem_cache_ops *ops) +static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { - struct list_head *h; - - if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) - return NULL; + if (s->flags & SLAB_TRACE) { + printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + s->name, + alloc ? "alloc" : "free", + object, page->inuse, + page->freelist); - if (ctor) - return NULL; + if (!alloc) + print_section("Object", (void *)object, s->objsize); - if (ops != &slub_default_ops) - return NULL; + dump_stack(); + } +} - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); +/* + * Tracking of fully allocated slabs for debugging purposes. 
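/*
 * Editor's aside (illustrative only, not part of this patch): the
 * defensive pattern used by on_freelist() above -- never trust a singly
 * linked list that lives inside potentially corrupted memory; bound the
 * walk by the maximum number of objects a slab can hold so a cycle or a
 * stray pointer cannot wedge the checker. Userspace toy below.
 */
#include <stdio.h>

struct node {
        struct node *next;
};

/*
 * Returns the number of nodes, or -1 if the list is longer than max
 * (which, for a slab freelist, can only mean corruption or a cycle).
 */
static int bounded_count(struct node *head, int max)
{
        int nr = 0;

        while (head) {
                if (nr >= max)
                        return -1;
                head = head->next;
                nr++;
        }
        return nr;
}

int main(void)
{
        struct node a, b, c;

        a.next = &b;
        b.next = &c;
        c.next = &a;                    /* deliberate cycle */

        printf("count=%d\n", bounded_count(&a, 8));     /* prints -1 */
        return 0;
}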
+ */
+static void add_full(struct kmem_cache_node *n, struct page *page)
+{
+ spin_lock(&n->list_lock);
+ list_add(&page->lru, &n->full);
+ spin_unlock(&n->list_lock);
+}

- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
+static void remove_full(struct kmem_cache *s, struct page *page)
+{
+ struct kmem_cache_node *n;

- if (slab_unmergeable(s))
- continue;
+ if (!(s->flags & SLAB_STORE_USER))
+ return;

- if (size > s->size)
- continue;
+ n = get_node(s, page_to_nid(page));

- if (((flags | slub_debug) & SLUB_MERGE_SAME) !=
- (s->flags & SLUB_MERGE_SAME))
- continue;
- /*
- * Check if alignment is compatible.
- * Courtesy of Adrian Drzewiecki
- */
- if ((s->size & ~(align -1)) != s->size)
- continue;
+ spin_lock(&n->list_lock);
+ list_del(&page->lru);
+ spin_unlock(&n->list_lock);
+}

- if (s->size - size >= sizeof(void *))
- continue;
+static void setup_object_debug(struct kmem_cache *s, struct page *page,
+ void *object)
+{
+ if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ return;

- return s;
- }
- return NULL;
+ init_object(s, object, 0);
+ init_tracking(s, object);
 }

-struct kmem_cache *kmem_cache_create(const char *name, size_t size,
- size_t align, unsigned long flags,
- void (*ctor)(void *, struct kmem_cache *, unsigned long),
- const struct kmem_cache_ops *ops)
+static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
+ void *object, void *addr)
 {
- struct kmem_cache *s;
+ if (!check_slab(s, page))
+ goto bad;

- if (!ops)
- ops = &slub_default_ops;
+ if (object && !on_freelist(s, page, object)) {
+ slab_err(s, page, "Object 0x%p already allocated", object);
+ goto bad;
+ }

- down_write(&slub_lock);
- s = find_mergeable(size, align, flags, ctor, ops);
- if (s) {
- s->refcount++;
- /*
- * Adjust the object sizes so that we clear
- * the complete object on kzalloc.
- */
- s->objsize = max(s->objsize, (int)size);
- s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- if (sysfs_slab_alias(s, name))
- goto err;
- } else {
- s = kmalloc(kmem_size, GFP_KERNEL);
- if (s && kmem_cache_open(s, GFP_KERNEL, name,
- size, align, flags, ctor, ops)) {
- if (sysfs_slab_add(s)) {
- kfree(s);
- goto err;
- }
- list_add(&s->list, &slab_caches);
- raise_kswapd_order(s->order);
- } else
- kfree(s);
+ if (!check_valid_pointer(s, page, object)) {
+ object_err(s, page, object, "Freelist Pointer check fails");
+ goto bad;
 }
- up_write(&slub_lock);
- return s;
-err:
- up_write(&slub_lock);
- if (flags & SLAB_PANIC)
- panic("Cannot create slabcache %s\n", name);
- else
- s = NULL;
- return s;
-}
-EXPORT_SYMBOL(kmem_cache_create);
+ if (object && !check_object(s, page, object, 0))
+ goto bad;

-void *kmem_cache_zalloc(struct kmem_cache *s, gfp_t flags)
-{
- void *x;
+ /* Success. Perform special debug activities for allocations. */
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_ALLOC, addr);
+ trace(s, page, object, 1);
+ init_object(s, object, 1);
+ return 1;

- x = slab_alloc(s, flags, -1, __builtin_return_address(0));
- if (x)
- memset(x, 0, s->objsize);
- return x;
+bad:
+ if (PageSlab(page)) {
+ /*
+ * If this is a slab page then let's do the best we can
+ * to avoid issues in the future. Marking all objects
+ * as used avoids touching the remaining objects.
+ */
+ printk(KERN_ERR "@@@ SLUB: %s slab 0x%p. Marking all objects used.\n",
+ s->name, page);
+ page->inuse = s->objects;
+ page->freelist = NULL;
+ /* Fix up fields that may be corrupted */
+ page->offset = s->offset / sizeof(void *);
+ }
+ return 0;
 }
-EXPORT_SYMBOL(kmem_cache_zalloc);

-#ifdef CONFIG_SMP
-static void for_all_slabs(void (*func)(struct kmem_cache *, int), int cpu)
+static int free_debug_processing(struct kmem_cache *s, struct page *page,
+ void *object, void *addr)
 {
- struct list_head *h;
+ if (!check_slab(s, page))
+ goto fail;

- down_read(&slub_lock);
- list_for_each(h, &slab_caches) {
- struct kmem_cache *s =
- container_of(h, struct kmem_cache, list);
+ if (!check_valid_pointer(s, page, object)) {
+ slab_err(s, page, "Invalid object pointer 0x%p", object);
+ goto fail;
+ }

- func(s, cpu);
+ if (on_freelist(s, page, object)) {
+ slab_err(s, page, "Object 0x%p already free", object);
+ goto fail;
 }
- up_read(&slub_lock);
-}

-/*
- * Use the cpu notifier to insure that the cpu slabs are flushed when
- * necessary.
- */
-static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
+ if (!check_object(s, page, object, 1))
+ return 0;

- switch (action) {
- case CPU_UP_CANCELED:
- case CPU_UP_CANCELED_FROZEN:
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- for_all_slabs(__flush_cpu_slab, cpu);
- break;
- default:
- break;
+ if (unlikely(s != page->slab)) {
+ if (!PageSlab(page))
+ slab_err(s, page, "Attempt to free object(0x%p) "
+ "outside of slab", object);
+ else if (!page->slab) {
+ printk(KERN_ERR
+ "SLUB: no slab for object 0x%p.\n",
+ object);
+ dump_stack();
+ } else
+ slab_err(s, page, "object at 0x%p belongs "
+ "to slab %s", object, page->slab->name);
+ goto fail;
 }
- return NOTIFY_OK;
-}

-static struct notifier_block __cpuinitdata slab_notifier =
- { &slab_cpuup_callback, NULL, 0 };
+ /* Special debug activities for freeing objects */
+ if (!SlabFrozen(page) && !page->freelist)
+ remove_full(s, page);
+ if (s->flags & SLAB_STORE_USER)
+ set_track(s, object, TRACK_FREE, addr);
+ trace(s, page, object, 0);
+ init_object(s, object, 0);
+ return 1;

-#endif
+fail:
+ printk(KERN_ERR "@@@ SLUB: %s slab 0x%p object at 0x%p not freed.\n",
+ s->name, page, object);
+ return 0;
+}

-void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, void *caller)
+static int __init setup_slub_debug(char *str)
 {
- struct kmem_cache *s = get_slab(size, gfpflags);
-
- if (!s)
- return NULL;
+ if (!str || *str != '=')
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ else {
+ str++;
+ if (*str == 0 || *str == ',')
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ else
+ for (; *str && *str != ','; str++)
+ switch (*str) {
+ case 'f': case 'F':
+ slub_debug |= SLAB_DEBUG_FREE;
+ break;
+ case 'z': case 'Z':
+ slub_debug |= SLAB_RED_ZONE;
+ break;
+ case 'p': case 'P':
+ slub_debug |= SLAB_POISON;
+ break;
+ case 'u': case 'U':
+ slub_debug |= SLAB_STORE_USER;
+ break;
+ case 't': case 'T':
+ slub_debug |= SLAB_TRACE;
+ break;
+ default:
+ printk(KERN_ERR "slub_debug option '%c' "
+ "unknown. skipped\n", *str);
+ }
+ }

- return slab_alloc(s, gfpflags, -1, caller);
+ if (*str == ',')
+ slub_debug_slabs = str + 1;
+ return 1;
 }

-void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
- int node, void *caller)
+__setup("slub_debug", setup_slub_debug);
+
+static void kmem_cache_open_debug_check(struct kmem_cache *s)
 {
- struct kmem_cache *s = get_slab(size, gfpflags);
+ /*
+ * The page->offset field is only 16 bits wide. This is an offset
+ * in units of words from the beginning of an object. If the object
+ * size is bigger than that, we cannot move the free pointer behind
+ * the object anymore.
+ *
+ * On 32-bit platforms the limit is 256k. On 64-bit platforms
+ * the limit is 512k.
+ *
+ * Debugging or a ctor may create a need to move the free
+ * pointer. Fail if this happens.
+ */
+ if (s->objsize >= 65535 * sizeof(void *)) {
+ BUG_ON(s->flags & (SLAB_RED_ZONE | SLAB_POISON |
+ SLAB_STORE_USER | SLAB_DESTROY_BY_RCU));
+ BUG_ON(s->ctor);
+ } else
+ /*
+ * Enable debugging if selected on the kernel command line.
+ */
+ if (slub_debug && (!slub_debug_slabs ||
+ strncmp(slub_debug_slabs, s->name,
+ strlen(slub_debug_slabs)) == 0))
+ s->flags |= slub_debug;
+}
+#else
+static inline void setup_object_debug(struct kmem_cache *s,
+ struct page *page, void *object) {}

- if (!s)
- return NULL;
+static inline int alloc_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, void *addr) { return 0; }

- return slab_alloc(s, gfpflags, node, caller);
-}
+static inline int free_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, void *addr) { return 0; }
+
+static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
+ { return 1; }
+static inline int check_object(struct kmem_cache *s, struct page *page,
+ void *object, int active) { return 1; }
+static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
+static inline void kmem_cache_open_debug_check(struct kmem_cache *s) {}
+#define slub_debug 0
+#endif

 #if defined(CONFIG_SYSFS) && defined(CONFIG_SLUB_DEBUG)
 static int validate_slab(struct kmem_cache *s, struct page *page)
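
Aside (illustration, not part of the patch): setup_slub_debug() above decodes the slub_debug kernel parameter, where a string of option letters selects the debug features and an optional ",<name>" suffix restricts them to caches whose name starts with <name> (per the strncmp() in kmem_cache_open_debug_check()). The stand-alone userspace sketch below mirrors that parsing so the option format is easy to see; the DBG_* constants and parse_slub_debug() are made-up stand-ins for the kernel's SLAB_* flags and __setup() machinery, not kernel APIs. For example, a command line containing slub_debug=ZPU,kmalloc- would enable red zoning, poisoning and user tracking for the kmalloc caches only.

/*
 * Illustration only: userspace model of the slub_debug= option parsing.
 * DBG_* values are stand-ins for the kernel's SLAB_* debug flags.
 */
#include <stdio.h>

#define DBG_FREE        0x01    /* 'f' - sanity checks on free */
#define DBG_REDZONE     0x02    /* 'z' - red zoning */
#define DBG_POISON      0x04    /* 'p' - object poisoning */
#define DBG_USER        0x08    /* 'u' - track last alloc/free */
#define DBG_TRACE       0x10    /* 't' - trace allocs and frees */
#define DBG_DEFAULT     (DBG_FREE | DBG_REDZONE | DBG_POISON | DBG_USER)

static unsigned int debug_flags;
static const char *debug_slabs;

/* Accepts the text following "slub_debug", e.g. "=ZPU,kmalloc-" or "". */
static void parse_slub_debug(const char *str)
{
        if (!str || *str != '=') {
                debug_flags = DBG_DEFAULT;
                return;
        }
        str++;
        if (*str == '\0' || *str == ',')
                debug_flags = DBG_DEFAULT;
        else
                for (; *str && *str != ','; str++)
                        switch (*str) {
                        case 'f': case 'F': debug_flags |= DBG_FREE; break;
                        case 'z': case 'Z': debug_flags |= DBG_REDZONE; break;
                        case 'p': case 'P': debug_flags |= DBG_POISON; break;
                        case 'u': case 'U': debug_flags |= DBG_USER; break;
                        case 't': case 'T': debug_flags |= DBG_TRACE; break;
                        default:
                                fprintf(stderr, "unknown option '%c'\n", *str);
                        }
        if (*str == ',')
                debug_slabs = str + 1;  /* name prefix debugging is limited to */
}

int main(void)
{
        parse_slub_debug("=ZPU,kmalloc-");
        printf("flags=0x%x slabs=%s\n", debug_flags, debug_slabs);
        return 0;
}
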