From npiggin@suse.de Sun Feb 10 23:18:39 2008
Date: Mon, 11 Feb 2008 08:18:29 +0100
From: Nick Piggin
To: Christoph Lameter
Cc: Andrew Morton, Mel Gorman, linux-mm@kvack.org, Pekka J Enberg
Subject: Re: SLUB tbench regression due to page allocator deficiency

On Sat, Feb 09, 2008 at 07:39:17PM -0800, Christoph Lameter wrote:
> On Sun, 10 Feb 2008, Nick Piggin wrote:
>
> > What kind of allocating and freeing of pages are you talking about? Are
> > you just measuring single threaded performance?
>
> What I did was (on an 8p, may want to tune this to the # procs you have):
>
> 1. Run tbench_srv on console
>
> 2. run tbench 8 from an ssh session

OK, it's a bit variable, so I used 20 10-second runs and took the average.
With this patch, I got a 1% increase of that average (with 2.6.25-rc1 and
slub).

It avoids some branches and tests; it doesn't check the watermarks if there
are pcp pages; and it avoids the atomic refcounting operations if the caller
requests it (this is really annoying because it adds another branch -- I
don't think we should be funneling all these options through flags; rather,
provide a few helpers or something for it; a rough sketch of such a helper
follows the patch).

I don't know if this will get back all the regression, but it should help
(although I guess we should do the same refcounting change for slab, so that
might speed up a bit too).

BTW. could you please make kmalloc-2048 just use order-0 allocations by
default with slub, like kmalloc-1024 and kmalloc-4096 already do?

Thanks,
Nick

---
Index: linux-2.6/include/linux/gfp.h
===================================================================
--- linux-2.6.orig/include/linux/gfp.h	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/include/linux/gfp.h	2008-02-11 11:08:00.000000000 +1100
@@ -50,8 +50,9 @@
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)0x80000u) /* Page is reclaimable */
 #define __GFP_MOVABLE	((__force gfp_t)0x100000u)  /* Page is movable */
+#define __GFP_NOREFS	((__force gfp_t)0x200000u)  /* Page is not refcounted */
 
-#define __GFP_BITS_SHIFT 21	/* Room for 21 __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 24	/* Room for 24 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -218,6 +219,7 @@
 #define __get_dma_pages(gfp_mask, order) \
 		__get_free_pages((gfp_mask) | GFP_DMA,(order))
 
+extern void FASTCALL(__free_pages_noref(struct page *page, unsigned int order));
 extern void FASTCALL(__free_pages(struct page *page, unsigned int order));
 extern void FASTCALL(free_pages(unsigned long addr, unsigned int order));
 extern void FASTCALL(free_hot_page(struct page *page));
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/mm/page_alloc.c	2008-02-11 11:08:00.000000000 +1100
@@ -449,7 +449,7 @@
 	zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(struct page *page)
+static inline void free_pages_check(struct page *page)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -467,12 +467,6 @@
 		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
-	/*
-	 * For now, we report if PG_reserved was found set, but do not
-	 * clear it, and do not free the page. But we shall soon need
-	 * to do more, for when the ZERO_PAGE count wraps negative.
-	 */
-	return PageReserved(page);
 }
 
 /*
@@ -517,12 +511,9 @@
 {
 	unsigned long flags;
 	int i;
-	int reserved = 0;
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(page + i);
-	if (reserved)
-		return;
+		free_pages_check(page + i);
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
@@ -598,7 +589,7 @@
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static void prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -616,18 +607,12 @@
 			1 << PG_buddy ))))
 		bad_page(page);
 
-	/*
-	 * For now, we report if PG_reserved was found set, but do not
-	 * clear it, and do not allocate the page: as a safety net.
-	 */
-	if (PageReserved(page))
-		return 1;
-
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-	set_page_refcounted(page);
+	if (!(gfp_flags & __GFP_NOREFS))
+		set_page_refcounted(page);
 
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
@@ -637,8 +622,6 @@
 
 	if (order && (gfp_flags & __GFP_COMP))
 		prep_compound_page(page, order);
-
-	return 0;
 }
 
 /*
@@ -983,8 +966,7 @@
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	if (free_pages_check(page))
-		return;
+	free_pages_check(page);
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
@@ -1037,13 +1019,43 @@
 		set_page_refcounted(page + i);
 }
 
+#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
+#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
+#define ALLOC_HARDER		0x10 /* try to alloc harder */
+#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+
+static int alloc_watermarks_ok(struct zonelist *zonelist, struct zone *zone,
+				int order, gfp_t gfp_mask, int alloc_flags)
+{
+	int classzone_idx = zone_idx(zonelist->zones[0]);
+	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
+		unsigned long mark;
+		if (alloc_flags & ALLOC_WMARK_MIN)
+			mark = zone->pages_min;
+		else if (alloc_flags & ALLOC_WMARK_LOW)
+			mark = zone->pages_low;
+		else
+			mark = zone->pages_high;
+		if (!zone_watermark_ok(zone, order, mark,
+			    classzone_idx, alloc_flags)) {
+			if (!zone_reclaim_mode ||
+			    !zone_reclaim(zone, gfp_mask, order))
+				return 0;
+		}
+	}
+	return 1;
+}
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
  * or two.
 */
 static struct page *buffered_rmqueue(struct zonelist *zonelist,
-		struct zone *zone, int order, gfp_t gfp_flags)
+		struct zone *zone, int order, gfp_t gfp_flags, int alloc_flags)
 {
 	unsigned long flags;
 	struct page *page;
@@ -1051,14 +1063,15 @@
 	int cpu;
 	int migratetype = allocflags_to_migratetype(gfp_flags);
 
-again:
-	cpu  = get_cpu();
+	local_irq_save(flags);
 	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
+		cpu = smp_processor_id();
 		pcp = &zone_pcp(zone, cpu)->pcp;
-		local_irq_save(flags);
 		if (!pcp->count) {
+			if (!alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+				goto failed;
 			pcp->count = rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
 			if (unlikely(!pcp->count))
@@ -1078,7 +1091,8 @@
 
 		/* Allocate more to the pcp list if necessary */
 		if (unlikely(&page->lru == &pcp->list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
+			if (alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+				pcp->count += rmqueue_bulk(zone, 0,
 					pcp->batch, &pcp->list, migratetype);
 			page = list_entry(pcp->list.next, struct page, lru);
 		}
@@ -1086,7 +1100,9 @@
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
-		spin_lock_irqsave(&zone->lock, flags);
+		if (!alloc_watermarks_ok(zonelist, zone, order, gfp_flags, alloc_flags))
+			goto failed;
+		spin_lock(&zone->lock);
 		page = __rmqueue(zone, order, migratetype);
 		spin_unlock(&zone->lock);
 		if (!page)
@@ -1096,27 +1112,16 @@
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
 	zone_statistics(zonelist, zone);
 	local_irq_restore(flags);
-	put_cpu();
 
 	VM_BUG_ON(bad_range(zone, page));
-	if (prep_new_page(page, order, gfp_flags))
-		goto again;
+	prep_new_page(page, order, gfp_flags);
 	return page;
 
 failed:
 	local_irq_restore(flags);
-	put_cpu();
 	return NULL;
 }
 
-#define ALLOC_NO_WATERMARKS	0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN		0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW		0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH	0x08 /* use pages_high watermark */
-#define ALLOC_HARDER		0x10 /* try to alloc harder */
-#define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET		0x40 /* check for correct cpuset */
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
 static struct fail_page_alloc_attr {
@@ -1374,7 +1379,6 @@
 {
 	struct zone **z;
 	struct page *page = NULL;
-	int classzone_idx = zone_idx(zonelist->zones[0]);
 	struct zone *zone;
 	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
 	int zlc_active = 0;		/* set if using zonelist_cache */
@@ -1409,26 +1413,10 @@
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
-		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
-			unsigned long mark;
-			if (alloc_flags & ALLOC_WMARK_MIN)
-				mark = zone->pages_min;
-			else if (alloc_flags & ALLOC_WMARK_LOW)
-				mark = zone->pages_low;
-			else
-				mark = zone->pages_high;
-			if (!zone_watermark_ok(zone, order, mark,
-				    classzone_idx, alloc_flags)) {
-				if (!zone_reclaim_mode ||
-				    !zone_reclaim(zone, gfp_mask, order))
-					goto this_zone_full;
-			}
-		}
-
-		page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, zone, order, gfp_mask, alloc_flags);
 		if (page)
 			break;
-this_zone_full:
+
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
 try_next_zone:
@@ -1669,7 +1657,6 @@
 		return (unsigned long) page_address(page);
 	return 0;
 }
-
 EXPORT_SYMBOL(get_zeroed_page);
 
 void __pagevec_free(struct pagevec *pvec)
@@ -1680,16 +1667,20 @@
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
 }
 
-void __free_pages(struct page *page, unsigned int order)
+void __free_pages_noref(struct page *page, unsigned int order)
 {
-	if (put_page_testzero(page)) {
-		if (order == 0)
-			free_hot_page(page);
-		else
-			__free_pages_ok(page, order);
-	}
+	if (likely(order == 0))
+		free_hot_page(page);
+	else
+		__free_pages_ok(page, order);
 }
+EXPORT_SYMBOL(__free_pages_noref);
+void __free_pages(struct page *page, unsigned int order)
+{
+	if (put_page_testzero(page))
+		__free_pages_noref(page, order);
+}
 
 EXPORT_SYMBOL(__free_pages);
 
 void free_pages(unsigned long addr, unsigned int order)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/mm/slub.c	2008-02-11 11:08:00.000000000 +1100
@@ -1078,6 +1078,7 @@
 	struct page *page;
 	int pages = 1 << s->order;
 
+	flags |= __GFP_NOREFS;
 	if (s->order)
 		flags |= __GFP_COMP;
 
@@ -1175,7 +1176,7 @@
 		-pages);
 
 	page->mapping = NULL;
-	__free_pages(page, s->order);
+	__free_pages_noref(page, s->order);
 }
 
 static void rcu_free_slab(struct rcu_head *h)
@@ -2671,7 +2672,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
+		return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 	s = get_slab(size, flags);
@@ -2689,7 +2690,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(flags | __GFP_COMP,
+		return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 	s = get_slab(size, flags);
@@ -2752,7 +2753,10 @@
 
 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
-		put_page(page);
+		unsigned int order = 0;
+		if (unlikely(PageCompound(page)))
+			order = compound_order(page);
+		__free_pages_noref(page, order);
 		return;
 	}
 	slab_free(page->slab, page, object, __builtin_return_address(0));
@@ -3219,7 +3223,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
+		return (void *)__get_free_pages(gfpflags | __GFP_COMP | __GFP_NOREFS,
 						get_order(size));
 
 	s = get_slab(size, gfpflags);
@@ -3235,7 +3239,7 @@
 	struct kmem_cache *s;
 
 	if (unlikely(size > PAGE_SIZE / 2))
-		return (void *)__get_free_pages(gfpflags | __GFP_COMP,
+		return (void *)__get_free_pages(gfpflags | __GFP_COMP | __GFP_NOREFS,
 						get_order(size));
 
 	s = get_slab(size, gfpflags);
Index: linux-2.6/include/linux/slub_def.h
===================================================================
--- linux-2.6.orig/include/linux/slub_def.h	2008-02-11 10:06:36.000000000 +1100
+++ linux-2.6/include/linux/slub_def.h	2008-02-11 11:08:00.000000000 +1100
@@ -192,7 +192,7 @@
 {
 	if (__builtin_constant_p(size)) {
 		if (size > PAGE_SIZE / 2)
-			return (void *)__get_free_pages(flags | __GFP_COMP,
+			return (void *)__get_free_pages(flags | __GFP_COMP | __GFP_NOREFS,
 							get_order(size));
 
 		if (!(flags & SLUB_DMA)) {
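
For illustration, this is roughly the kind of helper I had in mind instead of
pushing everything through gfp flags -- just a sketch on top of the patch
above, not something I'm proposing to merge, and the names alloc_pages_noref()
and free_page_noref() are made up:

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/*
	 * Hypothetical convenience wrappers for callers (like slub) that
	 * manage page lifetime themselves: allocate without initialising
	 * the refcount, free without the put_page_testzero() dance.
	 */
	static inline struct page *alloc_pages_noref(gfp_t gfp_mask,
							unsigned int order)
	{
		/* __GFP_NOREFS makes prep_new_page() skip set_page_refcounted() */
		return alloc_pages(gfp_mask | __GFP_NOREFS, order);
	}

	static inline void free_page_noref(struct page *page, unsigned int order)
	{
		/* the caller is the sole owner, so no atomic refcount op here */
		__free_pages_noref(page, order);
	}

A caller such as allocate_slab()/__free_slab() would then just use this pair
rather than sprinkling __GFP_NOREFS and __free_pages_noref() around, and we
would not need the extra branch in prep_new_page().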