---
 include/linux/gfp.h      |    3 
 include/linux/mmzone.h   |    1 
 include/linux/pagepool.h |   20 ++++
 mm/Kconfig               |    7 +
 mm/Makefile              |    1 
 mm/filemap.c             |   15 ++-
 mm/page_alloc.c          |   12 +-
 mm/pagepool.c            |  195 +++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 245 insertions(+), 9 deletions(-)

Index: linux-2.6.21-rc7-mm2/mm/filemap.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/filemap.c	2007-05-01 10:34:27.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/filemap.c	2007-05-02 10:51:44.000000000 -0700
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/pagepool.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -471,11 +472,15 @@ int add_to_page_cache_lru(struct page *p
 #ifdef CONFIG_NUMA
 struct page *__page_cache_alloc(gfp_t gfp, int order)
 {
-	if (cpuset_do_page_mem_spread()) {
-		int n = cpuset_mem_spread_node();
-		return alloc_pages_node(n, gfp, order);
-	}
-	return alloc_pages(gfp, order);
+	int node = -1;
+
+	if (cpuset_do_page_mem_spread())
+		node = cpuset_mem_spread_node();
+
+	if (order)
+		return pagepool_alloc(node, gfp, order);
+	else
+		return alloc_pages_node(node, gfp, order);
 }
 EXPORT_SYMBOL(__page_cache_alloc);
 #endif
Index: linux-2.6.21-rc7-mm2/mm/page_alloc.c
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/page_alloc.c	2007-05-01 10:36:57.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/page_alloc.c	2007-05-01 19:49:42.000000000 -0700
@@ -41,6 +41,7 @@
 #include <linux/pfn.h>
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
+#include <linux/pagepool.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -280,14 +281,14 @@ static void free_compound_page(struct pa
 	__free_pages_ok(page, compound_order(page));
 }
 
-static void prep_compound_page(struct page *page, unsigned long order)
+void prep_compound_page(struct page *page, unsigned long order)
 {
 	int i;
 	int nr_pages = 1 << order;
 
+	__SetPageHead(page);
 	set_compound_page_dtor(page, free_compound_page);
 	set_compound_order(page, order);
-	__SetPageHead(page);
 	for (i = 1; i < nr_pages; i++) {
 		struct page *p = page + i;
@@ -549,8 +550,11 @@ static void __free_pages_ok(struct page
 	int i;
 	int reserved = 0;
 
+	if (PageHead(page))
+		destroy_compound_page(page, order);
+
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(page + i);
+		reserved += free_pages_check(page + i);
 
 	if (reserved)
 		return;
@@ -628,7 +632,7 @@ static inline void expand(struct zone *z
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
Index: linux-2.6.21-rc7-mm2/include/linux/pagepool.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-rc7-mm2/include/linux/pagepool.h	2007-05-02 10:51:04.000000000 -0700
@@ -0,0 +1,20 @@
+#ifndef _LINUX_PAGE_POOL_H
+#define _LINUX_PAGE_POOL_H
+
+#ifdef CONFIG_PAGEPOOL
+#include <linux/mm.h>
+
+/*
+ * Pagepools: Facility to provide higher order pages
+ *
+ * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>
+ *
+ */
+struct page *pagepool_alloc(int node, gfp_t flags, int order);
+void pagepool_free(struct pagepool *pp, struct page *page);
+
+#define PagePoolPage(page)	PageSwapCache((page) + 1)
+
+#endif /* CONFIG_PAGEPOOL */
+
+#endif /* _LINUX_PAGE_POOL_H */
Index: linux-2.6.21-rc7-mm2/mm/pagepool.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.21-rc7-mm2/mm/pagepool.c	2007-05-02 10:51:04.000000000 -0700
@@ -0,0 +1,195 @@
+/*
+ * Pagepools: A facility to allocate higher order pages in Linux
+ *
+ * Higher order pages are a problem for the VM because of memory fragmentation issues. Pagepools allow
+ * the use of a number of higher order pages at bootup. These are kept strictly for the pagepools and
+ * are never freed. Thus there is no issue with fragmentation.
+ *
+ * (C) 2007 sgi. Christoph Lameter <clameter@sgi.com>
+ *
+ * Page flag overloading
+ *
+ * PageSwapCache	Pagepools do not support swapping. Used to indicate that page is free.
+ *
+ * PagePrivate(page + 1) Indicates that this is a page pool page
+ */
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/swap.h>
+#include <linux/module.h>
+#include <linux/pagepool.h>
+
+static int pagepool_order = 4;		/* 64k */
+static int pagepool_pages = 4096;	/* Uses 256MB of memory */
+
+struct pagepool {
+	int nr_pages;
+	int nr_allocated;
+	int alloc_cursor;
+	int reclaim_high;
+	int reclaim_low;
+	struct zone *zone;
+	struct page *pages[];
+};
+
+static void SetPagePagepool(struct page *page)
+{
+	SetPageSwapCache(page + 1);
+}
+
+static void ClearPagePagepool(struct page *page)
+{
+	ClearPageSwapCache(page + 1);
+}
+
+static struct page *get_pagepool_page(struct pagepool *pp, gfp_t flags)
+{
+	int pos;
+	struct page *page;
+
+	pos = pp->alloc_cursor;
+	while (!PageSwapCache(pp->pages[pos])) {
+		pos++;
+		if (pos > pp->nr_pages)
+			pos = 0;
+		if (pos == pp->alloc_cursor)
+			return NULL;
+	}
+	pp->alloc_cursor = pos;
+	page = pp->pages[pos];
+	/*
+	 * Prepare page. Fool the page allocator into thinking this is
+	 * a normal page
+	 */
+	prep_new_page(page, pagepool_order, flags & ~__GFP_COMP);
+	SetPagePagepool(page);
+	return page;
+}
+
+/*
+ * Called from page allocator with zone lock held.
+ */
+void pagepool_free(struct pagepool *pp, struct page *page)
+{
+	if (compound_order(page) != pagepool_order) {
+		/* Freeing a page of an order not supported. */
+		WARN_ON(1);
+		return;
+	}
+	if (!PagePrivate(page + 1)) {
+		/* Freeing a non page pool page */
+		WARN_ON(1);
+		return;
+	}
+	WARN_ON(page_count(page));
+	BUG_ON(pp->nr_allocated == pp->nr_pages);
+	ClearPagePagepool(page);
+	pp->nr_allocated--;
+}
+
+/*
+ * Try to reclaim memory
+ */
+void pagepool_reclaim(struct pagepool *pp, gfp_t flags)
+{
+	struct zone *zl[2];
+
+	zl[0] = pp->zone;
+	zl[1] = NULL;
+
+	while (pp->nr_pages - pp->nr_allocated < pp->reclaim_high) {
+		if (!try_to_free_pages(zl, pagepool_order, flags))
+			printk(KERN_ERR "pagepool_reclaim: "
+				"Cannot free memory\n");
+	}
+}
+
+struct page *pagepool_alloc(int node, gfp_t gfpflags, int order)
+{
+	unsigned long flags;
+	struct pagepool *pp;
+	struct page *page;
+
+	if (node == -1)
+		node = numa_node_id();
+
+	BUG_ON(pagepool_order != order);
+	pp = NODE_DATA(node)->node_zones[0].pagepool;
+
+	if (pp->nr_pages - pp->nr_allocated <= pp->reclaim_low)
+		pagepool_reclaim(pp, gfpflags);
+
+	spin_lock_irqsave(&pp->zone->lock, flags);
+	page = get_pagepool_page(pp, gfpflags);
+	spin_unlock_irqrestore(&pp->zone->lock, flags);
+	BUG_ON(!page);
+	return page;
+}
+
+static int __init pagepool_init(void)
+{
+	int per_node_pages = pagepool_pages / num_online_nodes();
+	int pagepool_page_size = PAGE_SIZE << pagepool_order;
+	int pagepool_pages_per_page = 1 << pagepool_order;
+	int node;
+	int i;
+	int z;
+
+	for_each_online_node(node) {
+		struct pagepool *pp = kmalloc_node(sizeof(struct pagepool) +
+			per_node_pages * sizeof(void *), GFP_KERNEL, node);
+		int done = 0;
+
+		BUG_ON(!pp);
+
+		memset(pp, 0, sizeof(struct pagepool));
+		pp->nr_pages = per_node_pages;
+		pp->nr_allocated = 0;
+		pp->reclaim_low = per_node_pages << 6;
+		pp->reclaim_high = per_node_pages << 5;
+
+		do {
+			int order;
+			int pages = 0;
+			struct page *page;
+
+			order = get_order((per_node_pages - done) *
+						pagepool_page_size);
+
+			while (!(page = alloc_pages_node(node,
+					GFP_KERNEL | GFP_THISNODE, order))) {
+				order--;
+				if (order < pagepool_order) {
+					printk(KERN_ERR "PagePool: Can only "
+					"allocate %d of %d pages on node %d\n",
+						done, per_node_pages, node);
+					goto out;
+				}
+			}
+
+			pages = 1 << (order - pagepool_order);
+			for (i = 0; i < pages; i++) {
+				pp->pages[i] = page + i * pagepool_pages_per_page;
+				prep_compound_page(pp->pages[i], pagepool_order);
+				done++;
+			}
+		}
+		while (done < pp->nr_pages);
+out:
+		pp->nr_pages = done;
+
+		z = ZONE_NORMAL;
+		/*
+		 * Accommodate strange configs that have DMA or DMA32 as the
+		 * highest zone.
+		 */
+		if (policy_zone < z)
+			z = policy_zone;
+		pp->zone = &NODE_DATA(node)->node_zones[z];
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			NODE_DATA(node)->node_zones[i].pagepool = pp;
+	}
+	return 0;
+}
+
+__initcall(pagepool_init);
Index: linux-2.6.21-rc7-mm2/include/linux/gfp.h
===================================================================
--- linux-2.6.21-rc7-mm2.orig/include/linux/gfp.h	2007-05-01 14:48:40.000000000 -0700
+++ linux-2.6.21-rc7-mm2/include/linux/gfp.h	2007-05-01 14:49:19.000000000 -0700
@@ -204,4 +204,7 @@ void drain_node_pages(int node);
 static inline void drain_node_pages(int node) { };
 #endif
 
+void prep_compound_page(struct page *page, unsigned long order);
+int prep_new_page(struct page *page, int order, gfp_t gfp_flags);
+
 #endif /* __LINUX_GFP_H */
Index: linux-2.6.21-rc7-mm2/include/linux/mmzone.h
===================================================================
--- linux-2.6.21-rc7-mm2.orig/include/linux/mmzone.h	2007-05-01 14:21:15.000000000 -0700
+++ linux-2.6.21-rc7-mm2/include/linux/mmzone.h	2007-05-01 14:21:45.000000000 -0700
@@ -332,6 +332,7 @@ struct zone {
 	 * rarely used fields:
 	 */
 	const char		*name;
+	struct pagepool		*pagepool;
 } ____cacheline_internodealigned_in_smp;
 
 /*
Index: linux-2.6.21-rc7-mm2/mm/Kconfig
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/Kconfig	2007-05-01 14:02:13.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/Kconfig	2007-05-01 14:20:21.000000000 -0700
@@ -168,6 +168,13 @@ config NR_QUICK
 	depends on QUICKLIST
 	default "1"
 
+config PAGEPOOL
+	bool "Page Pools (EXPERIMENTAL)"
+	default n
+	depends on EXPERIMENTAL
+	help
+	  Page Pools to support higher order allocations.
+
 #
 # Adaptive file readahead
 #
Index: linux-2.6.21-rc7-mm2/mm/Makefile
===================================================================
--- linux-2.6.21-rc7-mm2.orig/mm/Makefile	2007-05-01 14:01:51.000000000 -0700
+++ linux-2.6.21-rc7-mm2/mm/Makefile	2007-05-01 14:02:08.000000000 -0700
@@ -32,4 +32,5 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
+obj-$(CONFIG_PAGEPOOL) += pagepool.o