If we can be sure that elevating the page_count on a pagecache page will pin it, we can speculatively run this operation and subsequently check that we hit the right page, rather than relying on holding a lock or otherwise pinning a reference to the page.

This can be done if get_page/put_page behave in the same manner throughout the whole tree (i.e. if we "get" the page after it has been used for something else, we must be able to free it again with a put_page). It needs an atomic_cmpxchg operation to ensure we don't try to grab a free page; an illustrative sketch of that primitive follows the patch.

Index: linux-2.6/include/linux/page-flags.h
===================================================================
--- linux-2.6.orig/include/linux/page-flags.h
+++ linux-2.6/include/linux/page-flags.h
@@ -76,6 +76,8 @@
 #define PG_nosave_free		18	/* Free, should not be written */
 #define PG_uncached		19	/* Page has been mapped as uncached */
 
+#define PG_freeing		20	/* Pagecache is about to be freed */
+
 /*
  * Global page accounting.  One instance per CPU.  Only unsigned longs are
  * allowed.
@@ -320,6 +322,11 @@ extern unsigned long *__page_state(unsig
 #define SetPageUncached(page)	set_bit(PG_uncached, &(page)->flags)
 #define ClearPageUncached(page)	clear_bit(PG_uncached, &(page)->flags)
 
+#define PageFreeing(page)	test_bit(PG_freeing, &(page)->flags)
+#define SetPageFreeing(page)	set_bit(PG_freeing, &(page)->flags)
+#define ClearPageFreeing(page)	clear_bit(PG_freeing, &(page)->flags)
+#define __ClearPageFreeing(page)	__clear_bit(PG_freeing, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
Index: linux-2.6/include/linux/pagemap.h
===================================================================
--- linux-2.6.orig/include/linux/pagemap.h
+++ linux-2.6/include/linux/pagemap.h
@@ -50,6 +50,60 @@ static inline void mapping_set_gfp_mask(
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+static inline struct page *page_cache_get_speculative(struct page **pagep)
+{
+	struct page *page;
+
+	page = *pagep;
+	if (unlikely(!page))
+		return NULL;
+
+	if (unlikely(!get_page_unless_zero(page)))
+		return NULL;
+
+	/*
+	 * Note that atomic_cmpxchg provides a memory barrier.
+	 * This is needed to ensure PageFreeing is evaluated after the
+	 * page refcount has been raised. See below comment.
+	 */
+
+	if (unlikely(PageFreeing(page)))	/* page being freed? */
+		goto bad_page;
+
+	/*
+	 * smp_rmb is to ensure the load of page->flags (ie. PageFreeing(page))
+	 * is performed before the load of *pagep in the below comparison.
+	 *
+	 * Those places that set PageFreeing have the following pattern:
+	 *	SetPageFreeing(page)
+	 *	wmb();
+	 *	if (page_count == X)
+	 *		remove page from pagecache
+	 *	wmb();
+	 *	ClearPageFreeing(page)
+	 *
+	 * So if !PageFreeing() _after_ we've elevated page refcount, then
+	 * either the page will be safely pinned in pagecache, or it will have
+	 * been already removed. In the latter case, *pagep will be NULL in the
+	 * below test - provided it is loaded after testing PageFreeing().
+	 *
+	 * If the load was out of order, *pagep might be loaded before the
	 * page is removed from pagecache while PageFreeing evaluated after
+	 * the ClearPageFreeing().
+	 */
+	smp_rmb();
+
+	if (unlikely(page != *pagep))	/* page no longer at *pagep? */
+		goto bad_page;
+
+	return page;
+
+bad_page:
+	/* Now that we have a reference, we must do a put_page. */
+	put_page(page);
+	return NULL;
+}
+
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x)|__GFP_NORECLAIM, 0);
Index: linux-2.6/mm/vmscan.c
===================================================================
--- linux-2.6.orig/mm/vmscan.c
+++ linux-2.6/mm/vmscan.c
@@ -504,6 +504,7 @@ static int shrink_list(struct list_head
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
+		SetPageFreeing(page);
 		write_lock_irq(&mapping->tree_lock);
 
 		/*
@@ -534,6 +535,7 @@ static int shrink_list(struct list_head
 
 free_it:
 		unlock_page(page);
+		ClearPageFreeing(page);
 		reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
@@ -541,6 +543,7 @@ free_it:
 
 cannot_free:
 		write_unlock_irq(&mapping->tree_lock);
+		ClearPageFreeing(page);
 		goto keep_locked;
 
 activate_locked:
@@ -592,7 +595,7 @@ static int isolate_lru_pages(int nr_to_s
 			BUG_ON(!PageLRU(page));
 			list_del(&page->lru);
 			target = src;
-			if (get_page_unless_zero(page)) {
+			if (likely(get_page_unless_zero(page))) {
 				ClearPageLRU(page);
 				target = dst;
 				nr_taken++;
Index: linux-2.6/mm/bootmem.c
===================================================================
--- linux-2.6.orig/mm/bootmem.c
+++ linux-2.6/mm/bootmem.c
@@ -296,19 +296,20 @@ static unsigned long __init free_all_boo
 			int j, order;
 
 			page = pfn_to_page(pfn);
+			prefetchw(page);
+
 			count += BITS_PER_LONG;
-			__ClearPageReserved(page);
 			order = ffs(BITS_PER_LONG) - 1;
-			set_page_refs(page, order);
-			for (j = 1; j < BITS_PER_LONG; j++) {
-				if (j + 16 < BITS_PER_LONG)
-					prefetchw(page + j + 16);
+			for (j = 0; j < BITS_PER_LONG; j++) {
+				if (j + 1 < BITS_PER_LONG)
+					prefetchw(page + j + 1);
 				__ClearPageReserved(page + j);
 				set_page_count(page + j, 0);
 			}
+			set_page_refs(page, order);
 			__free_pages(page, order);
+
 			i += BITS_PER_LONG;
-			page += BITS_PER_LONG;
 		} else if (v) {
 			unsigned long m;
 
@@ -317,6 +318,7 @@ static unsigned long __init free_all_boo
 				if (v & m) {
 					count++;
 					__ClearPageReserved(page);
+					set_page_count(page, 0);
 					set_page_refs(page, 0);
 					__free_page(page);
 				}
Index: linux-2.6/mm/swapfile.c
===================================================================
--- linux-2.6.orig/mm/swapfile.c
+++ linux-2.6/mm/swapfile.c
@@ -348,6 +348,7 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the swapcache lock held.. */
+		SetPageFreeing(page);
 		write_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
@@ -355,6 +356,7 @@ int remove_exclusive_swap_page(struct pa
 			retval = 1;
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
+		ClearPageFreeing(page);
 	}
 	spin_unlock(&swap_lock);
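
For reference, the "don't grab a free page" requirement mentioned in the description boils down to an increment-unless-zero primitive: this is what get_page_unless_zero(), used by page_cache_get_speculative() above, has to provide. The following is only an illustrative sketch and is not part of the patch; the helper name is made up for this example, atomic_read/atomic_cmpxchg are the stock atomic ops, and it assumes the convention (which this series arranges, see the set_page_count(page, 0) calls added in bootmem.c) that a free page has a refcount of zero.

/*
 * Sketch only: raise a refcount, but only if it has not already
 * dropped to zero.  atomic_cmpxchg(v, old, new) stores new only if
 * *v == old, and returns the value that was previously in *v.
 */
static inline int inc_count_unless_zero(atomic_t *count)
{
	int c, old;

	c = atomic_read(count);
	while (c != 0) {
		/* try to move the count from c to c + 1 */
		old = atomic_cmpxchg(count, c, c + 1);
		if (old == c)
			return 1;	/* success, we now hold a reference */
		c = old;		/* count changed under us, retry */
	}
	return 0;			/* count already zero: page is being freed */
}

A plain atomic_inc() would not be safe here, because it could raise a count that has already reached zero and thereby "resurrect" a page the allocator is in the middle of freeing. The cmpxchg loop only succeeds while the count is still observed to be non-zero, which is exactly the guarantee the speculative lookup relies on.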