Index: linux-2.6/mm/page_alloc.c =================================================================== --- linux-2.6.orig/mm/page_alloc.c +++ linux-2.6/mm/page_alloc.c @@ -127,7 +127,6 @@ static int bad_range(struct zone *zone, return 0; } - #else static inline int bad_range(struct zone *zone, struct page *page) { @@ -218,12 +217,12 @@ static inline void prep_zero_page(struct { int i; - BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + VM_BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); /* * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO * and __GFP_HIGHMEM from hard or soft interrupt context. */ - BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); for (i = 0; i < (1 << order); i++) clear_highpage(page + i); } @@ -342,8 +341,8 @@ static inline void __free_one_page(struc page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - BUG_ON(page_idx & (order_size - 1)); - BUG_ON(bad_range(zone, page)); + VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(bad_range(zone, page)); zone->free_pages += order_size; while (order < MAX_ORDER-1) { @@ -416,7 +415,7 @@ static void free_pages_bulk(struct zone while (count--) { struct page *page; - BUG_ON(list_empty(list)); + VM_BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); @@ -507,7 +506,7 @@ static inline void expand(struct zone *z area--; high--; size >>= 1; - BUG_ON(bad_range(zone, &page[size])); + VM_BUG_ON(bad_range(zone, &page[size])); list_add(&page[size].lru, &area->free_list); area->nr_free++; set_page_order(&page[size], high); @@ -777,8 +776,8 @@ void split_page(struct page *page, unsig { int i; - BUG_ON(PageCompound(page)); - BUG_ON(!page_count(page)); + VM_BUG_ON(PageCompound(page)); + VM_BUG_ON(!page_count(page)); for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -825,7 +824,7 @@ again: local_irq_restore(flags); put_cpu(); - BUG_ON(bad_range(zone, page)); + VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) goto again; return page; @@ -1100,7 +1099,7 @@ fastcall unsigned long get_zeroed_page(g * get_zeroed_page() returns a 32-bit address, which cannot represent * a highmem page */ - BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); page = alloc_pages(gfp_mask | __GFP_ZERO, 0); if (page) @@ -1133,7 +1132,7 @@ EXPORT_SYMBOL(__free_pages); fastcall void free_pages(unsigned long addr, unsigned int order) { if (addr != 0) { - BUG_ON(!virt_addr_valid((void *)addr)); + VM_BUG_ON(!virt_addr_valid((void *)addr)); __free_pages(virt_to_page((void *)addr), order); } } Index: linux-2.6/mm/swap.c =================================================================== --- linux-2.6.orig/mm/swap.c +++ linux-2.6/mm/swap.c @@ -214,7 +214,7 @@ void fastcall __page_cache_release(struc struct zone *zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); - BUG_ON(!PageLRU(page)); + VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -265,7 +265,7 @@ void release_pages(struct page **pages, zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(!PageLRU(page)); + VM_BUG_ON(!PageLRU(page)); __ClearPageLRU(page); del_page_from_lru(zone, page); } @@ -318,7 +318,7 @@ void __pagevec_release_nonlru(struct pag for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = 
pvec->pages[i]; - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); if (put_page_testzero(page)) pagevec_add(&pages_to_free, page); } @@ -345,7 +345,7 @@ void __pagevec_lru_add(struct pagevec *p zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); add_page_to_inactive_list(zone, page); } @@ -372,9 +372,9 @@ void __pagevec_lru_add_active(struct pag zone = pagezone; spin_lock_irq(&zone->lru_lock); } - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(PageActive(page)); + VM_BUG_ON(PageActive(page)); SetPageActive(page); add_page_to_active_list(zone, page); } Index: linux-2.6/mm/vmscan.c =================================================================== --- linux-2.6.orig/mm/vmscan.c +++ linux-2.6/mm/vmscan.c @@ -365,7 +365,8 @@ int remove_mapping(struct address_space if (!mapping) return 0; /* truncate got there first */ - write_lock_irq(&mapping->tree_lock); + SetPageNoNewRefs(page); + spin_lock_irq(&mapping->tree_lock); /* * The non-racy check for busy page. It is critical to check @@ -381,19 +382,22 @@ int remove_mapping(struct address_space if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); swap_free(swap); - __put_page(page); /* The pagecache ref */ - return 1; + goto free_it; } __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); - __put_page(page); + spin_unlock_irq(&mapping->tree_lock); + +free_it: + __ClearPageNoNewRefs(page); + __put_page(page); /* The pagecache ref */ return 1; cannot_free: - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return 0; } @@ -425,7 +429,7 @@ static unsigned long shrink_page_list(st if (TestSetPageLocked(page)) goto keep; - BUG_ON(PageActive(page)); + VM_BUG_ON(PageActive(page)); sc->nr_scanned++; @@ -549,7 +553,7 @@ keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) @@ -588,7 +592,7 @@ static unsigned long isolate_lru_pages(u page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - BUG_ON(!PageLRU(page)); + VM_BUG_ON(!PageLRU(page)); list_del(&page->lru); target = src; @@ -659,7 +663,7 @@ static unsigned long shrink_inactive_lis */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) @@ -780,9 +784,9 @@ static void shrink_active_list(unsigned while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(!PageActive(page)); + VM_BUG_ON(!PageActive(page)); ClearPageActive(page); list_move(&page->lru, &zone->inactive_list); @@ -810,9 +814,9 @@ static void shrink_active_list(unsigned while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - BUG_ON(!PageActive(page)); + VM_BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; if (!pagevec_add(&pvec, page)) { Index: linux-2.6/mm/internal.h =================================================================== --- 
linux-2.6.orig/mm/internal.h +++ linux-2.6/mm/internal.h @@ -24,8 +24,8 @@ static inline void set_page_count(struct */ static inline void set_page_refcounted(struct page *page) { - BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); - BUG_ON(atomic_read(&page->_count)); + VM_BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page); + VM_BUG_ON(atomic_read(&page->_count)); set_page_count(page, 1); } Index: linux-2.6/include/linux/mm.h =================================================================== --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -274,6 +274,12 @@ struct page { */ #include +#ifdef CONFIG_DEBUG_VM +#define VM_BUG_ON(cond) BUG_ON(cond) +#else +#define VM_BUG_ON(condition) do { } while(0) +#endif + /* * Methods to modify the page usage count. * @@ -293,7 +299,7 @@ struct page { */ static inline int put_page_testzero(struct page *page) { - BUG_ON(atomic_read(&page->_count) == 0); + VM_BUG_ON(atomic_read(&page->_count) == 0); return atomic_dec_and_test(&page->_count); } @@ -303,6 +309,7 @@ static inline int put_page_testzero(stru */ static inline int get_page_unless_zero(struct page *page) { + VM_BUG_ON(PageCompound(page)); return atomic_inc_not_zero(&page->_count); } @@ -319,6 +326,7 @@ static inline void get_page(struct page { if (unlikely(PageCompound(page))) page = (struct page *)page_private(page); + VM_BUG_ON(atomic_read(&page->_count) == 0); atomic_inc(&page->_count); } Index: linux-2.6/lib/radix-tree.c =================================================================== --- linux-2.6.orig/lib/radix-tree.c +++ linux-2.6/lib/radix-tree.c @@ -30,6 +30,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -45,7 +46,9 @@ ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) struct radix_tree_node { + unsigned int height; /* Height from the bottom */ unsigned int count; + struct rcu_head rcu_head; void *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; @@ -97,10 +100,17 @@ radix_tree_node_alloc(struct radix_tree_ return ret; } +static void radix_tree_node_rcu_free(struct rcu_head *head) +{ + struct radix_tree_node *node = + container_of(head, struct radix_tree_node, rcu_head); + kmem_cache_free(radix_tree_node_cachep, node); +} + static inline void radix_tree_node_free(struct radix_tree_node *node) { - kmem_cache_free(radix_tree_node_cachep, node); + call_rcu(&node->rcu_head, radix_tree_node_rcu_free); } /* @@ -206,6 +216,7 @@ static int radix_tree_extend(struct radi } do { + unsigned int newheight; if (!(node = radix_tree_node_alloc(root))) return -ENOMEM; @@ -218,9 +229,11 @@ static int radix_tree_extend(struct radi tag_set(node, tag, 0); } + newheight = root->height+1; + node->height = newheight; node->count = 1; - root->rnode = node; - root->height++; + rcu_assign_pointer(root->rnode, node); + root->height = newheight; } while (height > root->height); out: return 0; @@ -260,11 +273,12 @@ int radix_tree_insert(struct radix_tree_ /* Have to add a child node. 
*/ if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM; + slot->height = height; if (node) { - node->slots[offset] = slot; + rcu_assign_pointer(node->slots[offset], slot); node->count++; } else - root->rnode = slot; + rcu_assign_pointer(root->rnode, slot); } /* Go a level down */ @@ -280,7 +294,7 @@ int radix_tree_insert(struct radix_tree_ BUG_ON(!node); node->count++; - node->slots[offset] = item; + rcu_assign_pointer(node->slots[offset], item); BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); @@ -292,25 +306,29 @@ static inline void **__lookup_slot(struc unsigned long index) { unsigned int height, shift; - struct radix_tree_node **slot; + struct radix_tree_node *node, **slot; - height = root->height; + /* Must take a copy now because root->rnode may change */ + node = rcu_dereference(root->rnode); + if (node == NULL) + return NULL; + + height = node->height; if (index > radix_tree_maxindex(height)) return NULL; shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = &root->rnode; - while (height > 0) { - if (*slot == NULL) + do { + slot = (struct radix_tree_node **) + (node->slots + ((index>>shift) & RADIX_TREE_MAP_MASK)); + node = rcu_dereference(*slot); + if (node == NULL) return NULL; - slot = (struct radix_tree_node **) - ((*slot)->slots + - ((index >> shift) & RADIX_TREE_MAP_MASK)); shift -= RADIX_TREE_MAP_SHIFT; height--; - } + } while (height > 0); return (void **)slot; } @@ -341,7 +359,7 @@ void *radix_tree_lookup(struct radix_tre void **slot; slot = __lookup_slot(root, index); - return slot != NULL ? *slot : NULL; + return slot != NULL ? rcu_dereference(*slot) : NULL; } EXPORT_SYMBOL(radix_tree_lookup); @@ -505,26 +523,25 @@ EXPORT_SYMBOL(radix_tree_tag_get); #endif static unsigned int -__lookup(struct radix_tree_root *root, void **results, unsigned long index, +__lookup(struct radix_tree_node *slot, void ***results, unsigned long index, unsigned int max_items, unsigned long *next_index) { unsigned int nr_found = 0; unsigned int shift, height; - struct radix_tree_node *slot; unsigned long i; - height = root->height; + height = slot->height; if (height == 0) goto out; - shift = (height-1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; for ( ; height > 1; height--) { + struct radix_tree_node *__s; for (i = (index >> shift) & RADIX_TREE_MAP_MASK ; i < RADIX_TREE_MAP_SIZE; i++) { - if (slot->slots[i] != NULL) + __s = rcu_dereference(slot->slots[i]); + if (__s != NULL) break; index &= ~((1UL << shift) - 1); index += 1UL << shift; @@ -535,14 +552,14 @@ __lookup(struct radix_tree_root *root, v goto out; shift -= RADIX_TREE_MAP_SHIFT; - slot = slot->slots[i]; + slot = __s; } /* Bottom level: grab some items */ for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) { index++; if (slot->slots[i]) { - results[nr_found++] = slot->slots[i]; + results[nr_found++] = &slot->slots[i]; if (nr_found == max_items) goto out; } @@ -569,18 +586,29 @@ unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items) { - const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long max_index; + struct radix_tree_node *node; unsigned long cur_index = first_index; unsigned int ret = 0; + void ***__results = (void ***)results; /* use results as a temporary + * store for the pointers to + * the actual results */ + + node = rcu_dereference(root->rnode); + if (!node || node->height == 0) + return ret; + max_index = radix_tree_maxindex(node->height); while (ret < max_items) { - unsigned int 
nr_found; + unsigned int nr_found, i; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup(root, results + ret, cur_index, + nr_found = __lookup(node, __results + ret, cur_index, max_items - ret, &next_index); + for (i = 0; i < nr_found; i++) + results[ret + i] = *rcu_dereference(__results[ret + i]); ret += nr_found; if (next_index == 0) break; @@ -590,21 +618,61 @@ radix_tree_gang_lookup(struct radix_tree } EXPORT_SYMBOL(radix_tree_gang_lookup); +/** + * radix_tree_gang_lookup_slot - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Same as radix_tree_gang_lookup, but returns an array of pointers + * (slots) to the stored items instead of the items themselves. + */ +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items) +{ + struct radix_tree_node *node; + unsigned long max_index; + unsigned long cur_index = first_index; + unsigned int ret = 0; + + node = root->rnode; + if (!node || node->height == 0) + return ret; + max_index = radix_tree_maxindex(node->height); + + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(node, results + ret, cur_index, + max_items - ret, &next_index); + ret += nr_found; + if (next_index == 0) + break; + cur_index = next_index; + } + return ret; +} +EXPORT_SYMBOL_GPL(radix_tree_gang_lookup_slot); + + /* * FIXME: the two tag_get()s here should use find_next_bit() instead of * open-coding the search. */ static unsigned int -__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index, +__lookup_tag(struct radix_tree_node *slot, void **results, unsigned long index, unsigned int max_items, unsigned long *next_index, unsigned int tag) { unsigned int nr_found = 0; unsigned int shift; - unsigned int height = root->height; - struct radix_tree_node *slot; + unsigned int height = slot->height; shift = (height - 1) * RADIX_TREE_MAP_SHIFT; - slot = root->rnode; while (height > 0) { unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; @@ -661,17 +729,23 @@ radix_tree_gang_lookup_tag(struct radix_ unsigned long first_index, unsigned int max_items, unsigned int tag) { - const unsigned long max_index = radix_tree_maxindex(root->height); + struct radix_tree_node *node; + unsigned long max_index; unsigned long cur_index = first_index; unsigned int ret = 0; + node = root->rnode; + if (!node || node->height == 0) + return ret; + max_index = radix_tree_maxindex(node->height); + while (ret < max_items) { unsigned int nr_found; unsigned long next_index; /* Index of next search */ if (cur_index > max_index) break; - nr_found = __lookup_tag(root, results + ret, cur_index, + nr_found = __lookup_tag(node, results + ret, cur_index, max_items - ret, &next_index, tag); ret += nr_found; if (next_index == 0) @@ -694,6 +768,11 @@ static inline void radix_tree_shrink(str root->rnode->slots[0]) { struct radix_tree_node *to_free = root->rnode; + /* + * this doesn't need an rcu_assign_pointer, because + * we aren't touching the object that to_free->slots[0] + * points to. 
+ */ root->rnode = to_free->slots[0]; root->height--; /* must only free zeroed nodes into the slab */ @@ -809,7 +888,7 @@ EXPORT_SYMBOL(radix_tree_delete); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag) { struct radix_tree_node *rnode; - rnode = root->rnode; + rnode = rcu_dereference(root->rnode); if (!rnode) return 0; return any_tag_set(rnode, tag); Index: linux-2.6/include/linux/radix-tree.h =================================================================== --- linux-2.6.orig/include/linux/radix-tree.h +++ linux-2.6/include/linux/radix-tree.h @@ -54,6 +54,9 @@ void *radix_tree_delete(struct radix_tre unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); +unsigned int +radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, + unsigned long first_index, unsigned int max_items); int radix_tree_preload(gfp_t gfp_mask); void radix_tree_init(void); void *radix_tree_tag_set(struct radix_tree_root *root, Index: linux-2.6/include/linux/page-flags.h =================================================================== --- linux-2.6.orig/include/linux/page-flags.h +++ linux-2.6/include/linux/page-flags.h @@ -88,7 +88,9 @@ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_buddy 19 /* Page is free, on buddy lists */ -#define PG_uncached 20 /* Page has been mapped as uncached */ +#define PG_nonewrefs 20 /* Block concurrent pagecache lookups + * while testing refcount */ +#define PG_uncached 21 /* Page has been mapped as uncached */ /* * Global page accounting. One instance per CPU. Only unsigned longs are @@ -221,16 +223,13 @@ extern void __mod_page_state_offset(unsi /* * Manipulation of page state flags */ -#define PageLocked(page) \ - test_bit(PG_locked, &(page)->flags) -#define SetPageLocked(page) \ - set_bit(PG_locked, &(page)->flags) -#define TestSetPageLocked(page) \ - test_and_set_bit(PG_locked, &(page)->flags) -#define ClearPageLocked(page) \ - clear_bit(PG_locked, &(page)->flags) -#define TestClearPageLocked(page) \ - test_and_clear_bit(PG_locked, &(page)->flags) +#define PageLocked(page) test_bit(PG_locked, &(page)->flags) +#define SetPageLocked(page) set_bit(PG_locked, &(page)->flags) +#define __SetPageLocked(page) __set_bit(PG_locked, &(page)->flags) +#define TestSetPageLocked(page) test_and_set_bit(PG_locked, &(page)->flags) +#define ClearPageLocked(page) clear_bit(PG_locked, &(page)->flags) +#define __ClearPageLocked(page) __clear_bit(PG_locked, &(page)->flags) +#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, &(page)->flags) #define PageError(page) test_bit(PG_error, &(page)->flags) #define SetPageError(page) set_bit(PG_error, &(page)->flags) @@ -360,6 +359,11 @@ extern void __mod_page_state_offset(unsi #define SetPageUncached(page) set_bit(PG_uncached, &(page)->flags) #define ClearPageUncached(page) clear_bit(PG_uncached, &(page)->flags) +#define PageNoNewRefs(page) test_bit(PG_nonewrefs, &(page)->flags) +#define SetPageNoNewRefs(page) set_bit(PG_nonewrefs, &(page)->flags) +#define ClearPageNoNewRefs(page) clear_bit(PG_nonewrefs, &(page)->flags) +#define __ClearPageNoNewRefs(page) __clear_bit(PG_nonewrefs, &(page)->flags) + struct page; /* forward declaration */ int test_clear_page_dirty(struct page *page); Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -11,6 +11,8 @@ #include #include 
#include +#include +#include /* for in_interrupt() */ /* * Bits in mapping->flags. The lower __GFP_BITS_SHIFT bits are the page @@ -51,6 +53,91 @@ static inline void mapping_set_gfp_mask( #define page_cache_release(page) put_page(page) void release_pages(struct page **pages, int nr, int cold); +static inline struct page *page_cache_get_speculative(struct page **pagep) +{ + struct page *page; + + VM_BUG_ON(in_interrupt()); + +#ifndef CONFIG_SMP + page = *pagep; + if (unlikely(!page)) + return NULL; + + VM_BUG_ON(!in_atomic()); + /* + * Preempt must be disabled here - we rely on rcu_read_lock doing + * this for us. + * + * Pagecache won't be truncated from interrupt context, so if we have + * found a page in the radix tree here, we have pinned its refcount by + * disabling preempt, and hence no need for the "speculative get" that + * SMP requires. + */ + VM_BUG_ON(page_count(page) == 0); + atomic_inc(&page->_count); + VM_BUG_ON(page != *pagep); + +#else + again: + page = rcu_dereference(*pagep); + if (unlikely(!page)) + return NULL; + + if (unlikely(!get_page_unless_zero(page))) + goto again; /* page has been freed */ + + /* + * Note that get_page_unless_zero provides a memory barrier. + * This is needed to ensure PageNoNewRefs is evaluated after the + * page refcount has been raised. See below comment. + */ + + /* + * PageNoNewRefs is set in order to prevent new references to the + * page (eg. before it gets removed from pagecache). Wait until it + * becomes clear (and checks below will ensure we still have the + * correct one). + */ + while (unlikely(PageNoNewRefs(page))) + cpu_relax(); + + /* + * smp_rmb is to ensure the load of page->flags (for PageNoNewRefs()) + * is performed before the load of *pagep in the below comparison. + * + * Those places that set PageNoNewRefs have the following pattern: + * SetPageNoNewRefs(page) + * wmb(); + * if (page_count(page) == X) + * remove page from pagecache + * wmb(); + * ClearPageNoNewRefs(page) + * + * So PageNoNewRefs() becomes clear _after_ we've elevated page + * refcount, then either the page will be safely pinned in pagecache, + * or it will have been already removed. In the latter case, *pagep + * will be changed in the below test - provided it is loaded after + * testing PageNoNewRefs() (which is what the smp_rmb is for). + * + * If the load was out of order, *pagep might be loaded before the + * page is removed from pagecache while PageNoNewRefs evaluated after + * the ClearPageNoNewRefs(). 
+ */ + smp_rmb(); + + if (unlikely(page != *pagep)) { + /* page no longer at *pagep */ + put_page(page); + goto again; + } + +#endif + VM_BUG_ON(PageCompound(page) && (struct page *)page_private(page) != page); + + return page; +} + #ifdef CONFIG_NUMA extern struct page *page_cache_alloc(struct address_space *x); extern struct page *page_cache_alloc_cold(struct address_space *x); @@ -101,6 +188,8 @@ extern int read_cache_pages(struct addre int add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index, gfp_t gfp_mask); +int __add_to_page_cache(struct page *page, struct address_space *mapping, + unsigned long index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, unsigned long index, gfp_t gfp_mask); extern void remove_from_page_cache(struct page *page); Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c +++ linux-2.6/mm/filemap.c @@ -111,7 +111,7 @@ generic_file_direct_IO(int rw, struct ki /* * Remove a page from the page cache and free it. Caller has to make * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold a write_lock on the mapping's tree_lock. + * is safe. The caller must hold the mapping's tree_lock. */ void __remove_from_page_cache(struct page *page) { @@ -129,9 +129,9 @@ void remove_from_page_cache(struct page BUG_ON(!PageLocked(page)); - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); } static int sync_page(void *word) @@ -407,7 +407,47 @@ int add_to_page_cache(struct page *page, int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { - write_lock_irq(&mapping->tree_lock); + /* + * Can get away with less atomic ops and without using + * Set/ClearPageNoNewRefs if we order operations correctly. + */ + page_cache_get(page); + __SetPageLocked(page); + page->mapping = mapping; + page->index = offset; + + spin_lock_irq(&mapping->tree_lock); + error = radix_tree_insert(&mapping->page_tree, offset, page); + if (!error) { + mapping->nrpages++; + pagecache_acct(1); + } + spin_unlock_irq(&mapping->tree_lock); + radix_tree_preload_end(); + + if (error) { + page->mapping = NULL; + __put_page(page); + __ClearPageLocked(page); + } + } + return error; +} +EXPORT_SYMBOL(add_to_page_cache); + +/* + * Same as add_to_page_cache, but works on pages that are already in + * swapcache and possibly visible to external lookups. + * (special case for move_from_swap_cache). 
+ */ +int __add_to_page_cache(struct page *page, struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + + if (error == 0) { + SetPageNoNewRefs(page); + spin_lock_irq(&mapping->tree_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { page_cache_get(page); @@ -417,14 +457,13 @@ int add_to_page_cache(struct page *page, mapping->nrpages++; pagecache_acct(1); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); radix_tree_preload_end(); } return error; } -EXPORT_SYMBOL(add_to_page_cache); - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { @@ -545,21 +584,21 @@ void fastcall __lock_page(struct page *p EXPORT_SYMBOL(__lock_page); /* - * a rather lightweight function, finding and getting a reference to a - * hashed page atomically. + * find_get_page finds and gets a reference to a pagecache page. */ -struct page * find_get_page(struct address_space *mapping, unsigned long offset) +struct page *find_get_page(struct address_space *mapping, unsigned long offset) { - struct page *page; + struct page **pagep; + struct page *page = NULL; - read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); - if (page) - page_cache_get(page); - read_unlock_irq(&mapping->tree_lock); + rcu_read_lock(); + pagep = (struct page **)radix_tree_lookup_slot(&mapping->page_tree, + offset); + if (likely(pagep)) + page = page_cache_get_speculative(pagep); + rcu_read_unlock(); return page; } - EXPORT_SYMBOL(find_get_page); /* @@ -569,11 +608,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return page; } @@ -595,26 +634,17 @@ struct page *find_lock_page(struct addre { struct page *page; - read_lock_irq(&mapping->tree_lock); repeat: - page = radix_tree_lookup(&mapping->page_tree, offset); + page = find_get_page(mapping, offset); if (page) { - page_cache_get(page); - if (TestSetPageLocked(page)) { - read_unlock_irq(&mapping->tree_lock); - __lock_page(page); - read_lock_irq(&mapping->tree_lock); - - /* Has the page been truncated while we slept? */ - if (unlikely(page->mapping != mapping || - page->index != offset)) { - unlock_page(page); - page_cache_release(page); - goto repeat; - } + lock_page(page); + /* Has the page been truncated while we slept? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; } } - read_unlock_irq(&mapping->tree_lock); return page; } @@ -685,15 +715,27 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { + unsigned int i; + unsigned int nr_found; unsigned int ret; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, start, nr_pages); - for (i = 0; i < ret; i++) - page_cache_get(pages[i]); - read_unlock_irq(&mapping->tree_lock); + /* + * We do some unsightly casting to use the array first for storing + * pointers to the page pointers, and then for the pointers to + * the pages themselves that the caller wants. 
+ */ + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, start, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; + page = page_cache_get_speculative(((struct page ***)pages)[i]); + if (likely(page)) + pages[ret++] = page; + } + rcu_read_unlock(); return ret; } @@ -713,19 +755,26 @@ unsigned find_get_pages_contig(struct ad unsigned int nr_pages, struct page **pages) { unsigned int i; + unsigned int nr_found; unsigned int ret; - read_lock_irq(&mapping->tree_lock); - ret = radix_tree_gang_lookup(&mapping->page_tree, - (void **)pages, index, nr_pages); - for (i = 0; i < ret; i++) { - if (pages[i]->mapping == NULL || pages[i]->index != index) + rcu_read_lock(); + nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, + (void ***)pages, index, nr_pages); + ret = 0; + for (i = 0; i < nr_found; i++) { + struct page *page; + page = page_cache_get_speculative(((struct page ***)pages)[i]); + if (unlikely(!page)) break; - - page_cache_get(pages[i]); + if (page->index != index) { + put_page(page); + break; + } + pages[ret++] = page; index++; } - read_unlock_irq(&mapping->tree_lock); + rcu_read_unlock(); return i; } @@ -739,14 +788,14 @@ unsigned find_get_pages_tag(struct addre unsigned int i; unsigned int ret; - read_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); ret = radix_tree_gang_lookup_tag(&mapping->page_tree, (void **)pages, *index, nr_pages, tag); for (i = 0; i < ret; i++) page_cache_get(pages[i]); if (ret) *index = pages[ret - 1]->index + 1; - read_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return ret; } Index: linux-2.6/mm/swap_state.c =================================================================== --- linux-2.6.orig/mm/swap_state.c +++ linux-2.6/mm/swap_state.c @@ -38,7 +38,7 @@ static struct backing_dev_info swap_back struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN), - .tree_lock = RW_LOCK_UNLOCKED, + .tree_lock = SPIN_LOCK_UNLOCKED, .a_ops = &swap_aops, .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .backing_dev_info = &swap_backing_dev_info, @@ -78,7 +78,8 @@ static int __add_to_swap_cache(struct pa BUG_ON(PagePrivate(page)); error = radix_tree_preload(gfp_mask); if (!error) { - write_lock_irq(&swapper_space.tree_lock); + SetPageNoNewRefs(page); + spin_lock_irq(&swapper_space.tree_lock); error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); if (!error) { @@ -89,7 +90,8 @@ static int __add_to_swap_cache(struct pa total_swapcache_pages++; pagecache_acct(1); } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); + ClearPageNoNewRefs(page); radix_tree_preload_end(); } return error; @@ -200,9 +202,9 @@ void delete_from_swap_cache(struct page entry.val = page_private(page); - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); __delete_from_swap_cache(page); - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); swap_free(entry); page_cache_release(page); @@ -232,7 +234,7 @@ int move_to_swap_cache(struct page *page int move_from_swap_cache(struct page *page, unsigned long index, struct address_space *mapping) { - int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC); + int err = __add_to_page_cache(page, mapping, index, GFP_ATOMIC); if (!err) { delete_from_swap_cache(page); /* shift page from clean_pages to dirty_pages list */ Index: linux-2.6/mm/migrate.c 
=================================================================== --- linux-2.6.orig/mm/migrate.c +++ linux-2.6/mm/migrate.c @@ -25,8 +25,6 @@ #include #include -#include "internal.h" - /* The maximum number of pages to take off the LRU for migration */ #define MIGRATE_CHUNK_SIZE 256 @@ -221,7 +219,8 @@ int migrate_page_remove_references(struc if (page_mapcount(page)) return -EAGAIN; - write_lock_irq(&mapping->tree_lock); + SetPageNoNewRefs(page); + spin_lock_irq(&mapping->tree_lock); radix_pointer = (struct page **)radix_tree_lookup_slot( &mapping->page_tree, @@ -229,7 +228,8 @@ int migrate_page_remove_references(struc if (!page_mapping(page) || page_count(page) != nr_refs || *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); return -EAGAIN; } @@ -249,9 +249,13 @@ int migrate_page_remove_references(struc set_page_private(newpage, page_private(page)); } - *radix_pointer = newpage; + SetPageNoNewRefs(newpage); + rcu_assign_pointer(*radix_pointer, newpage); + + spin_unlock_irq(&mapping->tree_lock); __put_page(page); - write_unlock_irq(&mapping->tree_lock); + ClearPageNoNewRefs(page); + ClearPageNoNewRefs(newpage); return 0; } Index: linux-2.6/mm/readahead.c =================================================================== --- linux-2.6.orig/mm/readahead.c +++ linux-2.6/mm/readahead.c @@ -286,27 +286,26 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. */ - read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { pgoff_t page_offset = offset + page_idx; if (page_offset > end_index) break; + /* Don't need mapping->tree_lock - lookup can be racy */ + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, page_offset); + rcu_read_unlock(); if (page) continue; - read_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = page_offset; list_add(&page->lru, &page_pool); ret++; } - read_unlock_irq(&mapping->tree_lock); /* * Now start the IO. 
We ignore I/O errors - if the page is not Index: linux-2.6/mm/truncate.c =================================================================== --- linux-2.6.orig/mm/truncate.c +++ linux-2.6/mm/truncate.c @@ -67,15 +67,15 @@ invalidate_complete_page(struct address_ if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (PageDirty(page)) { - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); return 0; } BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); ClearPageUptodate(page); page_cache_release(page); /* pagecache ref */ return 1; @@ -227,7 +227,7 @@ unsigned long invalidate_mapping_pages(s pagevec_init(&pvec, 0); while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; Index: linux-2.6/mm/page-writeback.c =================================================================== --- linux-2.6.orig/mm/page-writeback.c +++ linux-2.6/mm/page-writeback.c @@ -632,7 +632,7 @@ int __set_page_dirty_nobuffers(struct pa struct address_space *mapping2; if (mapping) { - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); mapping2 = page_mapping(page); if (mapping2) { /* Race with truncate? */ BUG_ON(mapping2 != mapping); @@ -641,7 +641,7 @@ int __set_page_dirty_nobuffers(struct pa radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); if (mapping->host) { /* !PageAnon && !swapper_space */ __mark_inode_dirty(mapping->host, @@ -719,17 +719,17 @@ int test_clear_page_dirty(struct page *p unsigned long flags; if (mapping) { - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); if (TestClearPageDirty(page)) { radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); if (mapping_cap_account_dirty(mapping)) dec_page_state(nr_dirty); return 1; } - write_unlock_irqrestore(&mapping->tree_lock, flags); + spin_unlock_irqrestore(&mapping->tree_lock, flags); return 0; } return TestClearPageDirty(page); @@ -769,33 +769,32 @@ EXPORT_SYMBOL(clear_page_dirty_for_io); int test_clear_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; if (mapping) { unsigned long flags; + int ret; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestClearPageWriteback(page); if (ret) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_WRITEBACK); - write_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestClearPageWriteback(page); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; } - return ret; + return TestClearPageWriteback(page); } int test_set_page_writeback(struct page *page) { struct address_space *mapping = page_mapping(page); - int ret; if (mapping) { unsigned long flags; + int ret; - write_lock_irqsave(&mapping->tree_lock, flags); + spin_lock_irqsave(&mapping->tree_lock, flags); ret = TestSetPageWriteback(page); if (!ret) radix_tree_tag_set(&mapping->page_tree, @@ -805,27 +804,24 @@ int test_set_page_writeback(struct 
page radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); - write_unlock_irqrestore(&mapping->tree_lock, flags); - } else { - ret = TestSetPageWriteback(page); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + return ret; } - return ret; + return TestSetPageWriteback(page); } EXPORT_SYMBOL(test_set_page_writeback); /* - * Return true if any of the pages in the mapping are marged with the + * Return true if any of the pages in the mapping are marked with the * passed tag. */ int mapping_tagged(struct address_space *mapping, int tag) { - unsigned long flags; int ret; - - read_lock_irqsave(&mapping->tree_lock, flags); + rcu_read_lock(); ret = radix_tree_tagged(&mapping->page_tree, tag); - read_unlock_irqrestore(&mapping->tree_lock, flags); + rcu_read_unlock(); return ret; } EXPORT_SYMBOL(mapping_tagged); Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -851,7 +851,7 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - write_lock_irq(&mapping->tree_lock); + spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ if (mapping_cap_account_dirty(mapping)) inc_page_state(nr_dirty); @@ -859,7 +859,7 @@ int __set_page_dirty_buffers(struct page page_index(page), PAGECACHE_TAG_DIRTY); } - write_unlock_irq(&mapping->tree_lock); + spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return 1; } Index: linux-2.6/fs/inode.c =================================================================== --- linux-2.6.orig/fs/inode.c +++ linux-2.6/fs/inode.c @@ -195,7 +195,7 @@ void inode_init_once(struct inode *inode mutex_init(&inode->i_mutex); init_rwsem(&inode->i_alloc_sem); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - rwlock_init(&inode->i_data.tree_lock); + spin_lock_init(&inode->i_data.tree_lock); spin_lock_init(&inode->i_data.i_mmap_lock); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -384,7 +384,7 @@ struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - rwlock_t tree_lock; /* and rwlock protecting it */ + spinlock_t tree_lock; /* and lock protecting it */ unsigned int i_mmap_writable;/* count VM_SHARED mappings */ struct prio_tree_root i_mmap; /* tree of private and shared mappings */ struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */ Index: linux-2.6/mm/swapfile.c =================================================================== --- linux-2.6.orig/mm/swapfile.c +++ linux-2.6/mm/swapfile.c @@ -368,13 +368,13 @@ int remove_exclusive_swap_page(struct pa retval = 0; if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. 
*/ - write_lock_irq(&swapper_space.tree_lock); + spin_lock_irq(&swapper_space.tree_lock); if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - write_unlock_irq(&swapper_space.tree_lock); + spin_unlock_irq(&swapper_space.tree_lock); } spin_unlock(&swap_lock); Index: linux-2.6/include/asm-arm/cacheflush.h =================================================================== --- linux-2.6.orig/include/asm-arm/cacheflush.h +++ linux-2.6/include/asm-arm/cacheflush.h @@ -327,9 +327,9 @@ extern void flush_cache_page(struct vm_a extern void flush_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_user_range(vma,page,addr,len) \ flush_dcache_page(page) Index: linux-2.6/include/asm-parisc/cacheflush.h =================================================================== --- linux-2.6.orig/include/asm-parisc/cacheflush.h +++ linux-2.6/include/asm-parisc/cacheflush.h @@ -58,9 +58,9 @@ flush_user_icache_range(unsigned long st extern void flush_dcache_page(struct page *page); #define flush_dcache_mmap_lock(mapping) \ - write_lock_irq(&(mapping)->tree_lock) + spin_lock_irq(&(mapping)->tree_lock) #define flush_dcache_mmap_unlock(mapping) \ - write_unlock_irq(&(mapping)->tree_lock) + spin_unlock_irq(&(mapping)->tree_lock) #define flush_icache_page(vma,page) do { flush_kernel_dcache_page(page); flush_kernel_icache_page(page_address(page)); } while (0) Index: linux-2.6/drivers/mtd/devices/block2mtd.c =================================================================== --- linux-2.6.orig/drivers/mtd/devices/block2mtd.c +++ linux-2.6/drivers/mtd/devices/block2mtd.c @@ -59,28 +59,27 @@ static void cache_readahead(struct addre end_index = ((isize - 1) >> PAGE_CACHE_SHIFT); - read_lock_irq(&mapping->tree_lock); for (i = 0; i < PAGE_READAHEAD; i++) { pagei = index + i; if (pagei > end_index) { INFO("Overrun end of disk in cache readahead\n"); break; } + /* Don't need mapping->tree_lock - lookup can be racy */ + rcu_read_lock(); page = radix_tree_lookup(&mapping->page_tree, pagei); + rcu_read_unlock(); if (page && (!i)) break; if (page) continue; - read_unlock_irq(&mapping->tree_lock); page = page_cache_alloc_cold(mapping); - read_lock_irq(&mapping->tree_lock); if (!page) break; page->index = pagei; list_add(&page->lru, &page_pool); ret++; } - read_unlock_irq(&mapping->tree_lock); if (ret) read_cache_pages(mapping, &page_pool, filler, NULL); } Index: linux-2.6/mm/hugetlb.c =================================================================== --- linux-2.6.orig/mm/hugetlb.c +++ linux-2.6/mm/hugetlb.c @@ -188,7 +188,7 @@ int hugetlb_extend_reservation(struct hu int ret = 0; spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); + spin_lock_irq(&inode->i_mapping->tree_lock); if (info->prereserved_hpages >= atleast) goto out; @@ -207,7 +207,7 @@ int hugetlb_extend_reservation(struct hu info->prereserved_hpages = atleast; out: - read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); return ret; @@ -230,7 +230,7 @@ void hugetlb_truncate_reservation(struct struct page *page; spin_lock(&hugetlb_lock); - read_lock_irq(&inode->i_mapping->tree_lock); + spin_lock_irq(&inode->i_mapping->tree_lock); if (info->prereserved_hpages <= atmost) 
goto out; @@ -251,7 +251,7 @@ void hugetlb_truncate_reservation(struct info->prereserved_hpages = atmost; out: - read_unlock_irq(&inode->i_mapping->tree_lock); + spin_unlock_irq(&inode->i_mapping->tree_lock); spin_unlock(&hugetlb_lock); }
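
For reference, the protocol that the PG_nonewrefs and speculative-reference changes above implement can be written out as a stand-alone sketch. This is illustrative only, not code from the patch: speculative_get() and try_remove_from_pagecache() are made-up names standing in for page_cache_get_speculative() and remove_mapping(), the PageDirty/swapcache handling is omitted, and the page_count(page) == 2 test assumes the caller holds exactly one reference besides the pagecache's, as remove_mapping() does. Everything the sketch uses (get_page_unless_zero(), Set/ClearPageNoNewRefs(), VM_BUG_ON(), the spinlock tree_lock) is introduced or converted by this patch.

/*
 * Reader side, simplified from page_cache_get_speculative() above.
 * @pagep is a slot in the mapping's radix tree, obtained under
 * rcu_read_lock() via radix_tree_lookup_slot()/_gang_lookup_slot().
 */
static struct page *speculative_get(struct page **pagep)
{
	struct page *page;

again:
	page = rcu_dereference(*pagep);
	if (unlikely(!page))
		return NULL;

	if (unlikely(!get_page_unless_zero(page)))
		goto again;	/* page was freed (and possibly reused) */

	/*
	 * Wait for any writer currently testing the refcount;
	 * get_page_unless_zero() ordered our increment before this load.
	 */
	while (unlikely(PageNoNewRefs(page)))
		cpu_relax();

	smp_rmb();	/* load page->flags before reloading *pagep */

	if (unlikely(page != *pagep)) {
		/* The page was removed (or replaced) under us. */
		put_page(page);
		goto again;
	}
	return page;
}

/*
 * Writer side, simplified from remove_mapping(): the refcount is tested
 * and the page removed while PG_nonewrefs is set, so a speculative
 * reference can never be promoted to a stable one in between.  In the
 * real code the tree_lock acquire/release provide the barriers that the
 * pagemap.h comment writes as wmb().
 */
static int try_remove_from_pagecache(struct address_space *mapping,
				     struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	SetPageNoNewRefs(page);
	spin_lock_irq(&mapping->tree_lock);
	if (unlikely(page_count(page) != 2)) {	/* caller + pagecache only */
		spin_unlock_irq(&mapping->tree_lock);
		ClearPageNoNewRefs(page);
		return 0;
	}
	__remove_from_page_cache(page);
	spin_unlock_irq(&mapping->tree_lock);
	ClearPageNoNewRefs(page);
	__put_page(page);		/* drop the pagecache reference */
	return 1;
}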