diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/Documentation/sysctl/vm.txt linux-2.6.14-ck2/Documentation/sysctl/vm.txt --- linux-2.6.14-ck1/Documentation/sysctl/vm.txt 2005-11-03 00:02:31.000000000 +1100 +++ linux-2.6.14-ck2/Documentation/sysctl/vm.txt 2005-11-03 00:02:46.000000000 +1100 @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/ - laptop_mode - block_dump - swap_prefetch +- readahead_ratio ============================================================== @@ -114,3 +115,16 @@ except when laptop_mode is enabled and t Setting it to 0 disables prefetching entirely. The default value is dependant on ramsize. + +============================================================== + +readahead_ratio + +This limits the read-ahead size to a percentage of the thrashing-threshold. +The thrashing-threshold is dynamically estimated according to the +_history_ read speed and system load, and is used to limit the +_future_ read-ahead request size. So you should set it to a low +value if you do not have enough memory to counteract the I/O load +fluctuation. + +The default value is 50. diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/fs/buffer.c linux-2.6.14-ck2/fs/buffer.c --- linux-2.6.14-ck1/fs/buffer.c 2005-10-28 20:22:00.000000000 +1000 +++ linux-2.6.14-ck2/fs/buffer.c 2005-11-03 00:02:46.000000000 +1100 @@ -504,7 +504,7 @@ static void free_more_memory(void) for_each_pgdat(pgdat) { zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS); + try_to_free_pages(zones, GFP_NOFS, NULL); } } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/fs.h linux-2.6.14-ck2/include/linux/fs.h --- linux-2.6.14-ck1/include/linux/fs.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/fs.h 2005-11-03 00:02:46.000000000 +1100 @@ -570,28 +570,15 @@ struct file_ra_state { unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned long age; unsigned long la_index; unsigned long ra_index; unsigned long lookahead_index; unsigned long readahead_index; - unsigned long nr_page_aging; }; #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ -#define RA_CLASS_SHIFT 3 -#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) -enum file_ra_class { /* the same order must be kept in page_state */ - RA_CLASS_NEWFILE = 1, - RA_CLASS_STATE, - RA_CLASS_CONTEXT, - RA_CLASS_CONTEXT_ACCELERATED, - RA_CLASS_BACKWARD, - /* RA_CLASS_AROUND, */ - RA_CLASS_RANDOM, - RA_CLASS_END, -}; - struct file { struct list_head f_list; struct dentry *f_dentry; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mm.h linux-2.6.14-ck2/include/linux/mm.h --- linux-2.6.14-ck1/include/linux/mm.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mm.h 2005-11-03 00:02:46.000000000 +1100 @@ -880,6 +880,9 @@ int write_one_page(struct page *page, in #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before * turning readahead off */ +/* turn on read-ahead thrashing protection if (readahead_ratio >= ##) */ +#define VM_READAHEAD_PROTECT_RATIO 80 + int do_page_cache_readahead(struct address_space *mapping, struct file *filp, unsigned long offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -900,8 +903,7 @@ page_cache_readahead_adaptive(struct add unsigned long 
index, unsigned long last_index); void fastcall ra_access(struct file_ra_state *ra, struct page *page); int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list); -int save_chunk(struct page *head, struct page *tail, - struct list_head *save_list); + /* Do stack extension */ extern int expand_stack(struct vm_area_struct * vma, unsigned long address); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mm_inline.h linux-2.6.14-ck2/include/linux/mm_inline.h --- linux-2.6.14-ck1/include/linux/mm_inline.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mm_inline.h 2004-03-11 21:29:27.000000000 +1100 @@ -11,7 +11,6 @@ add_page_to_inactive_list(struct zone *z { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; - zone->nr_page_aging++; } static inline void diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mmzone.h linux-2.6.14-ck2/include/linux/mmzone.h --- linux-2.6.14-ck1/include/linux/mmzone.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mmzone.h 2005-11-03 00:02:46.000000000 +1100 @@ -316,7 +316,7 @@ void __get_zone_counts(unsigned long *ac void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); void build_all_zonelists(void); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int alloc_type, int can_try_harder, int gfp_high); @@ -342,11 +342,13 @@ unsigned long __init node_memmap_size_by #define PAGE_AGE_MASK ((1 << PAGE_AGE_SHIFT) - 1) /* - * The percent of pages in inactive_list that have been scanned / aged. - * It's not really ##%, but a high resolution normalized value. + * Keep track of the percent of pages in inactive_list that have been scanned + * / aged. It's not really ##%, but a high resolution normalized value. 
*/ -static inline void update_page_age(struct zone *z) +static inline void update_page_age(struct zone *z, int nr_scan) { + z->nr_page_aging += nr_scan; + if (z->nr_page_aging - z->aging_milestone > z->nr_inactive) z->aging_milestone += z->nr_inactive; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/page-flags.h linux-2.6.14-ck2/include/linux/page-flags.h --- linux-2.6.14-ck1/include/linux/page-flags.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/page-flags.h 2005-11-03 00:02:46.000000000 +1100 @@ -106,6 +106,8 @@ struct page_state { unsigned long pgfree; /* page freeings */ unsigned long pgactivate; /* pages moved inactive->active */ unsigned long pgdeactivate; /* pages moved active->inactive */ + unsigned long pgkeephot; /* pages sent back to active */ + unsigned long pgkeepcold; /* pages sent back to inactive */ unsigned long pgfault; /* faults (major+minor) */ unsigned long pgmajfault; /* faults (major only) */ @@ -133,63 +135,6 @@ struct page_state { unsigned long pgrotated; /* pages rotated to tail of the LRU */ unsigned long nr_bounce; /* pages for bounce buffers */ - - unsigned long cache_miss; /* read cache misses */ - unsigned long readrandom; /* random reads */ - unsigned long pgreadrandom; /* random read pages */ - unsigned long readahead_rescue; /* read-aheads rescued*/ - unsigned long pgreadahead_rescue; - unsigned long readahead_end; /* read-aheads passed EOF */ - - unsigned long readahead; /* read-aheads issued */ - unsigned long readahead_return; /* look-ahead marks returned */ - unsigned long readahead_eof; /* read-aheads stop at EOF */ - unsigned long pgreadahead; /* read-ahead pages issued */ - unsigned long pgreadahead_hit; /* read-ahead pages accessed */ - unsigned long pgreadahead_eof; - - unsigned long ra_newfile; /* read-ahead on start of file */ - unsigned long ra_newfile_return; - unsigned long ra_newfile_eof; - unsigned long pgra_newfile; - unsigned long pgra_newfile_hit; - unsigned long pgra_newfile_eof; - - unsigned long ra_state; /* state based read-ahead */ - unsigned long ra_state_return; - unsigned long ra_state_eof; - unsigned long pgra_state; - unsigned long pgra_state_hit; - unsigned long pgra_state_eof; - - unsigned long ra_context; /* context based read-ahead */ - unsigned long ra_context_return; - unsigned long ra_context_eof; - unsigned long pgra_context; - unsigned long pgra_context_hit; - unsigned long pgra_context_eof; - - unsigned long ra_contexta; /* accelerated context based read-ahead */ - unsigned long ra_contexta_return; - unsigned long ra_contexta_eof; - unsigned long pgra_contexta; - unsigned long pgra_contexta_hit; - unsigned long pgra_contexta_eof; - - unsigned long ra_backward; /* prefetch pages for backward reading */ - unsigned long ra_backward_return; - unsigned long ra_backward_eof; - unsigned long pgra_backward; - unsigned long pgra_backward_hit; - unsigned long pgra_backward_eof; - - unsigned long ra_random; /* read-ahead on seek-and-read-pages */ - unsigned long ra_random_return; - unsigned long ra_random_eof; - unsigned long pgra_random; - unsigned long pgra_random_hit; - unsigned long pgra_random_eof; - }; extern void get_page_state(struct page_state *ret); @@ -374,9 +319,7 @@ extern void __mod_page_state(unsigned lo #define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) #define SetPageReadahead(page) set_bit(PG_readahead, &(page)->flags) -#define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags) #define TestClearPageReadahead(page) 
test_and_clear_bit(PG_readahead, &(page)->flags) -#define TestSetPageReadahead(page) test_and_set_bit(PG_readahead, &(page)->flags) struct page; /* forward declaration */ @@ -394,4 +337,28 @@ static inline void set_page_writeback(st test_set_page_writeback(page); } +#if PG_activate < PG_referenced +#error unexpected page flags order +#endif + +#define PAGE_REFCNT_0 0 +#define PAGE_REFCNT_1 (1 << PG_referenced) +#define PAGE_REFCNT_2 (1 << PG_activate) +#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 + +/* + * STATUS REFERENCE COUNT + * __ 0 + * _R PAGE_REFCNT_1 + * A_ PAGE_REFCNT_2 + * AR PAGE_REFCNT_3 + * + * A/R: Active / Referenced + */ +static inline unsigned long page_refcnt(struct page *page) +{ + return page->flags & PAGE_REFCNT_MASK; +} + #endif /* PAGE_FLAGS_H */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/radix-tree.h linux-2.6.14-ck2/include/linux/radix-tree.h --- linux-2.6.14-ck1/include/linux/radix-tree.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/radix-tree.h 2005-11-03 00:02:46.000000000 +1100 @@ -22,12 +22,39 @@ #include #include +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT 6 +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif +#define RADIX_TREE_TAGS 2 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; +}; + struct radix_tree_root { unsigned int height; unsigned int gfp_mask; struct radix_tree_node *rnode; }; +/* + * Support access patterns with strong locality. + */ +struct radix_tree_cache { + unsigned long first_index; + struct radix_tree_node *tree_node; +}; + #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ @@ -45,10 +72,13 @@ do { \ } while (0) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_delete(struct radix_tree_root *, unsigned long); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, unsigned int); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); @@ -70,19 +100,119 @@ static inline void radix_tree_preload_en preempt_enable(); } -/* - * Support access patterns with locality. +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. 
*/ -struct radix_tree_cache { - unsigned long first_index; - struct radix_tree_node *tree_node; -}; +static inline void *radix_tree_lookup(struct radix_tree_root *root, + unsigned long index) +{ + return radix_tree_lookup_node(root, index, 0); +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the slot corresponding to the position @index in the radix tree + * @root. This is useful for update-if-exists operations. + */ +static inline void **radix_tree_lookup_slot(struct radix_tree_root *root, + unsigned long index) +{ + struct radix_tree_node *node; + + node = radix_tree_lookup_node(root, index, 1); + return node->slots + (index & RADIX_TREE_MAP_MASK); +} -void radix_tree_cache_init(struct radix_tree_cache *cache); -void *radix_tree_cache_lookup(struct radix_tree_root *, - struct radix_tree_cache *, unsigned long); -int radix_tree_cache_size(struct radix_tree_cache *cache); -int radix_tree_cache_count(struct radix_tree_cache *cache); -int radix_tree_cache_first_index(struct radix_tree_cache *cache); +/** + * radix_tree_cache_lookup_node - cached lookup node + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root, + * and return the node @level levels from the bottom in the search path. + * @cache stores the last accessed upper level tree node by this + * function, and is always checked first before searching in the tree. + * It can improve speed for access patterns with strong locality. + * NOTE: + * - The cache becomes invalid on leaving the lock; + * - Do not intermix calls with different @level. + */ +static inline void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level) +{ + struct radix_tree_node *node; + unsigned long i; + unsigned long mask; + + if (level && level >= root->height) + return root->rnode; + + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK); + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1); + + if ((index & mask) == cache->first_index) + return cache->tree_node->slots[i]; + + node = radix_tree_lookup_node(root, index, level + 1); + if (!node) + return 0; + + cache->tree_node = node; + cache->first_index = (index & mask); + return node->slots[i]; +} + +/** + * radix_tree_cache_lookup - cached lookup page + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. 
+ */ +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index) +{ + return radix_tree_cache_lookup_node(root, cache, index, 0); +} + +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) +{ + cache->first_index = 0x77; + cache->tree_node = NULL; /* just to kill gcc warning */ +} + +static inline int radix_tree_cache_size(struct radix_tree_cache *cache) +{ + return RADIX_TREE_MAP_SIZE; +} + +static inline int radix_tree_cache_count(struct radix_tree_cache *cache) +{ + if (cache->first_index != 0x77) + return cache->tree_node->count; + else + return 0; +} + +static inline int radix_tree_cache_full(struct radix_tree_cache *cache) +{ + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache); +} + +static inline int radix_tree_cache_first_index(struct radix_tree_cache *cache) +{ + return cache->first_index; +} #endif /* _LINUX_RADIX_TREE_H */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/swap.h linux-2.6.14-ck2/include/linux/swap.h --- linux-2.6.14-ck1/include/linux/swap.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/swap.h 2005-11-03 00:02:46.000000000 +1100 @@ -171,7 +171,8 @@ extern int rotate_reclaimable_page(struc extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int); +extern int try_to_free_pages(struct zone **, unsigned int, + struct task_struct *p); extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); extern int vm_mapped; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/sysctl.h linux-2.6.14-ck2/include/linux/sysctl.h --- linux-2.6.14-ck1/include/linux/sysctl.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/sysctl.h 2005-11-03 00:02:46.000000000 +1100 @@ -183,9 +183,9 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ - VM_HARDMAPLIMIT=29, /* Make mapped a hard limit */ - VM_SWAP_PREFETCH=30, /* int: amount to swap prefetch */ - VM_READAHEAD_RATIO=31, /* percent of read-ahead size to thrashing-threshold */ + VM_SWAP_PREFETCH=29, /* int: amount to swap prefetch */ + VM_READAHEAD_RATIO=30, /* percent of read-ahead size to thrashing-threshold */ + VM_HARDMAPLIMIT=31, /* Make mapped a hard limit */ }; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/writeback.h linux-2.6.14-ck2/include/linux/writeback.h --- linux-2.6.14-ck1/include/linux/writeback.h 2005-10-28 20:22:02.000000000 +1000 +++ linux-2.6.14-ck2/include/linux/writeback.h 2005-11-03 00:02:46.000000000 +1100 @@ -90,6 +90,12 @@ void laptop_io_completion(void); void laptop_sync_completion(void); void throttle_vm_writeout(void); +extern struct timer_list laptop_mode_wb_timer; +static inline int laptop_spinned_down(void) +{ + return !timer_pending(&laptop_mode_wb_timer); +} + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern int vm_dirty_ratio; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/kernel/sched.c linux-2.6.14-ck2/kernel/sched.c --- linux-2.6.14-ck1/kernel/sched.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/kernel/sched.c 2005-11-03 00:02:46.000000000 +1100 @@ -16,9 +16,9 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. 
* 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2005-09-16 New staircase scheduling policy by Con Kolivas with help + * 2005-11-02 New staircase scheduling policy by Con Kolivas with help * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. - * Staircase v12.1 + * Staircase v12.2 */ #include @@ -779,7 +779,11 @@ static inline void recalc_task_prio(task NS_TO_JIFFIES(sleep_time) < p->slice) { p->flags &= ~PF_NONSLEEP; dec_burst(p); - p->totalrun += sleep_time - JIFFIES_TO_NS(p->slice); + p->totalrun -= JIFFIES_TO_NS(p->slice); + if (sleep_time > p->totalrun) + p->totalrun = 0; + else + p->totalrun -= sleep_time; goto out; } @@ -3431,6 +3435,8 @@ void set_user_nice(task_t *p, long nice) delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); p->prio += delta; + if (p->burst > burst(p)) + p->burst = burst(p); if (queued) { enqueue_task(p, rq); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/lib/radix-tree.c linux-2.6.14-ck2/lib/radix-tree.c --- linux-2.6.14-ck1/lib/radix-tree.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/lib/radix-tree.c 2005-11-03 00:02:46.000000000 +1100 @@ -32,25 +32,6 @@ #include -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif -#define RADIX_TREE_TAGS 2 - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int count; - void *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; -}; - struct radix_tree_path { struct radix_tree_node *node; int offset; @@ -134,6 +115,7 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } +EXPORT_SYMBOL(radix_tree_preload); static inline void tag_set(struct radix_tree_node *node, int tag, int offset) { @@ -281,6 +263,19 @@ int radix_tree_insert(struct radix_tree_ } EXPORT_SYMBOL(radix_tree_insert); +/** + * radix_tree_lookup_node - low level lookup routine + * @root: radix tree root + * @index: index key + * @level: stop at that many levels from bottom + * + * Lookup the item at the position @index in the radix tree @root. + * The return value is: + * @level == 0: page at @index; + * @level == 1: the corresponding bottom level tree node; + * @level < height: (height - @level)th level tree node; + * @level >= height: root node. + */ void *radix_tree_lookup_node(struct radix_tree_root *root, unsigned long index, unsigned int level) { @@ -308,63 +303,111 @@ void *radix_tree_lookup_node(struct radi EXPORT_SYMBOL(radix_tree_lookup_node); /** - * radix_tree_lookup - perform lookup operation on a radix tree + * radix_tree_lookup_head - lookup the head index * @root: radix tree root * @index: index key + * @max_scan: max items to scan * - * Lookup the item at the position @index in the radix tree @root. + * Lookup head index of the segment which contains @index. A segment is + * a set of continuous pages in a file. 
+ * CASE RETURN VALUE + * no page at @index (not head) = @index + 1 + * found in the range @index - @max_scan < (head index) <= @index + * not found in range (unfinished head) <= @index - @max_scan */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) -{ - return radix_tree_lookup_node(root, index, 0); -} -EXPORT_SYMBOL(radix_tree_lookup); - -void *radix_tree_cache_lookup(struct radix_tree_root *root, - struct radix_tree_cache *cache, - unsigned long index) +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) { + struct radix_tree_cache cache; struct radix_tree_node *node; + int i; + unsigned long origin; - if ((index & ~RADIX_TREE_MAP_MASK) == cache->first_index) - return cache->tree_node->slots[index & RADIX_TREE_MAP_MASK]; + origin = index; + if (unlikely(max_scan > index)) + max_scan = index; + radix_tree_cache_init(&cache); + +next_node: + if (origin - index > max_scan) + goto out; - node = radix_tree_lookup_node(root, index, 1); + node = radix_tree_cache_lookup_node(root, &cache, index, 1); if (!node) - return 0; + goto out; - cache->tree_node = node; - cache->first_index = (index & ~RADIX_TREE_MAP_MASK); - return node->slots[index & RADIX_TREE_MAP_MASK]; -} -EXPORT_SYMBOL(radix_tree_cache_lookup); + if (node->count == RADIX_TREE_MAP_SIZE) { + if (index < RADIX_TREE_MAP_SIZE) { + index = -1; + goto out; + } + index = (index - RADIX_TREE_MAP_SIZE) | RADIX_TREE_MAP_MASK; + goto next_node; + } -void radix_tree_cache_init(struct radix_tree_cache *cache) -{ - cache->first_index = 1; -} -EXPORT_SYMBOL(radix_tree_cache_init); + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) { + if (!node->slots[i]) + goto out; + } -int radix_tree_cache_size(struct radix_tree_cache *cache) -{ - return RADIX_TREE_MAP_SIZE; -} -EXPORT_SYMBOL(radix_tree_cache_size); + goto next_node; -int radix_tree_cache_count(struct radix_tree_cache *cache) -{ - if (cache->first_index != 1) - return cache->tree_node->count; - else - return 0; +out: + return index + 1; } -EXPORT_SYMBOL(radix_tree_cache_count); +EXPORT_SYMBOL(radix_tree_lookup_head); -int radix_tree_cache_first_index(struct radix_tree_cache *cache) +/** + * radix_tree_lookup_tail - lookup the tail index + * @root: radix tree root + * @index: index key + * @max_scan: max items to scan + * + * Lookup tail(pass the end) index of the segment which contains @index. + * A segment is a set of continuous pages in a file. 
+ * CASE RETURN VALUE + * found in the range @index <= (tail index) < @index + @max_scan + * not found in range @index + @max_scan <= (non tail) + */ +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) { - return cache->first_index; + struct radix_tree_cache cache; + struct radix_tree_node *node; + int i; + unsigned long origin; + + origin = index; + if (unlikely(index + max_scan < index)) + max_scan = LONG_MAX - index; + radix_tree_cache_init(&cache); + +next_node: + if (index - origin >= max_scan) + goto out; + + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + goto out; + + if (node->count == RADIX_TREE_MAP_SIZE) { + index = (index | RADIX_TREE_MAP_MASK) + 1; + if (unlikely(!index)) + goto out; + goto next_node; + } + + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++, index++) { + if (!node->slots[i]) + goto out; + } + + goto next_node; + +out: + return index; } -EXPORT_SYMBOL(radix_tree_cache_first_index); +EXPORT_SYMBOL(radix_tree_lookup_tail); /** * radix_tree_tag_set - set a tag on a radix tree node diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/Makefile linux-2.6.14-ck2/Makefile --- linux-2.6.14-ck1/Makefile 2005-11-03 00:02:31.000000000 +1100 +++ linux-2.6.14-ck2/Makefile 2005-11-03 00:02:46.000000000 +1100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 14 -EXTRAVERSION =-ck1 +EXTRAVERSION =-ck2 NAME=Cognac Affected Albatross # *DOCUMENTATION* diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/filemap.c linux-2.6.14-ck2/mm/filemap.c --- linux-2.6.14-ck1/mm/filemap.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/filemap.c 2005-11-03 00:02:46.000000000 +1100 @@ -765,7 +765,7 @@ void do_generic_mapping_read(struct addr nr = nr - offset; cond_resched(); - + if (readahead_ratio <= 9 && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); @@ -794,6 +794,7 @@ find_page: if (prev_page) page_cache_release(prev_page); prev_page = page; + ra_access(&ra, page); if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -810,7 +811,6 @@ page_ok: * in succession, only mark it as accessed the first time. */ if (prev_index != index) { - ra_access(&ra, page); mark_page_accessed(page); } prev_index = index; @@ -1309,6 +1309,8 @@ retry_find: if (!did_readaround) ra->mmap_hit++; + ra_access(ra, page); + /* * Ok, found a page in the page cache, now we need to check * that it's up-to-date. @@ -1320,10 +1322,11 @@ success: /* * Found the page and have a reference on it. */ - ra_access(ra, page); mark_page_accessed(page); if (type) *type = majmin; + if (readahead_ratio > 9) + ra->prev_page = page->index; return page; outside_data_content: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/page_alloc.c linux-2.6.14-ck2/mm/page_alloc.c --- linux-2.6.14-ck1/mm/page_alloc.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/page_alloc.c 2005-11-03 00:02:46.000000000 +1100 @@ -110,10 +110,9 @@ static void bad_page(const char *functio 1 << PG_private | 1 << PG_locked | 1 << PG_active | - 1 << PG_activate| 1 << PG_dirty | 1 << PG_reclaim | - 1 << PG_slab | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -820,11 +819,11 @@ __alloc_pages(gfp_t gfp_mask, unsigned i classzone_idx = zone_idx(zones[0]); -restart: /* * Go through the zonelist once, looking for a zone with enough free. 
* See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ +restart: /* * To fulfill three goals: * - balanced page aging @@ -904,7 +903,7 @@ zone_reclaim_retry: goto got_pg; try_harder: - wakeup_kswapd(z, order); + wakeup_kswapd(z, order, p); /* * Put stress on the zone. Let __GFP_HIGH and allocations @@ -956,7 +955,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask); + did_some_progress = try_to_free_pages(zones, gfp_mask, p); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -2282,6 +2281,8 @@ static char *vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pgkeephot", + "pgkeepcold", "pgfault", "pgmajfault", @@ -2309,63 +2310,6 @@ static char *vmstat_text[] = { "pgrotated", "nr_bounce", - - "cache_miss", - "readrandom", - "pgreadrandom", - "readahead_rescue", - "pgreadahead_rescue", - "readahead_end", - - "readahead", - "readahead_return", - "readahead_eof", - "pgreadahead", - "pgreadahead_hit", - "pgreadahead_eof", - - "ra_newfile", - "ra_newfile_return", - "ra_newfile_eof", - "pgra_newfile", - "pgra_newfile_hit", - "pgra_newfile_eof", - - "ra_state", - "ra_state_return", - "ra_state_eof", - "pgra_state", - "pgra_state_hit", - "pgra_state_eof", - - "ra_context", - "ra_context_return", - "ra_context_eof", - "pgra_context", - "pgra_context_hit", - "pgra_context_eof", - - "ra_contexta", - "ra_contexta_return", - "ra_contexta_eof", - "pgra_contexta", - "pgra_contexta_hit", - "pgra_contexta_eof", - - "ra_backward", - "ra_backward_return", - "ra_backward_eof", - "pgra_backward", - "pgra_backward_hit", - "pgra_backward_eof", - - "ra_random", - "ra_random_return", - "ra_random_eof", - "pgra_random", - "pgra_random_hit", - "pgra_random_eof", - }; static void *vmstat_start(struct seq_file *m, loff_t *pos) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/page-writeback.c linux-2.6.14-ck2/mm/page-writeback.c --- linux-2.6.14-ck1/mm/page-writeback.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/page-writeback.c 2005-11-03 00:02:46.000000000 +1100 @@ -369,7 +369,7 @@ static void wb_timer_fn(unsigned long un static void laptop_timer_fn(unsigned long unused); static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/readahead.c linux-2.6.14-ck2/mm/readahead.c --- linux-2.6.14-ck1/mm/readahead.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/readahead.c 2005-11-03 00:02:46.000000000 +1100 @@ -14,56 +14,256 @@ #include #include #include +#include +/* Set look-ahead size to 1/8 of the thrashing-threshold. */ +#define LOOKAHEAD_RATIO 8 + +/* Set read-ahead size to ##% of the thrashing-threshold. */ +int readahead_ratio = 50; +EXPORT_SYMBOL(readahead_ratio); + +/* Analog to nr_page_aging. + * But mainly increased on fresh page references, so is much more smoother. + */ +DEFINE_PER_CPU(unsigned long, smooth_aging); +EXPORT_PER_CPU_SYMBOL(smooth_aging); + +/* Detailed classification of read-ahead behaviors. 
*/ +#define RA_CLASS_SHIFT 3 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) +enum ra_class { + RA_CLASS_ALL, + RA_CLASS_NEWFILE, + RA_CLASS_STATE, + RA_CLASS_CONTEXT, + RA_CLASS_CONTEXT_ACCELERATED, + RA_CLASS_BACKWARD, + RA_CLASS_RANDOM_THRASHING, + RA_CLASS_RANDOM_SEEK, + RA_CLASS_END, +}; + +/* Read-ahead events to be accounted. */ +enum ra_event { + RA_EVENT_CACHE_MISS, /* read cache misses */ + RA_EVENT_READRANDOM, /* random reads */ + RA_EVENT_IO_CONGESTION, /* io congestion */ + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */ + RA_EVENT_IO_BLOCK, /* read on locked page */ + + RA_EVENT_READAHEAD, /* read-ahead issued */ + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */ + RA_EVENT_LOOKAHEAD, /* look-ahead issued */ + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */ + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */ + RA_EVENT_READAHEAD_SHRINK, /* ra_size decreased, reflects var. */ + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */ + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */ + + RA_EVENT_END +}; + +/* + * Debug facilities. + */ +#ifdef CONFIG_DEBUG_FS #define DEBUG_READAHEAD +#endif #ifdef DEBUG_READAHEAD +#include +#include +#include +#include + +static char *ra_class_name[] = { + "total", + "newfile", + "state", + "context", + "contexta", + "backward", + "onthrash", + "onraseek", + "none", +}; + +static char *ra_event_name[] = { + "cache_miss", + "read_random", + "io_congestion", + "io_cache_hit", + "io_block", + "readahead", + "readahead_hit", + "lookahead", + "lookahead_hit", + "readahead_eof", + "readahead_shrink", + "readahead_thrash", + "readahead_rescue", +}; + +static unsigned long ra_event_count[RA_CLASS_END+1][RA_EVENT_END][2]; + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ + enum ra_class c; + + c = (ra ? 
ra->flags & RA_CLASS_MASK : RA_CLASS_END); + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) { + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK; + pages = -pages; + } + if (!c) + c = RA_CLASS_END; + BUG_ON(c > RA_CLASS_END); + + ra_event_count[c][e][0] += 1; + ra_event_count[c][e][1] += pages; +} + +static int ra_account_show(struct seq_file *s, void *_) +{ + int i; + int c; + int e; + static char event_fmt[] = "%-16s"; + static char class_fmt[] = "%11s"; + static char item_fmt[] = "%11lu"; + static char percent_format[] = "%10lu%%"; + static char *table_name[] = { + "[table requests]", + "[table pages]", + "[table summary]"}; + + for (i = 0; i <= 1; i++) { + for (e = 0; e < RA_EVENT_END; e++) { + ra_event_count[0][e][i] = 0; + for (c = 1; c <= RA_CLASS_END; c++) + ra_event_count[0][e][i] += + ra_event_count[c][e][i]; + } + + seq_printf(s, event_fmt, table_name[i]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + for (e = 0; e < RA_EVENT_END; e++) { + if (e == RA_EVENT_READAHEAD_HIT && i == 0) + continue; + + seq_printf(s, event_fmt, ra_event_name[e]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + ra_event_count[c][e][i]); + seq_puts(s, "\n"); + } + seq_puts(s, "\n"); + } + + seq_printf(s, event_fmt, table_name[2]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "random_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READRANDOM][0] * 100) / + (ra_event_count[c][RA_EVENT_READRANDOM][0] + + ra_event_count[c][RA_EVENT_READAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "ra_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READAHEAD_HIT][1] * 100) / + (ra_event_count[c][RA_EVENT_READAHEAD][1] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "la_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_ra_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_READAHEAD][1] + + ra_event_count[c][RA_EVENT_READAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_READAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_la_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_LOOKAHEAD][1] + + ra_event_count[c][RA_EVENT_LOOKAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] + 1)); + seq_puts(s, "\n"); + + return 0; +} + +static struct dentry *readahead_dentry; + +static int ra_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, ra_account_show, NULL); +} + +static ssize_t ra_debug_write(struct file *file, const char __user *buf, + size_t size, loff_t *offset) +{ + if (file->f_dentry == readahead_dentry) + memset(ra_event_count, 0, sizeof(ra_event_count)); + return 1; +} + +static struct file_operations ra_debug_fops = { + .owner = THIS_MODULE, + .open = ra_debug_open, + .write = ra_debug_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init readahead_init(void) +{ + readahead_dentry = debugfs_create_file("readahead", + 0644, NULL, NULL, &ra_debug_fops); + return 0; +} + +module_init(readahead_init) + #define 
dprintk(args...) \ if (readahead_ratio & 1) printk(KERN_DEBUG args) #define ddprintk(args...) \ if ((readahead_ratio & 3) == 3) printk(KERN_DEBUG args) -#define ra_account_page(ra, member, delta) do { \ - unsigned long opg = offsetof(struct page_state, pgreadahead) - \ - offsetof(struct page_state, readahead); \ - unsigned long o1 = offsetof(struct page_state, member); \ - unsigned long o2 = o1 + 2 * opg * ((ra)->flags & RA_CLASS_MASK);\ - BUG_ON(opg + o2 >= sizeof(struct page_state)); \ - __mod_page_state(o1, 1UL); \ - __mod_page_state(o2, 1UL); \ - __mod_page_state(opg + o1, (delta)); \ - __mod_page_state(opg + o2, (delta)); \ -} while (0) - -#define ra_account(member, class, delta) do { \ - unsigned long opg = offsetof(struct page_state, pgreadahead) - \ - offsetof(struct page_state, readahead); \ - unsigned long o1 = offsetof(struct page_state, member); \ - unsigned long o2 = o1 + 2 * opg * (class); \ - if ((class) >= RA_CLASS_END) \ - break; \ - BUG_ON(o2 >= sizeof(struct page_state)); \ - __mod_page_state(o1, (delta)); \ - __mod_page_state(o2, (delta)); \ -} while (0) - -#else -#undef inc_page_state -#undef mod_page_state -#define inc_page_state(a) do {} while(0) -#define mod_page_state(a, b) do {} while(0) +#else /* !DEBUG_READAHEAD */ + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ +} #define dprintk(args...) do {} while(0) #define ddprintk(args...) do {} while(0) -#define ra_account(member, class, delta) do {} while(0) -#define ra_account_page(member, class, delta) do {} while(0) -#endif -/* Set look-ahead size to 1/8 of the read-ahead size. */ -#define LOOKAHEAD_RATIO 8 +#endif /* DEBUG_READAHEAD */ -/* Set read-ahead size to ##% of the thrashing-threshold. */ -int readahead_ratio = 0; -EXPORT_SYMBOL(readahead_ratio); + +/* The default max/min read-ahead pages. */ +#define MAX_RA_PAGES (VM_MAX_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) +#define MIN_RA_PAGES (VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -71,7 +271,7 @@ void default_unplug_io_fn(struct backing EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -99,7 +299,7 @@ static inline unsigned long get_max_read static inline unsigned long get_min_readahead(struct file_ra_state *ra) { - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; + return MIN_RA_PAGES; } static inline void ra_off(struct file_ra_state *ra) @@ -326,7 +526,7 @@ __do_page_cache_readahead(struct address read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; - + if (page_offset > end_index) break; @@ -632,38 +832,48 @@ unsigned long max_sane_readahead(unsigne * In every read-ahead chunk, it selects one page and tag it with PG_readahead. * Later when the page with PG_readahead is to be read, the logic knows that * it's time to carry out the next read-ahead chunk in advance. - * + * * a read-ahead chunk - * +-----------------------------------------+ - * | # PG_readahead | - * +-----------------------------------------+ + * +-----------------------------------------+ + * | # PG_readahead | + * +-----------------------------------------+ * ^ When this page is read, we submit I/O for the next read-ahead. 
* * * Here are some variable names used frequently: * * |<------- la_size ------>| - * +-----------------------------------------+ - * | # | - * +-----------------------------------------+ + * +-----------------------------------------+ + * | # | + * +-----------------------------------------+ * ra_index -->|<---------------- ra_size -------------->| * */ -#define next_page(page) (list_entry((page)->lru.prev, struct page, lru)) -#define prev_page(page) (list_entry((page)->lru.next, struct page, lru)) +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) /* * The nature of read-ahead allows most tests to fail or even be wrong. * Here we just do not bother to call get_page(), it's meaningless anyway. */ +static inline struct page *__find_page(struct address_space *mapping, + unsigned long offset) +{ + return radix_tree_lookup(&mapping->page_tree, offset); +} + struct page *find_page(struct address_space *mapping, unsigned long offset) { struct page *page; read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = __find_page(mapping, offset); read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (page) + BUG_ON(page->index != offset); +#endif return page; } @@ -680,7 +890,7 @@ static int rescue_pages(struct page *pag BUG_ON(!nr_pages || !page); pgrescue = 0; - index = page->index; + index = page_index(page); mapping = page_mapping(page); dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", @@ -693,7 +903,8 @@ static int rescue_pages(struct page *pag if (!PageLRU(page)) goto out_unlock; - while (page_mapping(page) == mapping && page->index == index) { + while (page_mapping(page) == mapping && + page_index(page) == index) { struct page *the_page = page; page = next_page(page); if (!PageActive(the_page) && @@ -701,7 +912,6 @@ static int rescue_pages(struct page *pag !PageLocked(the_page) && page_count(the_page) == 1) { list_move(&the_page->lru, &zone->inactive_list); - zone->nr_page_aging++; pgrescue++; } index++; @@ -718,8 +928,7 @@ static int rescue_pages(struct page *pag out_unlock: spin_unlock_irq(&zone->lru_lock); out: - inc_page_state(readahead_rescue); - mod_page_state(pgreadahead_rescue, pgrescue); + ra_account(0, RA_EVENT_READAHEAD_RESCUE, pgrescue); return nr_pages ? index : 0; } @@ -734,7 +943,7 @@ out: * | # | # | * +---------------------------+-------------------------------------------+ * ^ ^ ^ ^ - * la_index ra_index lookahead_index readahead_index + * la_index ra_index lookahead_index readahead_index */ /* @@ -754,16 +963,16 @@ static unsigned long nr_free_inactive(vo } /* - * The accumulated count of pages pushed into inactive_list(s). + * A much smoother analog to nr_page_aging. 
*/ -static unsigned long nr_page_aging(void) +static unsigned long nr_smooth_aging(void) { - unsigned int i; + unsigned long cpu; unsigned long sum = 0; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + cpumask_t mask = node_to_cpumask(numa_node_id()); - for (i = 0; i < MAX_NR_ZONES; i++) - sum += zones[i].nr_page_aging; + for_each_cpu_mask(cpu, mask) + sum += per_cpu(smooth_aging, cpu); return sum; } @@ -772,7 +981,7 @@ static unsigned long nr_page_aging(void) * Set class of read-ahead */ static inline void set_ra_class(struct file_ra_state *ra, - enum file_ra_class ra_class) + enum ra_class ra_class) { ra->flags <<= RA_CLASS_SHIFT; ra->flags += ra_class; @@ -788,6 +997,11 @@ static inline int ra_cache_hit(struct fi return (ra->cache_hit >> (nr * 16)) & 0xFFFF; } +/* + * Something like: + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0); + * ra_cache_hit(ra, 0) = 0; + */ static inline void ra_addup_cache_hit(struct file_ra_state *ra) { int n; @@ -838,12 +1052,17 @@ static inline void ra_state_init(struct static inline void ra_state_update(struct file_ra_state *ra, unsigned long ra_size, unsigned long la_size) { +#ifdef DEBUG_READAHEAD + unsigned long old_ra = ra->readahead_index - ra->ra_index; + if (ra_size < old_ra && ra_cache_hit(ra, 0)) + ra_account(ra, RA_EVENT_READAHEAD_SHRINK, old_ra - ra_size); +#endif ra_addup_cache_hit(ra); ra->ra_index = ra->readahead_index; ra->la_index = ra->lookahead_index; ra->readahead_index += ra_size; ra->lookahead_index = ra->readahead_index - la_size; - ra->nr_page_aging = nr_page_aging(); + ra->age = nr_smooth_aging(); } /* @@ -866,16 +1085,7 @@ static int ra_dispatch(struct file_ra_st unsigned long ra_size; unsigned long la_size; int actual; - enum file_ra_class ra_class; - static char *ra_class_name[] = { - "newfile", - "state", - "context", - "contexta", - "backward", - /* "around", */ - "random", - }; + enum ra_class ra_class; ra_class = (ra->flags & RA_CLASS_MASK); BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); @@ -885,10 +1095,8 @@ static int ra_dispatch(struct file_ra_st la_size = ra->readahead_index - ra->lookahead_index; /* Snap to EOF. */ - if (unlikely(ra->ra_index >= eof_index)) { - inc_page_state(readahead_end); + if (unlikely(ra->ra_index >= eof_index)) return 0; - } if (ra->readahead_index + ra_size / 2 > eof_index) { if (ra_class == RA_CLASS_CONTEXT_ACCELERATED && eof_index > ra->lookahead_index + 1) @@ -902,12 +1110,16 @@ static int ra_dispatch(struct file_ra_st actual = __do_page_cache_readahead(mapping, filp, ra->ra_index, ra_size, la_size); - if (!la_size && ra->readahead_index == eof_index) - ra_account_page(ra, readahead_eof, actual); - ra_account_page(ra, readahead, actual); + if (ra->readahead_index == eof_index) + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual); + if (la_size) + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size); + if (ra_size > actual) + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual); + ra_account(ra, RA_EVENT_READAHEAD, actual); dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", - ra_class_name[ra_class - 1], + ra_class_name[ra_class], mapping->host->i_ino, ra->la_index, ra->ra_index, ra_size, la_size, actual); @@ -958,7 +1170,7 @@ static inline int adjust_rala(unsigned l * It is returned to make the next read-ahead request. * 2. the remained space for the current chunk * It will be checked to ensure that the current chunk is safe. 
- * + * * The computation will be pretty accurate under heavy load, and will change * vastly with light load(small global_shift), so the grow speed of ra_size * must be limited, and a moderate large stream_shift must be insured. @@ -986,11 +1198,11 @@ static inline unsigned long compute_thra unsigned long ra_size; global_size = nr_free_inactive(); - global_shift = nr_page_aging() - ra->nr_page_aging; + global_shift = nr_smooth_aging() - ra->age; stream_shift = ra_cache_hit(ra, 0); ra_size = stream_shift * - global_size * readahead_ratio / 100 / global_shift; + global_size * readahead_ratio / (100 * global_shift); if (global_size > global_shift) *remain = stream_shift * @@ -1006,7 +1218,7 @@ static inline unsigned long compute_thra return ra_size; } -/* +/* * Main function for file_ra_state based read-ahead. */ static inline unsigned long @@ -1023,7 +1235,7 @@ state_based_readahead(struct address_spa ra_old = ra->readahead_index - ra->ra_index; ra_size = compute_thrashing_threshold(ra, &remain_space); - if (readahead_ratio < 80 && + if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO && remain_space <= la_size && la_size > 1) { rescue_pages(page, la_size); return 0; @@ -1039,7 +1251,6 @@ state_based_readahead(struct address_spa return ra_dispatch(ra, mapping, filp); } - /* * Page cache context based estimation of read-ahead/look-ahead size/index. * @@ -1048,11 +1259,11 @@ state_based_readahead(struct address_spa * the start point of next read-ahead. * * The estimation theory can be illustrated with figure: - * + * * chunk A chunk B chunk C head * * l01 l11 l12 l21 l22 - *| |-->|-->| |------>|-->| |------>| + *| |-->|-->| |------>|-->| |------>| *| +-------+ +-----------+ +-------------+ | *| | # | | # | | # | | *| +-------+ +-----------+ +-------------+ | @@ -1075,33 +1286,22 @@ state_based_readahead(struct address_spa * a lower estimation of the true thrashing-threshold. */ -#if PG_activate < PG_referenced -#error unexpected page flags order -#endif - -#define PAGE_REFCNT_1 (1 << PG_referenced) -#define PAGE_REFCNT_2 (1 << PG_activate) -#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) -#define PAGE_REFCNT_MASK PAGE_REFCNT_3 /* * STATUS REFERENCE COUNT TYPE - * __ - not in inactive list - * __ 0 fresh - * _R PAGE_REFCNT_1 stale - * A_ PAGE_REFCNT_2 disturbed once - * AR PAGE_REFCNT_3 disturbed twice + * A__ 0 not in inactive list + * ___ 0 fresh + * __R PAGE_REFCNT_1 stale + * _a_ PAGE_REFCNT_2 disturbed once + * _aR PAGE_REFCNT_3 disturbed twice + * + * A/a/R: Active / aCTIVATE / Referenced */ -static inline unsigned long __page_refcnt(struct page *page) -{ - return page->flags & PAGE_REFCNT_MASK; -} - -static inline unsigned long page_refcnt(struct page *page) +static inline unsigned long cold_page_refcnt(struct page *page) { if (!page || PageActive(page)) return 0; - return __page_refcnt(page); + return page_refcnt(page); } static inline char page_refcnt_symbol(struct page *page) @@ -1110,7 +1310,7 @@ static inline char page_refcnt_symbol(st return 'X'; if (PageActive(page)) return 'A'; - switch (__page_refcnt(page)) { + switch (page_refcnt(page)) { case 0: return '_'; case PAGE_REFCNT_1: @@ -1124,201 +1324,177 @@ static inline char page_refcnt_symbol(st } /* - * Look back and count history pages to estimate thrashing-threshold. - * - * Strategies - * - Sequential read that extends from index 0 - * The counted value may well be far under the true threshold, so return - * it unmodified for further process in adjust_rala_accelerated(). 
- * - Sequential read with a large history count - * Check 3 evenly spread pages to be sure there is no hole or many - * not-yet-accessed pages. This prevents unnecessary IO, and allows some - * almost sequential patterns to survive. - * - Return equal or smaller count; but ensure a reasonable minimal value. - * - * Optimization - * - The count will normally be min(nr_lookback, offset), unless either memory - * or read speed is low, or it is still in grow up phase. - * - A rigid implementation would be a simple loop to scan page by page - * backward, though this may be unnecessary and inefficient, so the - * stepping backward/forward scheme is used. - * - * FIXME: it seems ugly :( - */ -static int count_sequential_pages(struct address_space *mapping, - int refcnt, unsigned long *remain, - unsigned long offset, + * Count/estimate cache hits in range [first_index, last_index]. + * The estimation is simple and a bit optimistic. + */ +static int count_cache_hit(struct address_space *mapping, + unsigned long first_index, unsigned long last_index) +{ + static int steps[8] = {0, 4, 2, 6, 1, 3, 5, 7}; + struct page *page; + int size = last_index - first_index + 1; + int count = 0; + int i; + + read_lock_irq(&mapping->tree_lock); + + for (i = 0; i < 8;) { + page = __find_page(mapping, + first_index + size * steps[i++] / 8); + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) + break; + } + + read_unlock_irq(&mapping->tree_lock); + + return size * count / i; +} + +/* + * Look back and check history pages to estimate thrashing-threshold. + */ +static int query_page_cache(struct address_space *mapping, + unsigned long *remain, unsigned long offset, unsigned long ra_min, unsigned long ra_max) { int step; int count; unsigned long index; unsigned long nr_lookback; - struct page *page; - struct radix_tree_cache cache; + struct radix_tree_cache cache; - *remain = 0; - nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * - 100 / (readahead_ratio + 1); - if (nr_lookback > offset) - nr_lookback = offset; - if (nr_lookback > mapping->nrpages) - nr_lookback = mapping->nrpages; - - if (nr_lookback <= ra_min * 100 / (readahead_ratio + 1)) { - *remain = nr_lookback; - return ra_min; + /* + * Scan backward and check the near @ra_max pages. + * The count here determines ra_size. 
+ */ + read_lock_irq(&mapping->tree_lock); + index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max); + read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (index <= offset) { + WARN_ON(!find_page(mapping, index)); + if (index + ra_max > offset) + WARN_ON(find_page(mapping, index - 1)); + } else { + BUG_ON(index > offset + 1); + WARN_ON(find_page(mapping, offset)); } +#endif - radix_tree_cache_init(&cache); - read_lock_irq(&mapping->tree_lock); + *remain = offset - index + 1; - /* check the far end first */ - index = offset - nr_lookback; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, index); - if (page_refcnt(page) >= refcnt) { - step = 1 + nr_lookback / 3; - if(nr_lookback > ra_min * 8) { - count = 1; - goto check_more; - } else { - *remain = nr_lookback; - goto out_unlock; - } + if (unlikely(*remain <= ra_min)) { + count = ra_min; + goto out; } - /* scan backward for non-present page */ - count = 0; /* just to make gcc happy */ - for(step = ra_min; step < nr_lookback; step *= 4) { - index = offset - step; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index); - if (!page) - goto check_more; - } - index = offset - nr_lookback; - page = NULL; + count = count_cache_hit(mapping, index, offset); + if (count < ra_min) + count = ra_min; + if (unlikely(count * 2 < offset - index)) + goto out; - /* scan forward and check some more pages */ -check_more: - for(;;) { - if (page && !*remain) - *remain = offset - index; - if (page_refcnt(page) < refcnt) { - count = 0; - step = (offset - index + 3) / 4; - } else if (++count >= 3 || step < ra_min) - break; - index += step; - if (index >= offset) + if (*remain < ra_max) + goto out; + + /* + * Check the far pages coarsely. + * The big count here helps increase la_size. + */ + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + if (nr_lookback > offset) + nr_lookback = offset; + + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (step = 2 * ra_max; step < nr_lookback; step += ra_max) { + struct radix_tree_node *node; + node = radix_tree_cache_lookup_node(&mapping->page_tree, + &cache, offset - step, 1); + if (!node) break; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (node != radix_tree_lookup_node(&mapping->page_tree, + offset - step, 1)) { + read_unlock_irq(&mapping->tree_lock); + printk(KERN_ERR "check radix_tree_cache_lookup_node!\n"); + return 1; + } +#endif } -out_unlock: read_unlock_irq(&mapping->tree_lock); - count = 3 * step; - if (count > nr_lookback) - return nr_lookback; - - if (!*remain) - *remain = count; - - count = count * readahead_ratio / 100; - if (count < get_min_readahead(NULL)) - count = get_min_readahead(NULL); + /* + * For sequential read that extends from index 0, the counted value + * may well be far under the true threshold, so return it unmodified + * for further process in adjust_rala_accelerated(). + */ + if (step < offset) + count = step * readahead_ratio / 100; + else + count = offset; +out: return count; } /* - * Scan forward in inactive_list for the first non-present page. - * It takes advantage of the adjacency of pages in inactive_list. + * Scan backward in the file for the first non-present page. 
*/ -static unsigned long lru_scan_forward(struct page *page, int nr_pages) +static inline unsigned long first_absent_page_bw(struct address_space *mapping, + unsigned long index, unsigned long max_scan) { - unsigned long index = page->index; - struct address_space *mapping = page_mapping(page); - struct zone *zone; - - for(;;) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - - if (!PageLRU(page)) - goto out; + struct radix_tree_cache cache; + struct page *page; + unsigned long origin; - do { + origin = index; + if (max_scan > index) + max_scan = index; + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (;;) { + page = radix_tree_cache_lookup(&mapping->page_tree, + &cache, --index); + if (page) { index++; - if (!--nr_pages) - goto out; - page = next_page(page); - } while (page_mapping(page) == mapping && page->index == index); - - spin_unlock_irq(&zone->lru_lock); - - page = find_page(mapping, index); - if (!page) - return index; + break; + } + if (origin - index > max_scan) + break; } -out: - spin_unlock_irq(&zone->lru_lock); - return nr_pages ? index : 0; + read_unlock_irq(&mapping->tree_lock); + + return index; } -/* Directly calling lru_scan_forward() would be slow. - * This function tries to avoid unnecessary scans for the most common cases: - * - Slow reads should scan forward directly; - * - Fast reads should step backward first; - * - Aggressive reads may well have max allowed look-ahead size. - */ -static unsigned long first_absent_page(struct address_space *mapping, - struct page *page, unsigned long index, - unsigned long ra_size, unsigned long ra_max) +/* + * Scan forward in the file for the first non-present page. + */ +static inline unsigned long first_absent_page(struct address_space *mapping, + unsigned long index, unsigned long max_scan) { - struct radix_tree_cache cache; - - if (ra_size < ra_max) - goto scan_forward; + unsigned long ra_index; - radix_tree_cache_init(&cache); read_lock_irq(&mapping->tree_lock); + ra_index = radix_tree_lookup_tail(&mapping->page_tree, + index + 1, max_scan); + read_unlock_irq(&mapping->tree_lock); - if (ra_size < LOOKAHEAD_RATIO * ra_max) - goto scan_backward; - - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_max); - if (page) { - read_unlock_irq(&mapping->tree_lock); - return 0; - } - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_max - 1); - if (page) { - read_unlock_irq(&mapping->tree_lock); - return index + ra_max; +#ifdef DEBUG_READAHEAD_RADIXTREE + BUG_ON(ra_index <= index); + if (index + max_scan > index) { + if (ra_index <= index + max_scan) + WARN_ON(find_page(mapping, ra_index)); + WARN_ON(!find_page(mapping, ra_index - 1)); } +#endif -scan_backward: - if (ra_size == index) - ra_size /= 4; + if (ra_index <= index + max_scan) + return ra_index; else - ra_size /= (LOOKAHEAD_RATIO * 2); - for(;; ra_size /= 2) { - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_size); - if (page) - break; - if (!ra_size) - return index + 1; - } - read_unlock_irq(&mapping->tree_lock); - ra_size = ra_max; - -scan_forward: - return lru_scan_forward(page, ra_size + 1); + return 0; } /* @@ -1350,7 +1526,7 @@ static inline int adjust_rala_accelerate return 1; } -/* +/* * Main function for page context based read-ahead. 
*/ static inline int @@ -1364,50 +1540,44 @@ try_context_based_readahead(struct addre unsigned long ra_size; unsigned long la_size; unsigned long remain_pages; - unsigned long ret; - int refcnt; - /* NFSv3 daemons may process adjecent requests in parallel, + /* Where to start read-ahead? + * NFSv3 daemons may process adjecent requests in parallel, * leading to many locally disordered, globally sequential reads. - * So do not require nearby history pages to be accessed, present is - * enough. + * So do not require nearby history pages to be present or accessed. */ - if (!prev_page) - return 0; - - refcnt = page_refcnt(prev_page); - if (refcnt < PAGE_REFCNT_1) - refcnt = PAGE_REFCNT_1; - - ra_size = count_sequential_pages(mapping, refcnt, - &remain_pages, index, ra_min, ra_max); - - /* Where to start read-ahead? */ - if (!page) - ra_index = index; - else { - ra_index = first_absent_page( - mapping, page, index, ra_size, ra_max); + if (page) { + ra_index = first_absent_page(mapping, index, ra_max * 5 / 4); if (unlikely(!ra_index)) return -1; - } + } else if (!prev_page) { + ra_index = first_absent_page_bw(mapping, index, ra_min); + if (index - ra_index > ra_min) + return 0; + ra_min += index - ra_index; + index = ra_index; + } else + ra_index = index; + + ra_size = query_page_cache(mapping, &remain_pages, + index - 1, ra_min, ra_max); la_size = ra_index - index; - if (readahead_ratio < 80 && + if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO && remain_pages <= la_size && la_size > 1) { rescue_pages(page, la_size); return -1; } if (ra_size == index) { - ret = adjust_rala_accelerated(ra_max, &ra_size, &la_size); + if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size)) + return -1; set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED); } else { - ret = adjust_rala(ra_max, &ra_size, &la_size); + if (!adjust_rala(ra_max, &ra_size, &la_size)) + return -1; set_ra_class(ra, RA_CLASS_CONTEXT); } - if (unlikely(!ret)) - return -1; ra_state_init(ra, index, ra_index); ra_state_update(ra, ra_size, la_size); @@ -1455,31 +1625,33 @@ newfile_readahead(struct address_space * */ static inline int try_read_backward(struct file_ra_state *ra, - unsigned long first_index, unsigned long last_index, - unsigned long ra_size, unsigned long ra_max) + unsigned long begin_index, unsigned long end_index, + unsigned long ra_size, + unsigned long ra_min, unsigned long ra_max) { - if (ra_size > ra_max) + if (ra_size > ra_max || end_index > ra->prev_page) return 0; if (ra_has_index(ra, ra->prev_page)) { + if (end_index > ra->la_index) + return 0; ra_size += 2 * ra_cache_hit(ra, 0); - last_index = ra->la_index; + end_index = ra->la_index; } else { - ra_size = 4 * ra_size; - last_index = ra->prev_page; + ra_size += ra_min; + end_index = ra->prev_page; } if (ra_size > ra_max) ra_size = ra_max; - if (last_index < first_index || - last_index > first_index + ra_size) + if (end_index > begin_index + ra_size) return 0; - first_index = last_index - ra_size; + begin_index = end_index - ra_size; set_ra_class(ra, RA_CLASS_BACKWARD); - ra_state_init(ra, first_index, first_index); + ra_state_init(ra, begin_index, begin_index); ra_state_update(ra, ra_size, 0); return 1; @@ -1488,7 +1660,7 @@ try_read_backward(struct file_ra_state * /* * If there is a previous sequential read, it is likely to be another * sequential read at the new position. - * Databases are known to have the seek-and-read-one-record pattern. + * Databases are known to have this seek-and-read-one-record pattern. 
*/ static inline int try_random_readahead(struct file_ra_state *ra, unsigned long index, @@ -1502,17 +1674,23 @@ try_random_readahead(struct file_ra_stat if (!ra_has_index(ra, ra->prev_page)) return 0; - if (index == ra->prev_page + 1) /* read after thrashing */ + if (index == ra->prev_page + 1) { /* read after thrashing */ ra_size = hit0; - else if (ra_size < hit1 && /* read after seeking */ + set_ra_class(ra, RA_CLASS_RANDOM_THRASHING); + ra_account(ra, RA_EVENT_READAHEAD_THRASHING, + ra->readahead_index - index); + } else if (ra_size < hit1 && /* read after seeking */ hit1 > hit2 / 2 && hit2 > hit3 / 2 && - hit3 > hit1 / 2) - ra_size = min(ra_max, hit1); - else + hit3 > hit1 / 2) { + ra_size = max(hit1, hit2); + set_ra_class(ra, RA_CLASS_RANDOM_SEEK); + } else return 0; - set_ra_class(ra, RA_CLASS_RANDOM); + if (ra_size > ra_max) + ra_size = ra_max; + ra_state_init(ra, index, index); ra_state_update(ra, ra_size, 0); @@ -1522,13 +1700,12 @@ try_random_readahead(struct file_ra_stat /* * ra_size is mainly determined by: * 1. sequential-start: min(KB(16 + mem_mb/16), KB(64)) - * 2. sequential-max: min(KB(64 + mem_mb*64), KB(2048)) + * 2. sequential-max: min(ra->ra_pages, KB(262140)) * 3. sequential: (thrashing-threshold) * readahead_ratio / 100 * * Table of concrete numbers for 4KB page size: * (inactive + free) (in MB): 4 8 16 32 64 128 256 512 1024 * initial ra_size (in KB): 16 16 16 16 20 24 32 48 64 - * max ra_size (in KB): 320 576 1088 2048 2048 2048 2048 2048 2048 */ static inline void get_readahead_bounds(struct file_ra_state *ra, unsigned long *ra_min, @@ -1538,31 +1715,54 @@ static inline void get_readahead_bounds( #define KB(size) (((size) * 1024) / PAGE_CACHE_SIZE) mem_mb = nr_free_inactive() * PAGE_CACHE_SIZE / 1024 / 1024; - *ra_max = min(min(KB(64 + mem_mb*64), KB(2048)), ra->ra_pages); + *ra_max = min(ra->ra_pages, KB(262140)); *ra_min = min(min(KB(VM_MIN_READAHEAD + mem_mb/16), KB(128)), *ra_max/2); #undef KB } -/* +/* + * Set a new look-ahead mark at @new_index. + */ +void renew_lookahead(struct address_space *mapping, + struct file_ra_state *ra, + unsigned long index, unsigned long new_index) +{ + struct page *page; + + if (index == ra->lookahead_index && + new_index >= ra->readahead_index) + return; + + page = find_page(mapping, new_index); + if (!page) + return; + + SetPageReadahead(page); + if (ra->lookahead_index == index) + ra->lookahead_index = new_index; +} + +/* * This is the entry point of the adaptive read-ahead logic. * * It is only called on two conditions: * 1. page == NULL * A cache miss happened, it can be either a random read or a sequential one. - * 2. page != NULL + * 2. page != NULL * There is a look-ahead mark(PG_readahead) from a previous sequential read. * It's time to do some checking and submit the next read-ahead IO. * - * That makes both methods happy, and lives in harmony with application managed - * read-aheads via fadvise() / madvise(). The cache hit problem is also - * eliminated naturally. + * That has the merits of: + * - makes all stateful/stateless methods happy; + * - eliminates the cache hit problem naturally; + * - lives in harmony with application managed read-aheads via fadvise/madvise. 
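
For a concrete feel of the bounds above, get_readahead_bounds() can be restated in user space. A small sketch assuming 4KB pages, with mem_mb standing for the (inactive + free) memory in megabytes as in the table; readahead_bounds() is illustrative, not the kernel function, and the call below assumes the common 128KB default window (ra_pages = 32) on a box with 1GB of inactive+free memory:

#include <stdio.h>

#define PAGE_CACHE_SIZE	4096UL
#define KB(size)	(((size) * 1024) / PAGE_CACHE_SIZE)
#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* Illustrative restatement of get_readahead_bounds(). */
static void readahead_bounds(unsigned long ra_pages, unsigned long mem_mb,
			     unsigned long *ra_min, unsigned long *ra_max)
{
	*ra_max = MIN(ra_pages, KB(262140));
	*ra_min = MIN(MIN(KB(16 + mem_mb / 16), KB(128)), *ra_max / 2);
}

int main(void)
{
	unsigned long ra_min, ra_max;

	readahead_bounds(32, 1024, &ra_min, &ra_max);
	/* 1GB box: ra_min = 16 pages (64KB), ra_max = 32 pages (128KB) */
	printf("ra_min=%lu ra_max=%lu\n", ra_min, ra_max);
	return 0;
}
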
*/ unsigned long page_cache_readahead_adaptive(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, struct page *prev_page, struct page *page, - unsigned long first_index, - unsigned long index, unsigned long last_index) + unsigned long begin_index, + unsigned long index, unsigned long end_index) { unsigned long size; unsigned long ra_min; @@ -1572,16 +1772,24 @@ page_cache_readahead_adaptive(struct add if (page) { if(!TestClearPageReadahead(page)) return 0; - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(mapping->backing_dev_info)) { + ra_account(ra, RA_EVENT_IO_CONGESTION, + end_index - index); return 0; + } + if (laptop_mode && laptop_spinned_down()) { + renew_lookahead(mapping, ra, index, index + 32); + return 0; + } } if (page) - ra_account(readahead_return, ra->flags & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, + ra->readahead_index - ra->lookahead_index); else if (index) - inc_page_state(cache_miss); + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index); - size = last_index - index; + size = end_index - index; get_readahead_bounds(ra, &ra_min, &ra_max); /* readahead disabled? */ @@ -1594,7 +1802,7 @@ page_cache_readahead_adaptive(struct add * Start of file. */ if (index == 0) - return newfile_readahead(mapping, filp, ra, last_index, ra_min); + return newfile_readahead(mapping, filp, ra, end_index, ra_min); /* * State based sequential read-ahead. @@ -1603,20 +1811,18 @@ page_cache_readahead_adaptive(struct add index == ra->lookahead_index && (page || index == ra->readahead_index) && (ra_cache_hit_ok(ra) || - last_index - first_index >= ra_max)) + end_index - begin_index >= ra_max)) return state_based_readahead(mapping, filp, ra, page, ra_max); /* * Backward read-ahead. */ - if (try_read_backward(ra, first_index, last_index, size, ra_max)) + if (try_read_backward(ra, begin_index, end_index, size, ra_min, ra_max)) return ra_dispatch(ra, mapping, filp); - /* + /* * Context based sequential read-ahead. - */ - if (!prev_page) - prev_page = find_page(mapping, index - 1); + */ ret = try_context_based_readahead(mapping, ra, prev_page, page, index, ra_min, ra_max); if (ret > 0) @@ -1624,7 +1830,7 @@ page_cache_readahead_adaptive(struct add if (ret < 0) return 0; - /* No action on look ahead time ? */ + /* No action on look ahead time? */ if (page) return 0; @@ -1643,12 +1849,10 @@ page_cache_readahead_adaptive(struct add readit: size = __do_page_cache_readahead(mapping, filp, index, size, 0); - inc_page_state(readrandom); - mod_page_state(pgreadrandom, size); - + ra_account(ra, RA_EVENT_READRANDOM, size); dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", mapping->host->i_ino, mapping->nrpages, - first_index, index, last_index, size); + begin_index, index, end_index, size); return size; } @@ -1663,148 +1867,99 @@ void fastcall ra_access(struct file_ra_s (1 << PG_referenced))) return; - if (!ra_has_index(ra, page->index)) + if (ra_has_index(ra, page->index)) { + if (PageLocked(page)) + ra_account(ra, RA_EVENT_IO_BLOCK, + ra->readahead_index - page->index); + } else { + if (PageLocked(page)) + ra_account(0, RA_EVENT_IO_BLOCK, 1); return; + } ra->cache_hit++; if (page->index >= ra->ra_index) - ra_account(pgreadahead_hit, ra->flags & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); else - ra_account(pgreadahead_hit, - (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); } /* - * Detect and protect sequential read-ahead pages. 
- *
- * The safty guarantee provided by this function is only needed in file servers
- * with big readahead_ratio set.
+ * Detect and protect live read-ahead pages.
+ *
+ * This function provides a safety guarantee for file servers with big
+ * readahead_ratio(>=VM_READAHEAD_PROTECT_RATIO) set. The goal is to save all
+ * and only the sequential pages that are to be accessed in the near future.
 *
 * This function is called when pages in @page_list are to be freed,
- * it protects ra pages by moving them into @save_list.
+ * it protects live read-ahead pages by moving them into @save_list.
 *
 * The general idea is to classify pages of a file into random pages and groups
- * of sequential accessed pages. Random pages and leading segments of
- * sequential pages are left over, following sequential pages are saved.
+ * of sequential accessed pages. Random pages and dead sequential pages are
+ * left over, live sequential pages are saved.
+ *
+ * Live read-ahead pages are defined as sequential pages that have reading in
+ * progress. They are detected by a reference count pattern of:
+ *
+ *                        live head       live pages
+ * ra pages group -->  ------------___________________
+ *                              [ pages to save ]  (*)
 *
- * The algorithm must ensure:
+ * (*) for now, an extra page from the live head may also be saved.
+ *
+ * In practice, the group of pages is fragmented into chunks. To tell whether
+ * pages inside a chunk are alive, we must check:
+ *	1) Are there any live heads inside the chunk?
+ *	2) Are there any live heads in the group before the chunk?
+ *	3) Special case: a live head just sits on the boundary of the current chunk?
+ *
+ * The detailed rules employed must ensure:
 * - no page is pinned in inactive_list.
 * - no excessive pages are saved.
 *
- * chunk	- a list of pages belong to the same file
- * rs/ra pages	- a chunk of pages that was read/to be read sequentially
- *		  Detected by ascending index and (almost) non-descending
- *		  reference count. rs pages have greater reference count than
- *		  following ra pages. A page can be both rs/ra page, which
- *		  indicates there are two adjacent readers.
- * live ra pages - ra pages that have reading in progress
- *		  Detected by having leading rs pages(either in page_list or in
- *		  inactive_list), or limited ra pages(may be in another zone,
- *		  just had their rs pages dropped).
- * dead ra pages - ra pages that seems to have no imminent reader
- *		  Note that they are not necessarily dead: either the cost of
- *		  search the leading rs pages or the cost of keeping them in
- *		  memory is large, so they are abandoned.
- *		  Leading rs pages are detected and handled the same way.
- *
- * Live ra pages are saved, pure/leading rs pages and dead ra pages are left
- * over and eligible for free.
- * - * The rules apply to the following common cases: - * keep head back search chunk case - * Y ----____________|______________________ Normal - * ----------------|----__________________ Normal - * |----__________________ Normal + * A picture of common cases: + * back search chunk case + * -----___________|[____________________] Normal + * ----------------|----[________________] Normal + * |----[________________] Normal * ----------------|---------------------- Normal * |---------------------- Normal - * y ________________|______________________ cache miss - * |______________________ cache miss - * y ________________|_______--------_______ two readers - * Y ----____________|_______--------_______ two readers - * |_______--------_______ two readers - * |----_____------_______ two readers - * ----------------|----_____------_______ two readers - * _______---------|---------------_______ two readers - * Y ----___---------|---------------_______ two readers - * ________________|---------------_______ two readers - * Y ----____________|---------------_______ two readers - * Y ====------------|----__________________ two readers - * N |====-----------_______ two readers - * N |###======------------- three readers - * Y: saved by leading rs pages - * y: saved by limited leading ra pages - * N: to be activated anyway - * - * To make it run smooth and fast, ra request boundary must be reserved: - * - alloc pages of a chunk from one single zone - * - insert pages into lru at one time - * - make vmscan code aware of chunk boundaries - * - * Read backward pattern support is possible, in which case the pages are - * better pushed into lru in reverse order. + * ________________|______________________ ra miss + * |______________________ ra miss + * ________________|_______--------[_____] two readers + * ----____________|[______--------______] two readers + * |_______--------[_____] two readers + * |----[____------______] two readers + * ----------------|----[____------______] two readers + * _______---------|---------------[_____] two readers + * ----___---------|[--------------______] two readers + * ________________|---------------[_____] two readers + * ----____________|[--------------______] two readers + * ====------------|[---_________________] two readers + * |====[----------______] two readers + * |###======[-----------] three readers + * + * Read backward pattern support is possible, in which case the pages should be + * pushed into inactive_list in reverse order. + * + * The two special cases are awkwardly delt with for now. They will be all set + * when the timing information of recently evicted pages are available. + * Dead pages can also be purged earlier with the timing info. 
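
The reference count pattern described above boils down to a small scan: walk a chunk's refcounts, and the first drop that follows a flat run marks the live head; pages from there on are worth saving. A stand-alone sketch of that rule with hypothetical refcount values (find_live_head() condenses the n-counting in rescue_ra_pages() further below and is not the kernel code itself):

#include <stdio.h>

/*
 * Illustrative only: return the index of the live head in one chunk, or -1
 * if no live head is found.  The live head is taken as the last page of the
 * flat high-refcount run, so one extra page may be saved, as noted above.
 */
static int find_live_head(const unsigned int *refcnt, int nr_pages)
{
	int i, flat_run = 0;

	for (i = 1; i < nr_pages; i++) {
		if (refcnt[i] == refcnt[i - 1])
			flat_run++;		/* reading front extends */
		else if (refcnt[i] > refcnt[i - 1])
			flat_run = 0;		/* still ascending */
		else if (flat_run >= 1)
			return i - 1;		/* drop after a flat run: live head */
		else
			return -1;		/* immediate drop: treated as dead here */
	}
	return -1;				/* no drop found: no live head */
}

int main(void)
{
	/* a reading front of refcount 3 followed by untouched readahead pages */
	unsigned int chunk[] = { 3, 3, 3, 0, 0, 0, 0, 0 };
	int n = (int)(sizeof(chunk) / sizeof(chunk[0]));

	printf("live head at index %d\n", find_live_head(chunk, n));	/* -> 2 */
	return 0;
}
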
*/ -int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +static int save_chunk(struct page *head, struct page *live_head, + struct page *tail, struct list_head *save_list) { - struct address_space *mapping; - struct page *chunk_head; struct page *page; - unsigned long refcnt; - unsigned long index; - int ascend_count; - int ret = 0; - - page = list_to_page(page_list); - -next_chunk: - chunk_head = page; - mapping = page_mapping(page); - ascend_count = 0; - -next_page: - index = page->index; - refcnt = __page_refcnt(page); - page = next_page(page); - - if (&page->lru == page_list) - goto save_chunk; - - if (mapping == page_mapping(page) && page->index > index) { - if (refcnt < __page_refcnt(page)) - ascend_count++; - goto next_page; - } - -save_chunk: - if (mapping && !PageSwapCache(page) && - !page_mapped(page) && - ascend_count <= 3 && - (!refcnt || index >= chunk_head->index + 8)) - ret += save_chunk(chunk_head, page, save_list); - - if (&page->lru != page_list) - goto next_chunk; - - if (ret) - mod_page_state(pgreadahead_rescue, ret); - - return ret; -} - -int save_chunk(struct page *head, struct page *tail, - struct list_head *save_list) -{ - struct page *page; - struct page *next_page; - struct address_space *mapping = page_mapping(head); + struct address_space *mapping; struct radix_tree_cache cache; int i; - int keep_head; - unsigned long index = head->index; - unsigned long refcnt = __page_refcnt(head); + unsigned long index; + unsigned long refcnt; + #ifdef DEBUG_READAHEAD static char static_buf[PAGE_SIZE]; - static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; + static char *zone_names[] = {"DMA", "DMA32", "Normal", "HighMem"}; char *pat = static_buf; unsigned long pidx = PAGE_SIZE / 2; @@ -1815,63 +1970,113 @@ int save_chunk(struct page *head, struct } #endif - /* The leading pages are going to be activated anyway? */ - keep_head = 0; - if (refcnt > PAGE_REFCNT_1) - goto drop_head; - if (refcnt >= PAGE_REFCNT_1 && mapping_mapped(mapping)) - goto drop_head; - - /* Scan backward to see if leading pages should be saved. */ +#define LIVE_PAGE_SCAN (4 * MAX_RA_PAGES) + index = head->index; + refcnt = page_refcnt(head); + mapping = head->mapping; radix_tree_cache_init(&cache); + + BUG_ON(!mapping); /* QUESTION: in what case mapping will be NULL ? */ read_lock_irq(&mapping->tree_lock); - for (i = 2 * mapping->backing_dev_info->ra_pages; i >= 0; i--) { + + /* + * Common case test: + * Does the far end indicates a leading live head? + */ + index = radix_tree_lookup_head(&mapping->page_tree, + index, LIVE_PAGE_SCAN); + page = __find_page(mapping, index); + if (cold_page_refcnt(page) > refcnt) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) { + pat[--pidx] = '.'; + pat[--pidx] = '.'; + pat[--pidx] = '.'; + pat[--pidx] = page_refcnt_symbol(page); + pat[--pidx] = '|'; + } +#endif + live_head = head; + goto skip_scan_locked; + } + + /* + * Special case 1: + * If @head is a live head, rescue_ra_pages() will not detect it. + * Check it here. 
+ */ + index = head->index; + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, --index); + if (!page || PageActive(page)) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) + pat[--pidx] = page_refcnt_symbol(page); +#endif + goto skip_scan_locked; + } + if (refcnt > page_refcnt(next_page(head)) && + page_refcnt(page) > page_refcnt(next_page(head))) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) + pat[--pidx] = page_refcnt_symbol(page); +#endif + live_head = head; + goto skip_scan_locked; + } + + /* + * Scan backward to see if the whole chunk should be saved. + * It can be costly. But can be made rare in future. + */ + for (i = LIVE_PAGE_SCAN; i >= 0; i--) { page = radix_tree_cache_lookup(&mapping->page_tree, &cache, --index); #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3 && pidx) pat[--pidx] = page_refcnt_symbol(page); #endif - /* Having limited leading ra pages is required now. It will - * be less important if ra request boundaries are reserved. - */ - if (!page) { - if (i > mapping->backing_dev_info->ra_pages && - index != head->index - 1 && - !__page_refcnt(head)) - keep_head = 1; + + if (!page) break; - } /* Avoid being pinned by active page. */ - if (PageActive(page)) + if (unlikely(PageActive(page))) break; - /* A trick to speed things up, must be placed after the - * active page test. This check may be removed when chunk - * boundaries are reserved. - */ - if ((index & 63) == 63 && !__page_refcnt(head) && - i > mapping->backing_dev_info->ra_pages && - radix_tree_cache_count(&cache) < - index - radix_tree_cache_first_index(&cache)) { -#ifdef DEBUG_READAHEAD - if ((readahead_ratio & 3) == 3 && pidx) - pat[--pidx] = '|'; -#endif - keep_head = 1; + if (page_refcnt(page) > refcnt) { /* So we are alive! */ + live_head = head; break; } - if (__page_refcnt(page) > refcnt) { /* so they are live pages */ - keep_head = 1; - break; - } - refcnt = __page_refcnt(page); + refcnt = page_refcnt(page); + } + +skip_scan_locked: + /* + * Special case 2: + * Save one extra page if it is a live head of the following chunk. + * Just to be safe. It protects the rare situation when the reader + * is just crossing the chunk boundary, and the following chunk is not + * far away from tail of inactive_list. + */ + if (live_head != head) { + struct page *last_page = prev_page(tail); + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + last_page->index + 1); + if (page && !live_head) { + refcnt = page_refcnt(last_page); + if (page_refcnt(page) >= refcnt) + page = radix_tree_cache_lookup( + &mapping->page_tree, &cache, + last_page->index + 2); + if (page && page_refcnt(page) < refcnt) + live_head = last_page; + } else if (!page && live_head) + live_head = next_page(live_head); } + read_unlock_irq(&mapping->tree_lock); -drop_head: #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3) { for (i = 0; pidx < PAGE_SIZE / 2;) @@ -1879,48 +2084,50 @@ drop_head: pat[i++] = '|'; for (page = head; page != tail; page = next_page(page)) { pidx = page->index; + if (page == live_head) + pat[i++] = '['; pat[i++] = page_refcnt_symbol(page); - if (i >= PAGE_SIZE - 1) + BUG_ON(PageAnon(page)); + BUG_ON(PageSwapCache(page)); + /* BUG_ON(page_mapped(page)); */ + if (i >= PAGE_SIZE - 2) break; } + if (live_head) + pat[i++] = ']'; pat[i] = 0; pat[PAGE_SIZE - 1] = 0; } #endif - /* Drop non-descending leading pages. */ - page = head; - if (!keep_head) { - refcnt = __page_refcnt(page); - while (page != tail && /* never dereference tail! 
*/ - refcnt <= __page_refcnt(page)) { - refcnt = __page_refcnt(page); - page = next_page(page); + /* + * Now save the alive pages. + */ + i = 0; + if (live_head) { + for (; live_head != tail;) { /* never dereference tail! */ + page = next_page(live_head); + if (!PageActivate(live_head)) { + if (!page_refcnt(live_head)) + __get_cpu_var(smooth_aging)++; + i++; + list_move(&live_head->lru, save_list); + } + live_head = page; } - } - /* Save the remaining pages. */ - for (i = 0; page != tail;) { - next_page = next_page(page); - if (!PageActivate(page)) { - i++; - list_move(&page->lru, save_list); - } - page = next_page; + if (i) + ra_account(0, RA_EVENT_READAHEAD_RESCUE, i); } - if (i) - inc_page_state(readahead_rescue); - #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3) { ddprintk("save_chunk(ino=%lu, idx=%lu-%lu-%lu, %s@%s:%s)" - " %s, save %d\n", + " = %d\n", mapping->host->i_ino, index, head->index, pidx, mapping_mapped(mapping) ? "mmap" : "file", - zone_names[page_zonenum(head)], pat, - keep_head ? "keephead" : "drophead", i); + zone_names[page_zonenum(head)], pat, i); if (pat != static_buf) free_page((unsigned long)pat); } @@ -1928,3 +2135,69 @@ drop_head: return i; } + +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +{ + struct address_space *mapping; + struct page *chunk_head; + struct page *live_head; + struct page *page; + unsigned long refcnt; + int n; + int ret = 0; + + page = list_to_page(page_list); + +next_chunk: + chunk_head = page; + live_head = NULL; + mapping = page->mapping; + n = 0; + +next_rs_page: + refcnt = page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + if (refcnt == page_refcnt(page)) + n++; + else if (refcnt < page_refcnt(page)) + n = 0; + else if (n < 1) + n = INT_MIN; + else + goto got_live_head; + + goto next_rs_page; + +got_live_head: + n = 0; + live_head = prev_page(page); + +next_page: + if (refcnt < page_refcnt(page)) + n++; + refcnt = page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + goto next_page; + +save_chunk: + if (mapping && !PageAnon(chunk_head) && + !PageSwapCache(chunk_head) && + /* !page_mapped(chunk_head) && */ + n <= 3 && + (!refcnt || + prev_page(page)->index >= chunk_head->index + 5)) + ret += save_chunk(chunk_head, live_head, page, save_list); + + if (&page->lru != page_list) + goto next_chunk; + + return ret; +} diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/swap.c linux-2.6.14-ck2/mm/swap.c --- linux-2.6.14-ck1/mm/swap.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/swap.c 2005-11-03 00:02:46.000000000 +1100 @@ -29,7 +29,6 @@ #include #include #include -#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -96,8 +95,6 @@ int rotate_reclaimable_page(struct page return 0; } -extern int readahead_ratio; - /* * FIXME: speed this up? */ @@ -115,25 +112,23 @@ void fastcall activate_page(struct page spin_unlock_irq(&zone->lru_lock); } +DECLARE_PER_CPU(unsigned long, smooth_aging); + /* * Mark a page as having seen activity. 
* * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * inactive,referenced -> activate,unreferenced + * activate,unreferenced -> activate,referenced */ void fastcall mark_page_accessed(struct page *page) { - if (!PageActive(page) && !PageActivate(page) && - PageReferenced(page) && PageLRU(page)) { - if (readahead_ratio > 9 || (readahead_ratio & 1)) { - page_zone(page)->nr_page_aging++; - SetPageActivate(page); - } else - activate_page(page); + if (!PageActivate(page) && PageReferenced(page) && PageLRU(page)) { + SetPageActivate(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); + __get_cpu_var(smooth_aging)++; } } @@ -306,7 +301,6 @@ void __pagevec_lru_add(struct pagevec *p if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; - update_page_age(zone); spin_lock_irq(&zone->lru_lock); } if (TestSetPageLRU(page)) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/vmscan.c linux-2.6.14-ck2/mm/vmscan.c --- linux-2.6.14-ck1/mm/vmscan.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/vmscan.c 2005-11-03 00:02:46.000000000 +1100 @@ -373,6 +373,7 @@ static pageout_t pageout(struct page *pa } extern int readahead_ratio; +DECLARE_PER_CPU(unsigned long, smooth_aging); /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed @@ -382,12 +383,13 @@ static int shrink_list(struct list_head LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; + int pgkeep = 0; int reclaimed = 0; cond_resched(); - if (readahead_ratio >= 80) - rescue_ra_pages(page_list, &ret_pages); + if (readahead_ratio >= VM_READAHEAD_PROTECT_RATIO) + pgkeep += rescue_ra_pages(page_list, &ret_pages); pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { @@ -416,6 +418,7 @@ static int shrink_list(struct list_head if (PageActivate(page)) { ClearPageActivate(page); + ClearPageReferenced(page); goto activate_locked; } @@ -423,6 +426,8 @@ static int shrink_list(struct list_head /* In active use or really unfreeable? Activate it. 
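
Taken together with the PageActivate handling in shrink_list() above, the transitions amount to a deferred-activation scheme: the first access only sets the referenced bit, the second flags the page for activation, and the actual move to the active list waits until reclaim scans the page. A compact model of those transitions, with plain flags standing in for the real page bits (page_model and both helpers are illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for the page flags involved. */
struct page_model {
	bool referenced;	/* PG_referenced */
	bool activate;		/* PG_activate   */
	bool active;		/* on active list */
};

/* Models mark_page_accessed() as patched above: activation is only
 * requested here; it happens later, at reclaim time. */
static void mark_accessed(struct page_model *p)
{
	if (!p->activate && p->referenced) {
		p->activate = true;
		p->referenced = false;
	} else if (!p->referenced) {
		p->referenced = true;
	}
}

/* Models the PageActivate check in shrink_list(): the deferred move. */
static void reclaim_visit(struct page_model *p)
{
	if (p->activate) {
		p->activate = false;
		p->referenced = false;
		p->active = true;	/* goto activate_locked */
	}
}

int main(void)
{
	struct page_model p = { false, false, false };

	mark_accessed(&p);	/* inactive,unreferenced -> inactive,referenced   */
	mark_accessed(&p);	/* inactive,referenced   -> activate,unreferenced */
	reclaim_visit(&p);	/* activated only once reclaim scans it */
	printf("active=%d\n", p.active);
	return 0;
}
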
*/ if (referenced && page_mapping_inuse(page)) goto activate_locked; + if (!referenced) + __get_cpu_var(smooth_aging)++; #ifdef CONFIG_SWAP /* @@ -564,11 +569,13 @@ keep_locked: keep: list_add(&page->lru, &ret_pages); BUG_ON(PageLRU(page)); + pgkeep++; } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); + mod_page_state(pgkeepcold, pgkeep - pgactivate); sc->nr_reclaimed += reclaimed; return reclaimed; } @@ -652,6 +659,7 @@ static void shrink_cache(struct zone *zo goto done; max_scan -= nr_scan; + update_page_age(zone, nr_scan); if (current_is_kswapd()) mod_page_state_zone(zone, pgscan_kswapd, nr_scan); else @@ -774,6 +782,7 @@ refill_inactive_zone(struct zone *zone, list_add(&page->lru, &l_active); continue; } + __get_cpu_var(smooth_aging)++; } list_add(&page->lru, &l_inactive); } @@ -792,7 +801,6 @@ refill_inactive_zone(struct zone *zone, pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; - zone->nr_page_aging += pgmoved; spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -802,7 +810,6 @@ refill_inactive_zone(struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - zone->nr_page_aging += pgmoved; zone->nr_inactive += pgmoved; pgdeactivate += pgmoved; if (buffer_heads_over_limit) { @@ -834,6 +841,7 @@ refill_inactive_zone(struct zone *zone, mod_page_state_zone(zone, pgrefill, pgscanned); mod_page_state(pgdeactivate, pgdeactivate); + mod_page_state(pgkeephot, pgmoved); } /* @@ -885,13 +893,46 @@ shrink_zone(struct zone *zone, struct sc } } - update_page_age(zone); throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); } /* + * Helper functions to adjust nice level of kswapd, based on the priority of + * the task (p) that called it. If it is already higher priority we do not + * demote its nice level since it is still working on behalf of a higher + * priority task. With kernel threads we leave it at nice 0. + * + * We don't ever run kswapd real time, so if a real time task calls kswapd we + * set it to highest SCHED_NORMAL priority. + */ +static int effective_sc_prio(struct task_struct *p) +{ + if (likely(p->mm)) { + if (rt_task(p)) + return -20; + if (batch_task(p)) + return 19; + return task_nice(p); + } + return 0; +} + +static void set_kswapd_nice(task_t *kswapd, task_t *p, int active) +{ + long nice = effective_sc_prio(p); + + if (task_nice(kswapd) > nice || !active) + set_user_nice(kswapd, nice); +} + +static int sc_priority(struct task_struct *p) +{ + return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); +} + +/* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. @@ -945,7 +986,8 @@ shrink_caches(struct zone **zones, struc * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
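
The arithmetic in sc_priority() deserves a worked example. With DEF_PRIORITY at its 2.6.14 value of 12, a nice +19 batch caller starts reclaim at priority 12 + 12*19/40 = 17 and so begins with very light scanning, while a real-time caller is treated as nice -20 and starts at 12 - 6 = 6, scanning far more aggressively from the first pass. A tiny sketch of that mapping (the effective nice values are assumed inputs, not read from a real task):

#include <stdio.h>

#define DEF_PRIORITY	12	/* as in 2.6.14 mm/vmscan.c */

/* Illustrative mapping from a caller's effective nice value (-20..19)
 * to the starting scan priority used by try_to_free_pages()/kswapd. */
static int scan_priority_for(int effective_nice)
{
	return DEF_PRIORITY + DEF_PRIORITY * effective_nice / 40;
}

int main(void)
{
	printf("rt caller     -> %d\n", scan_priority_for(-20));	/* 6  */
	printf("nice 0 caller -> %d\n", scan_priority_for(0));		/* 12 */
	printf("batch caller  -> %d\n", scan_priority_for(19));		/* 17 */
	return 0;
}
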
*/ -int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) +int try_to_free_pages(struct zone **zones, unsigned int gfp_mask, + struct task_struct *p) { int priority; int ret = 0; @@ -953,7 +995,10 @@ int try_to_free_pages(struct zone **zone struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; unsigned long lru_pages = 0; - int i; + int i, scan_priority = DEF_PRIORITY; + + if (p) + scan_priority = sc_priority(p); delay_prefetch(); @@ -969,11 +1014,11 @@ int try_to_free_pages(struct zone **zone if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; + zone->temp_priority = scan_priority; lru_pages += zone->nr_active + zone->nr_inactive; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; sc.nr_reclaimed = 0; @@ -1005,7 +1050,7 @@ int try_to_free_pages(struct zone **zone } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc.nr_scanned && priority < scan_priority - 2) blk_congestion_wait(WRITE, HZ/10); } out: @@ -1048,13 +1093,15 @@ out: static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) { int to_free = nr_pages; - int all_zones_ok; + int all_zones_ok = 0; int priority; - int i; + int i, scan_priority; int total_scanned, total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; + scan_priority = sc_priority(pgdat->kswapd); + loop_again: total_scanned = 0; total_reclaimed = 0; @@ -1068,10 +1115,10 @@ loop_again: for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->temp_priority = DEF_PRIORITY; + zone->temp_priority = scan_priority; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ int begin_zone = -1; unsigned long lru_pages = 0; @@ -1087,13 +1134,11 @@ loop_again: struct zone *zone = pgdat->node_zones + i; unsigned long watermark; - update_page_age(zone); - if (zone->present_pages == 0) continue; if (zone->all_unreclaimable && - priority != DEF_PRIORITY) + priority != scan_priority) continue; /* @@ -1103,7 +1148,7 @@ loop_again: */ watermark = zone->pages_high + (zone->pages_high * priority / - DEF_PRIORITY); + scan_priority); if (!zone_watermark_ok(zone, order, watermark, 0, 0, 0)) { @@ -1155,13 +1200,13 @@ loop_again: if (zone->present_pages == 0) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != scan_priority) continue; if (nr_pages == 0) { /* Not software suspend */ unsigned long watermark = zone->pages_high + (zone->pages_high * priority / - DEF_PRIORITY); + scan_priority); if (!zone_watermark_ok(zone, order, watermark, end_zone, 0, 0)) all_zones_ok = 0; @@ -1204,7 +1249,7 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) + if (total_scanned && priority < scan_priority - 2) blk_congestion_wait(WRITE, HZ/10); /* @@ -1294,6 +1339,7 @@ static int kswapd(void *p) */ order = new_order; } else { + set_user_nice(tsk, 0); schedule(); order = pgdat->kswapd_max_order; } @@ -1307,21 +1353,27 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. 
*/ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) { pg_data_t *pgdat; + int active; if (zone->present_pages == 0) return; pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) return; + if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) return; - if (!waitqueue_active(&pgdat->kswapd_wait)) + + active = waitqueue_active(&pgdat->kswapd_wait); + set_kswapd_nice(pgdat->kswapd, p, active); + if (!active) return; wake_up_interruptible(&pgdat->kswapd_wait); }
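
The wakeup path above ties the pieces together: an idle kswapd adopts the waker's niceness, a busy kswapd is only ever made more urgent (numerically lower nice), never lazier, and kswapd resets itself to nice 0 before going back to sleep (the set_user_nice(tsk, 0) in kswapd() above). A short model of that hand-off (wake_kswapd_model() is illustrative; its condition mirrors set_kswapd_nice()):

#include <stdio.h>

/* Current nice level of the modelled kswapd thread. */
static int kswapd_nice = 0;

/* Models set_kswapd_nice(): take the waker's nice level if kswapd is
 * idle, or if the waker is more urgent than kswapd's current level. */
static void wake_kswapd_model(int waker_nice, int active)
{
	if (kswapd_nice > waker_nice || !active)
		kswapd_nice = waker_nice;
}

int main(void)
{
	wake_kswapd_model(19, 0);	/* idle, nice +19 waker -> 19 */
	printf("after batch wakeup:   %d\n", kswapd_nice);

	wake_kswapd_model(0, 1);	/* busy, nice 0 waker   -> 0  */
	printf("after normal wakeup:  %d\n", kswapd_nice);

	wake_kswapd_model(19, 1);	/* busy, nice +19 waker -> stays 0 */
	printf("after another batch wakeup: %d\n", kswapd_nice);

	kswapd_nice = 0;		/* kswapd sleeps: reset to nice 0 */
	return 0;
}
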