diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/Documentation/sysctl/vm.txt linux-2.6.14-ck2/Documentation/sysctl/vm.txt --- linux-2.6.14-ck1/Documentation/sysctl/vm.txt 2005-11-03 00:02:31.000000000 +1100 +++ linux-2.6.14-ck2/Documentation/sysctl/vm.txt 2005-11-03 00:02:46.000000000 +1100 @@ -27,6 +27,7 @@ Currently, these files are in /proc/sys/ - laptop_mode - block_dump - swap_prefetch +- readahead_ratio ============================================================== @@ -114,3 +115,16 @@ except when laptop_mode is enabled and t Setting it to 0 disables prefetching entirely. The default value is dependant on ramsize. + +============================================================== + +readahead_ratio + +This limits the read-ahead size to a percentage of the thrashing-threshold. +The thrashing-threshold is dynamically estimated according to the +_history_ read speed and system load, and is used to limit the +_future_ read-ahead request size. So you should set it to a low +value if you do not have enough memory to counteract the I/O load +fluctuation. + +The default value is 50. diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/fs/buffer.c linux-2.6.14-ck2/fs/buffer.c --- linux-2.6.14-ck1/fs/buffer.c 2005-10-28 20:22:00.000000000 +1000 +++ linux-2.6.14-ck2/fs/buffer.c 2005-11-03 00:02:46.000000000 +1100 @@ -504,7 +504,7 @@ static void free_more_memory(void) for_each_pgdat(pgdat) { zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS); + try_to_free_pages(zones, GFP_NOFS, NULL); } } diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/fs.h linux-2.6.14-ck2/include/linux/fs.h --- linux-2.6.14-ck1/include/linux/fs.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/fs.h 2005-11-03 00:02:46.000000000 +1100 @@ -570,28 +570,15 @@ struct file_ra_state { unsigned long mmap_hit; /* Cache hit stat for mmap accesses */ unsigned long mmap_miss; /* Cache miss stat for mmap accesses */ + unsigned long age; unsigned long la_index; unsigned long ra_index; unsigned long lookahead_index; unsigned long readahead_index; - unsigned long nr_page_aging; }; #define RA_FLAG_MISS 0x01 /* a cache miss occured against this file */ #define RA_FLAG_INCACHE 0x02 /* file is already in cache */ -#define RA_CLASS_SHIFT 3 -#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) -enum file_ra_class { /* the same order must be kept in page_state */ - RA_CLASS_NEWFILE = 1, - RA_CLASS_STATE, - RA_CLASS_CONTEXT, - RA_CLASS_CONTEXT_ACCELERATED, - RA_CLASS_BACKWARD, - /* RA_CLASS_AROUND, */ - RA_CLASS_RANDOM, - RA_CLASS_END, -}; - struct file { struct list_head f_list; struct dentry *f_dentry; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mm.h linux-2.6.14-ck2/include/linux/mm.h --- linux-2.6.14-ck1/include/linux/mm.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mm.h 2005-11-03 00:02:46.000000000 +1100 @@ -880,6 +880,9 @@ int write_one_page(struct page *page, in #define VM_MAX_CACHE_HIT 256 /* max pages in a row in cache before * turning readahead off */ +/* turn on read-ahead thrashing protection if (readahead_ratio >= ##) */ +#define VM_READAHEAD_PROTECT_RATIO 80 + int do_page_cache_readahead(struct address_space *mapping, struct file *filp, unsigned long offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, @@ -900,8 +903,7 @@ page_cache_readahead_adaptive(struct add unsigned long 
index, unsigned long last_index); void fastcall ra_access(struct file_ra_state *ra, struct page *page); int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list); -int save_chunk(struct page *head, struct page *tail, - struct list_head *save_list); + /* Do stack extension */ extern int expand_stack(struct vm_area_struct * vma, unsigned long address); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mm_inline.h linux-2.6.14-ck2/include/linux/mm_inline.h --- linux-2.6.14-ck1/include/linux/mm_inline.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mm_inline.h 2004-03-11 21:29:27.000000000 +1100 @@ -11,7 +11,6 @@ add_page_to_inactive_list(struct zone *z { list_add(&page->lru, &zone->inactive_list); zone->nr_inactive++; - zone->nr_page_aging++; } static inline void diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/mmzone.h linux-2.6.14-ck2/include/linux/mmzone.h --- linux-2.6.14-ck1/include/linux/mmzone.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/mmzone.h 2005-11-03 00:02:46.000000000 +1100 @@ -316,7 +316,7 @@ void __get_zone_counts(unsigned long *ac void get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free); void build_all_zonelists(void); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p); int zone_watermark_ok(struct zone *z, int order, unsigned long mark, int alloc_type, int can_try_harder, int gfp_high); @@ -342,11 +342,13 @@ unsigned long __init node_memmap_size_by #define PAGE_AGE_MASK ((1 << PAGE_AGE_SHIFT) - 1) /* - * The percent of pages in inactive_list that have been scanned / aged. - * It's not really ##%, but a high resolution normalized value. + * Keep track of the percent of pages in inactive_list that have been scanned + * / aged. It's not really ##%, but a high resolution normalized value. 
*/ -static inline void update_page_age(struct zone *z) +static inline void update_page_age(struct zone *z, int nr_scan) { + z->nr_page_aging += nr_scan; + if (z->nr_page_aging - z->aging_milestone > z->nr_inactive) z->aging_milestone += z->nr_inactive; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/page-flags.h linux-2.6.14-ck2/include/linux/page-flags.h --- linux-2.6.14-ck1/include/linux/page-flags.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/page-flags.h 2005-11-03 00:02:46.000000000 +1100 @@ -106,6 +106,8 @@ struct page_state { unsigned long pgfree; /* page freeings */ unsigned long pgactivate; /* pages moved inactive->active */ unsigned long pgdeactivate; /* pages moved active->inactive */ + unsigned long pgkeephot; /* pages sent back to active */ + unsigned long pgkeepcold; /* pages sent back to inactive */ unsigned long pgfault; /* faults (major+minor) */ unsigned long pgmajfault; /* faults (major only) */ @@ -133,63 +135,6 @@ struct page_state { unsigned long pgrotated; /* pages rotated to tail of the LRU */ unsigned long nr_bounce; /* pages for bounce buffers */ - - unsigned long cache_miss; /* read cache misses */ - unsigned long readrandom; /* random reads */ - unsigned long pgreadrandom; /* random read pages */ - unsigned long readahead_rescue; /* read-aheads rescued*/ - unsigned long pgreadahead_rescue; - unsigned long readahead_end; /* read-aheads passed EOF */ - - unsigned long readahead; /* read-aheads issued */ - unsigned long readahead_return; /* look-ahead marks returned */ - unsigned long readahead_eof; /* read-aheads stop at EOF */ - unsigned long pgreadahead; /* read-ahead pages issued */ - unsigned long pgreadahead_hit; /* read-ahead pages accessed */ - unsigned long pgreadahead_eof; - - unsigned long ra_newfile; /* read-ahead on start of file */ - unsigned long ra_newfile_return; - unsigned long ra_newfile_eof; - unsigned long pgra_newfile; - unsigned long pgra_newfile_hit; - unsigned long pgra_newfile_eof; - - unsigned long ra_state; /* state based read-ahead */ - unsigned long ra_state_return; - unsigned long ra_state_eof; - unsigned long pgra_state; - unsigned long pgra_state_hit; - unsigned long pgra_state_eof; - - unsigned long ra_context; /* context based read-ahead */ - unsigned long ra_context_return; - unsigned long ra_context_eof; - unsigned long pgra_context; - unsigned long pgra_context_hit; - unsigned long pgra_context_eof; - - unsigned long ra_contexta; /* accelerated context based read-ahead */ - unsigned long ra_contexta_return; - unsigned long ra_contexta_eof; - unsigned long pgra_contexta; - unsigned long pgra_contexta_hit; - unsigned long pgra_contexta_eof; - - unsigned long ra_backward; /* prefetch pages for backward reading */ - unsigned long ra_backward_return; - unsigned long ra_backward_eof; - unsigned long pgra_backward; - unsigned long pgra_backward_hit; - unsigned long pgra_backward_eof; - - unsigned long ra_random; /* read-ahead on seek-and-read-pages */ - unsigned long ra_random_return; - unsigned long ra_random_eof; - unsigned long pgra_random; - unsigned long pgra_random_hit; - unsigned long pgra_random_eof; - }; extern void get_page_state(struct page_state *ret); @@ -374,9 +319,7 @@ extern void __mod_page_state(unsigned lo #define PageReadahead(page) test_bit(PG_readahead, &(page)->flags) #define SetPageReadahead(page) set_bit(PG_readahead, &(page)->flags) -#define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags) #define TestClearPageReadahead(page) 
test_and_clear_bit(PG_readahead, &(page)->flags) -#define TestSetPageReadahead(page) test_and_set_bit(PG_readahead, &(page)->flags) struct page; /* forward declaration */ @@ -394,4 +337,28 @@ static inline void set_page_writeback(st test_set_page_writeback(page); } +#if PG_activate < PG_referenced +#error unexpected page flags order +#endif + +#define PAGE_REFCNT_0 0 +#define PAGE_REFCNT_1 (1 << PG_referenced) +#define PAGE_REFCNT_2 (1 << PG_activate) +#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) +#define PAGE_REFCNT_MASK PAGE_REFCNT_3 + +/* + * STATUS REFERENCE COUNT + * __ 0 + * _R PAGE_REFCNT_1 + * A_ PAGE_REFCNT_2 + * AR PAGE_REFCNT_3 + * + * A/R: Active / Referenced + */ +static inline unsigned long page_refcnt(struct page *page) +{ + return page->flags & PAGE_REFCNT_MASK; +} + #endif /* PAGE_FLAGS_H */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/radix-tree.h linux-2.6.14-ck2/include/linux/radix-tree.h --- linux-2.6.14-ck1/include/linux/radix-tree.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/radix-tree.h 2005-11-03 00:02:46.000000000 +1100 @@ -22,12 +22,39 @@ #include #include +#ifdef __KERNEL__ +#define RADIX_TREE_MAP_SHIFT 6 +#else +#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ +#endif +#define RADIX_TREE_TAGS 2 + +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +#define RADIX_TREE_TAG_LONGS \ + ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; + unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; +}; + struct radix_tree_root { unsigned int height; unsigned int gfp_mask; struct radix_tree_node *rnode; }; +/* + * Support access patterns with strong locality. + */ +struct radix_tree_cache { + unsigned long first_index; + struct radix_tree_node *tree_node; +}; + #define RADIX_TREE_INIT(mask) { \ .height = 0, \ .gfp_mask = (mask), \ @@ -45,10 +72,13 @@ do { \ } while (0) int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); -void *radix_tree_delete(struct radix_tree_root *, unsigned long); -void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void *radix_tree_lookup_node(struct radix_tree_root *, unsigned long, unsigned int); +void *radix_tree_delete(struct radix_tree_root *, unsigned long); +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan); unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items); @@ -70,19 +100,119 @@ static inline void radix_tree_preload_en preempt_enable(); } -/* - * Support access patterns with locality. +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. 
*/ -struct radix_tree_cache { - unsigned long first_index; - struct radix_tree_node *tree_node; -}; +static inline void *radix_tree_lookup(struct radix_tree_root *root, + unsigned long index) +{ + return radix_tree_lookup_node(root, index, 0); +} + +/** + * radix_tree_lookup_slot - lookup a slot in a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup the slot corresponding to the position @index in the radix tree + * @root. This is useful for update-if-exists operations. + */ +static inline void **radix_tree_lookup_slot(struct radix_tree_root *root, + unsigned long index) +{ + struct radix_tree_node *node; + + node = radix_tree_lookup_node(root, index, 1); + return node->slots + (index & RADIX_TREE_MAP_MASK); +} -void radix_tree_cache_init(struct radix_tree_cache *cache); -void *radix_tree_cache_lookup(struct radix_tree_root *, - struct radix_tree_cache *, unsigned long); -int radix_tree_cache_size(struct radix_tree_cache *cache); -int radix_tree_cache_count(struct radix_tree_cache *cache); -int radix_tree_cache_first_index(struct radix_tree_cache *cache); +/** + * radix_tree_cache_lookup_node - cached lookup node + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root, + * and return the node @level levels from the bottom in the search path. + * @cache stores the last accessed upper level tree node by this + * function, and is always checked first before searching in the tree. + * It can improve speed for access patterns with strong locality. + * NOTE: + * - The cache becomes invalid on leaving the lock; + * - Do not intermix calls with different @level. + */ +static inline void *radix_tree_cache_lookup_node(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index, unsigned int level) +{ + struct radix_tree_node *node; + unsigned long i; + unsigned long mask; + + if (level && level >= root->height) + return root->rnode; + + i = ((index >> (level * RADIX_TREE_MAP_SHIFT)) & RADIX_TREE_MAP_MASK); + mask = ~((RADIX_TREE_MAP_SIZE << (level * RADIX_TREE_MAP_SHIFT)) - 1); + + if ((index & mask) == cache->first_index) + return cache->tree_node->slots[i]; + + node = radix_tree_lookup_node(root, index, level + 1); + if (!node) + return 0; + + cache->tree_node = node; + cache->first_index = (index & mask); + return node->slots[i]; +} + +/** + * radix_tree_cache_lookup - cached lookup page + * @root: radix tree root + * @cache: look-aside cache + * @index: index key + * + * Lookup the item at the position @index in the radix tree @root. 
+ */ +static inline void *radix_tree_cache_lookup(struct radix_tree_root *root, + struct radix_tree_cache *cache, + unsigned long index) +{ + return radix_tree_cache_lookup_node(root, cache, index, 0); +} + +static inline void radix_tree_cache_init(struct radix_tree_cache *cache) +{ + cache->first_index = 0x77; + cache->tree_node = NULL; /* just to kill gcc warning */ +} + +static inline int radix_tree_cache_size(struct radix_tree_cache *cache) +{ + return RADIX_TREE_MAP_SIZE; +} + +static inline int radix_tree_cache_count(struct radix_tree_cache *cache) +{ + if (cache->first_index != 0x77) + return cache->tree_node->count; + else + return 0; +} + +static inline int radix_tree_cache_full(struct radix_tree_cache *cache) +{ + return radix_tree_cache_count(cache) == radix_tree_cache_size(cache); +} + +static inline int radix_tree_cache_first_index(struct radix_tree_cache *cache) +{ + return cache->first_index; +} #endif /* _LINUX_RADIX_TREE_H */ diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/swap.h linux-2.6.14-ck2/include/linux/swap.h --- linux-2.6.14-ck1/include/linux/swap.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/swap.h 2005-11-03 00:02:46.000000000 +1100 @@ -171,7 +171,8 @@ extern int rotate_reclaimable_page(struc extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int); +extern int try_to_free_pages(struct zone **, unsigned int, + struct task_struct *p); extern int zone_reclaim(struct zone *, unsigned int, unsigned int); extern int shrink_all_memory(int); extern int vm_mapped; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/sysctl.h linux-2.6.14-ck2/include/linux/sysctl.h --- linux-2.6.14-ck1/include/linux/sysctl.h 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/include/linux/sysctl.h 2005-11-03 00:02:46.000000000 +1100 @@ -183,9 +183,9 @@ enum VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ VM_SWAP_TOKEN_TIMEOUT=28, /* default time for token time out */ - VM_HARDMAPLIMIT=29, /* Make mapped a hard limit */ - VM_SWAP_PREFETCH=30, /* int: amount to swap prefetch */ - VM_READAHEAD_RATIO=31, /* percent of read-ahead size to thrashing-threshold */ + VM_SWAP_PREFETCH=29, /* int: amount to swap prefetch */ + VM_READAHEAD_RATIO=30, /* percent of read-ahead size to thrashing-threshold */ + VM_HARDMAPLIMIT=31, /* Make mapped a hard limit */ }; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/include/linux/writeback.h linux-2.6.14-ck2/include/linux/writeback.h --- linux-2.6.14-ck1/include/linux/writeback.h 2005-10-28 20:22:02.000000000 +1000 +++ linux-2.6.14-ck2/include/linux/writeback.h 2005-11-03 00:02:46.000000000 +1100 @@ -90,6 +90,12 @@ void laptop_io_completion(void); void laptop_sync_completion(void); void throttle_vm_writeout(void); +extern struct timer_list laptop_mode_wb_timer; +static inline int laptop_spinned_down(void) +{ + return !timer_pending(&laptop_mode_wb_timer); +} + /* These are exported to sysctl. */ extern int dirty_background_ratio; extern int vm_dirty_ratio; diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/kernel/sched.c linux-2.6.14-ck2/kernel/sched.c --- linux-2.6.14-ck1/kernel/sched.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/kernel/sched.c 2005-11-03 00:02:46.000000000 +1100 @@ -16,9 +16,9 @@ * by Davide Libenzi, preemptible kernel bits by Robert Love. 
* 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin - * 2005-09-16 New staircase scheduling policy by Con Kolivas with help + * 2005-11-02 New staircase scheduling policy by Con Kolivas with help * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. - * Staircase v12.1 + * Staircase v12.2 */ #include @@ -779,7 +779,11 @@ static inline void recalc_task_prio(task NS_TO_JIFFIES(sleep_time) < p->slice) { p->flags &= ~PF_NONSLEEP; dec_burst(p); - p->totalrun += sleep_time - JIFFIES_TO_NS(p->slice); + p->totalrun -= JIFFIES_TO_NS(p->slice); + if (sleep_time > p->totalrun) + p->totalrun = 0; + else + p->totalrun -= sleep_time; goto out; } @@ -3431,6 +3435,8 @@ void set_user_nice(task_t *p, long nice) delta = new_prio - old_prio; p->static_prio = NICE_TO_PRIO(nice); p->prio += delta; + if (p->burst > burst(p)) + p->burst = burst(p); if (queued) { enqueue_task(p, rq); diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/lib/radix-tree.c linux-2.6.14-ck2/lib/radix-tree.c --- linux-2.6.14-ck1/lib/radix-tree.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/lib/radix-tree.c 2005-11-03 00:02:46.000000000 +1100 @@ -32,25 +32,6 @@ #include -#ifdef __KERNEL__ -#define RADIX_TREE_MAP_SHIFT 6 -#else -#define RADIX_TREE_MAP_SHIFT 3 /* For more stressful testing */ -#endif -#define RADIX_TREE_TAGS 2 - -#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) -#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) - -#define RADIX_TREE_TAG_LONGS \ - ((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG) - -struct radix_tree_node { - unsigned int count; - void *slots[RADIX_TREE_MAP_SIZE]; - unsigned long tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS]; -}; - struct radix_tree_path { struct radix_tree_node *node; int offset; @@ -134,6 +115,7 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } +EXPORT_SYMBOL(radix_tree_preload); static inline void tag_set(struct radix_tree_node *node, int tag, int offset) { @@ -281,6 +263,19 @@ int radix_tree_insert(struct radix_tree_ } EXPORT_SYMBOL(radix_tree_insert); +/** + * radix_tree_lookup_node - low level lookup routine + * @root: radix tree root + * @index: index key + * @level: stop at that many levels from bottom + * + * Lookup the item at the position @index in the radix tree @root. + * The return value is: + * @level == 0: page at @index; + * @level == 1: the corresponding bottom level tree node; + * @level < height: (height - @level)th level tree node; + * @level >= height: root node. + */ void *radix_tree_lookup_node(struct radix_tree_root *root, unsigned long index, unsigned int level) { @@ -308,63 +303,111 @@ void *radix_tree_lookup_node(struct radi EXPORT_SYMBOL(radix_tree_lookup_node); /** - * radix_tree_lookup - perform lookup operation on a radix tree + * radix_tree_lookup_head - lookup the head index * @root: radix tree root * @index: index key + * @max_scan: max items to scan * - * Lookup the item at the position @index in the radix tree @root. + * Lookup head index of the segment which contains @index. A segment is + * a set of continuous pages in a file. 
+ * CASE RETURN VALUE + * no page at @index (not head) = @index + 1 + * found in the range @index - @max_scan < (head index) <= @index + * not found in range (unfinished head) <= @index - @max_scan */ -void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) -{ - return radix_tree_lookup_node(root, index, 0); -} -EXPORT_SYMBOL(radix_tree_lookup); - -void *radix_tree_cache_lookup(struct radix_tree_root *root, - struct radix_tree_cache *cache, - unsigned long index) +unsigned long radix_tree_lookup_head(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) { + struct radix_tree_cache cache; struct radix_tree_node *node; + int i; + unsigned long origin; - if ((index & ~RADIX_TREE_MAP_MASK) == cache->first_index) - return cache->tree_node->slots[index & RADIX_TREE_MAP_MASK]; + origin = index; + if (unlikely(max_scan > index)) + max_scan = index; + radix_tree_cache_init(&cache); + +next_node: + if (origin - index > max_scan) + goto out; - node = radix_tree_lookup_node(root, index, 1); + node = radix_tree_cache_lookup_node(root, &cache, index, 1); if (!node) - return 0; + goto out; - cache->tree_node = node; - cache->first_index = (index & ~RADIX_TREE_MAP_MASK); - return node->slots[index & RADIX_TREE_MAP_MASK]; -} -EXPORT_SYMBOL(radix_tree_cache_lookup); + if (node->count == RADIX_TREE_MAP_SIZE) { + if (index < RADIX_TREE_MAP_SIZE) { + index = -1; + goto out; + } + index = (index - RADIX_TREE_MAP_SIZE) | RADIX_TREE_MAP_MASK; + goto next_node; + } -void radix_tree_cache_init(struct radix_tree_cache *cache) -{ - cache->first_index = 1; -} -EXPORT_SYMBOL(radix_tree_cache_init); + for (i = index & RADIX_TREE_MAP_MASK; i >= 0; i--, index--) { + if (!node->slots[i]) + goto out; + } -int radix_tree_cache_size(struct radix_tree_cache *cache) -{ - return RADIX_TREE_MAP_SIZE; -} -EXPORT_SYMBOL(radix_tree_cache_size); + goto next_node; -int radix_tree_cache_count(struct radix_tree_cache *cache) -{ - if (cache->first_index != 1) - return cache->tree_node->count; - else - return 0; +out: + return index + 1; } -EXPORT_SYMBOL(radix_tree_cache_count); +EXPORT_SYMBOL(radix_tree_lookup_head); -int radix_tree_cache_first_index(struct radix_tree_cache *cache) +/** + * radix_tree_lookup_tail - lookup the tail index + * @root: radix tree root + * @index: index key + * @max_scan: max items to scan + * + * Lookup tail(pass the end) index of the segment which contains @index. + * A segment is a set of continuous pages in a file. 
+ * CASE RETURN VALUE + * found in the range @index <= (tail index) < @index + @max_scan + * not found in range @index + @max_scan <= (non tail) + */ +unsigned long radix_tree_lookup_tail(struct radix_tree_root *root, + unsigned long index, unsigned int max_scan) { - return cache->first_index; + struct radix_tree_cache cache; + struct radix_tree_node *node; + int i; + unsigned long origin; + + origin = index; + if (unlikely(index + max_scan < index)) + max_scan = LONG_MAX - index; + radix_tree_cache_init(&cache); + +next_node: + if (index - origin >= max_scan) + goto out; + + node = radix_tree_cache_lookup_node(root, &cache, index, 1); + if (!node) + goto out; + + if (node->count == RADIX_TREE_MAP_SIZE) { + index = (index | RADIX_TREE_MAP_MASK) + 1; + if (unlikely(!index)) + goto out; + goto next_node; + } + + for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++, index++) { + if (!node->slots[i]) + goto out; + } + + goto next_node; + +out: + return index; } -EXPORT_SYMBOL(radix_tree_cache_first_index); +EXPORT_SYMBOL(radix_tree_lookup_tail); /** * radix_tree_tag_set - set a tag on a radix tree node diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/Makefile linux-2.6.14-ck2/Makefile --- linux-2.6.14-ck1/Makefile 2005-11-03 00:02:31.000000000 +1100 +++ linux-2.6.14-ck2/Makefile 2005-11-03 00:02:46.000000000 +1100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 14 -EXTRAVERSION =-ck1 +EXTRAVERSION =-ck2 NAME=Cognac Affected Albatross # *DOCUMENTATION* diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/filemap.c linux-2.6.14-ck2/mm/filemap.c --- linux-2.6.14-ck1/mm/filemap.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/filemap.c 2005-11-03 00:02:46.000000000 +1100 @@ -765,7 +765,7 @@ void do_generic_mapping_read(struct addr nr = nr - offset; cond_resched(); - + if (readahead_ratio <= 9 && index == next_index) next_index = page_cache_readahead(mapping, &ra, filp, index, last_index - index); @@ -794,6 +794,7 @@ find_page: if (prev_page) page_cache_release(prev_page); prev_page = page; + ra_access(&ra, page); if (!PageUptodate(page)) goto page_not_up_to_date; page_ok: @@ -810,7 +811,6 @@ page_ok: * in succession, only mark it as accessed the first time. */ if (prev_index != index) { - ra_access(&ra, page); mark_page_accessed(page); } prev_index = index; @@ -1309,6 +1309,8 @@ retry_find: if (!did_readaround) ra->mmap_hit++; + ra_access(ra, page); + /* * Ok, found a page in the page cache, now we need to check * that it's up-to-date. @@ -1320,10 +1322,11 @@ success: /* * Found the page and have a reference on it. */ - ra_access(ra, page); mark_page_accessed(page); if (type) *type = majmin; + if (readahead_ratio > 9) + ra->prev_page = page->index; return page; outside_data_content: diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/page_alloc.c linux-2.6.14-ck2/mm/page_alloc.c --- linux-2.6.14-ck1/mm/page_alloc.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/page_alloc.c 2005-11-03 00:02:46.000000000 +1100 @@ -110,10 +110,9 @@ static void bad_page(const char *functio 1 << PG_private | 1 << PG_locked | 1 << PG_active | - 1 << PG_activate| 1 << PG_dirty | 1 << PG_reclaim | - 1 << PG_slab | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -820,11 +819,11 @@ __alloc_pages(gfp_t gfp_mask, unsigned i classzone_idx = zone_idx(zones[0]); -restart: /* * Go through the zonelist once, looking for a zone with enough free. 
* See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ +restart: /* * To fulfill three goals: * - balanced page aging @@ -904,7 +903,7 @@ zone_reclaim_retry: goto got_pg; try_harder: - wakeup_kswapd(z, order); + wakeup_kswapd(z, order, p); /* * Put stress on the zone. Let __GFP_HIGH and allocations @@ -956,7 +955,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask); + did_some_progress = try_to_free_pages(zones, gfp_mask, p); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -2282,6 +2281,8 @@ static char *vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pgkeephot", + "pgkeepcold", "pgfault", "pgmajfault", @@ -2309,63 +2310,6 @@ static char *vmstat_text[] = { "pgrotated", "nr_bounce", - - "cache_miss", - "readrandom", - "pgreadrandom", - "readahead_rescue", - "pgreadahead_rescue", - "readahead_end", - - "readahead", - "readahead_return", - "readahead_eof", - "pgreadahead", - "pgreadahead_hit", - "pgreadahead_eof", - - "ra_newfile", - "ra_newfile_return", - "ra_newfile_eof", - "pgra_newfile", - "pgra_newfile_hit", - "pgra_newfile_eof", - - "ra_state", - "ra_state_return", - "ra_state_eof", - "pgra_state", - "pgra_state_hit", - "pgra_state_eof", - - "ra_context", - "ra_context_return", - "ra_context_eof", - "pgra_context", - "pgra_context_hit", - "pgra_context_eof", - - "ra_contexta", - "ra_contexta_return", - "ra_contexta_eof", - "pgra_contexta", - "pgra_contexta_hit", - "pgra_contexta_eof", - - "ra_backward", - "ra_backward_return", - "ra_backward_eof", - "pgra_backward", - "pgra_backward_hit", - "pgra_backward_eof", - - "ra_random", - "ra_random_return", - "ra_random_eof", - "pgra_random", - "pgra_random_hit", - "pgra_random_eof", - }; static void *vmstat_start(struct seq_file *m, loff_t *pos) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/page-writeback.c linux-2.6.14-ck2/mm/page-writeback.c --- linux-2.6.14-ck1/mm/page-writeback.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/page-writeback.c 2005-11-03 00:02:46.000000000 +1100 @@ -369,7 +369,7 @@ static void wb_timer_fn(unsigned long un static void laptop_timer_fn(unsigned long unused); static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); +DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); /* * Periodic writeback of "old" data. diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/readahead.c linux-2.6.14-ck2/mm/readahead.c --- linux-2.6.14-ck1/mm/readahead.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/readahead.c 2005-11-03 00:02:46.000000000 +1100 @@ -14,56 +14,256 @@ #include #include #include +#include +/* Set look-ahead size to 1/8 of the thrashing-threshold. */ +#define LOOKAHEAD_RATIO 8 + +/* Set read-ahead size to ##% of the thrashing-threshold. */ +int readahead_ratio = 50; +EXPORT_SYMBOL(readahead_ratio); + +/* Analog to nr_page_aging. + * But mainly increased on fresh page references, so is much more smoother. + */ +DEFINE_PER_CPU(unsigned long, smooth_aging); +EXPORT_PER_CPU_SYMBOL(smooth_aging); + +/* Detailed classification of read-ahead behaviors. 
*/ +#define RA_CLASS_SHIFT 3 +#define RA_CLASS_MASK ((1 << RA_CLASS_SHIFT) - 1) +enum ra_class { + RA_CLASS_ALL, + RA_CLASS_NEWFILE, + RA_CLASS_STATE, + RA_CLASS_CONTEXT, + RA_CLASS_CONTEXT_ACCELERATED, + RA_CLASS_BACKWARD, + RA_CLASS_RANDOM_THRASHING, + RA_CLASS_RANDOM_SEEK, + RA_CLASS_END, +}; + +/* Read-ahead events to be accounted. */ +enum ra_event { + RA_EVENT_CACHE_MISS, /* read cache misses */ + RA_EVENT_READRANDOM, /* random reads */ + RA_EVENT_IO_CONGESTION, /* io congestion */ + RA_EVENT_IO_CACHE_HIT, /* canceled io due to cache hit */ + RA_EVENT_IO_BLOCK, /* read on locked page */ + + RA_EVENT_READAHEAD, /* read-ahead issued */ + RA_EVENT_READAHEAD_HIT, /* read-ahead page hit */ + RA_EVENT_LOOKAHEAD, /* look-ahead issued */ + RA_EVENT_LOOKAHEAD_HIT, /* look-ahead mark hit */ + RA_EVENT_READAHEAD_EOF, /* read-ahead reaches EOF */ + RA_EVENT_READAHEAD_SHRINK, /* ra_size decreased, reflects var. */ + RA_EVENT_READAHEAD_THRASHING, /* read-ahead thrashing happened */ + RA_EVENT_READAHEAD_RESCUE, /* read-ahead rescued */ + + RA_EVENT_END +}; + +/* + * Debug facilities. + */ +#ifdef CONFIG_DEBUG_FS #define DEBUG_READAHEAD +#endif #ifdef DEBUG_READAHEAD +#include +#include +#include +#include + +static char *ra_class_name[] = { + "total", + "newfile", + "state", + "context", + "contexta", + "backward", + "onthrash", + "onraseek", + "none", +}; + +static char *ra_event_name[] = { + "cache_miss", + "read_random", + "io_congestion", + "io_cache_hit", + "io_block", + "readahead", + "readahead_hit", + "lookahead", + "lookahead_hit", + "readahead_eof", + "readahead_shrink", + "readahead_thrash", + "readahead_rescue", +}; + +static unsigned long ra_event_count[RA_CLASS_END+1][RA_EVENT_END][2]; + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ + enum ra_class c; + + c = (ra ? 
ra->flags & RA_CLASS_MASK : RA_CLASS_END); + if (e == RA_EVENT_READAHEAD_HIT && pages < 0) { + c = (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK; + pages = -pages; + } + if (!c) + c = RA_CLASS_END; + BUG_ON(c > RA_CLASS_END); + + ra_event_count[c][e][0] += 1; + ra_event_count[c][e][1] += pages; +} + +static int ra_account_show(struct seq_file *s, void *_) +{ + int i; + int c; + int e; + static char event_fmt[] = "%-16s"; + static char class_fmt[] = "%11s"; + static char item_fmt[] = "%11lu"; + static char percent_format[] = "%10lu%%"; + static char *table_name[] = { + "[table requests]", + "[table pages]", + "[table summary]"}; + + for (i = 0; i <= 1; i++) { + for (e = 0; e < RA_EVENT_END; e++) { + ra_event_count[0][e][i] = 0; + for (c = 1; c <= RA_CLASS_END; c++) + ra_event_count[0][e][i] += + ra_event_count[c][e][i]; + } + + seq_printf(s, event_fmt, table_name[i]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + for (e = 0; e < RA_EVENT_END; e++) { + if (e == RA_EVENT_READAHEAD_HIT && i == 0) + continue; + + seq_printf(s, event_fmt, ra_event_name[e]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + ra_event_count[c][e][i]); + seq_puts(s, "\n"); + } + seq_puts(s, "\n"); + } + + seq_printf(s, event_fmt, table_name[2]); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, class_fmt, ra_class_name[c]); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "random_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READRANDOM][0] * 100) / + (ra_event_count[c][RA_EVENT_READRANDOM][0] + + ra_event_count[c][RA_EVENT_READAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "ra_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_READAHEAD_HIT][1] * 100) / + (ra_event_count[c][RA_EVENT_READAHEAD][1] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "la_hit_rate"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, percent_format, + (ra_event_count[c][RA_EVENT_LOOKAHEAD_HIT][0] * 100) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_ra_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_READAHEAD][1] + + ra_event_count[c][RA_EVENT_READAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_READAHEAD][0] + 1)); + seq_puts(s, "\n"); + + seq_printf(s, event_fmt, "avg_la_size"); + for (c = 0; c <= RA_CLASS_END; c++) + seq_printf(s, item_fmt, + (ra_event_count[c][RA_EVENT_LOOKAHEAD][1] + + ra_event_count[c][RA_EVENT_LOOKAHEAD][0] / 2) / + (ra_event_count[c][RA_EVENT_LOOKAHEAD][0] + 1)); + seq_puts(s, "\n"); + + return 0; +} + +static struct dentry *readahead_dentry; + +static int ra_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, ra_account_show, NULL); +} + +static ssize_t ra_debug_write(struct file *file, const char __user *buf, + size_t size, loff_t *offset) +{ + if (file->f_dentry == readahead_dentry) + memset(ra_event_count, 0, sizeof(ra_event_count)); + return 1; +} + +static struct file_operations ra_debug_fops = { + .owner = THIS_MODULE, + .open = ra_debug_open, + .write = ra_debug_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init readahead_init(void) +{ + readahead_dentry = debugfs_create_file("readahead", + 0644, NULL, NULL, &ra_debug_fops); + return 0; +} + +module_init(readahead_init) + #define 
dprintk(args...) \ if (readahead_ratio & 1) printk(KERN_DEBUG args) #define ddprintk(args...) \ if ((readahead_ratio & 3) == 3) printk(KERN_DEBUG args) -#define ra_account_page(ra, member, delta) do { \ - unsigned long opg = offsetof(struct page_state, pgreadahead) - \ - offsetof(struct page_state, readahead); \ - unsigned long o1 = offsetof(struct page_state, member); \ - unsigned long o2 = o1 + 2 * opg * ((ra)->flags & RA_CLASS_MASK);\ - BUG_ON(opg + o2 >= sizeof(struct page_state)); \ - __mod_page_state(o1, 1UL); \ - __mod_page_state(o2, 1UL); \ - __mod_page_state(opg + o1, (delta)); \ - __mod_page_state(opg + o2, (delta)); \ -} while (0) - -#define ra_account(member, class, delta) do { \ - unsigned long opg = offsetof(struct page_state, pgreadahead) - \ - offsetof(struct page_state, readahead); \ - unsigned long o1 = offsetof(struct page_state, member); \ - unsigned long o2 = o1 + 2 * opg * (class); \ - if ((class) >= RA_CLASS_END) \ - break; \ - BUG_ON(o2 >= sizeof(struct page_state)); \ - __mod_page_state(o1, (delta)); \ - __mod_page_state(o2, (delta)); \ -} while (0) - -#else -#undef inc_page_state -#undef mod_page_state -#define inc_page_state(a) do {} while(0) -#define mod_page_state(a, b) do {} while(0) +#else /* !DEBUG_READAHEAD */ + +static inline void ra_account(struct file_ra_state *ra, + enum ra_event e, int pages) +{ +} #define dprintk(args...) do {} while(0) #define ddprintk(args...) do {} while(0) -#define ra_account(member, class, delta) do {} while(0) -#define ra_account_page(member, class, delta) do {} while(0) -#endif -/* Set look-ahead size to 1/8 of the read-ahead size. */ -#define LOOKAHEAD_RATIO 8 +#endif /* DEBUG_READAHEAD */ -/* Set read-ahead size to ##% of the thrashing-threshold. */ -int readahead_ratio = 0; -EXPORT_SYMBOL(readahead_ratio); + +/* The default max/min read-ahead pages. */ +#define MAX_RA_PAGES (VM_MAX_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) +#define MIN_RA_PAGES (VM_MIN_READAHEAD >> (PAGE_CACHE_SHIFT - 10)) void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -71,7 +271,7 @@ void default_unplug_io_fn(struct backing EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { - .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, + .ra_pages = MAX_RA_PAGES, .state = 0, .capabilities = BDI_CAP_MAP_COPY, .unplug_io_fn = default_unplug_io_fn, @@ -99,7 +299,7 @@ static inline unsigned long get_max_read static inline unsigned long get_min_readahead(struct file_ra_state *ra) { - return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE; + return MIN_RA_PAGES; } static inline void ra_off(struct file_ra_state *ra) @@ -326,7 +526,7 @@ __do_page_cache_readahead(struct address read_lock_irq(&mapping->tree_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; - + if (page_offset > end_index) break; @@ -632,38 +832,48 @@ unsigned long max_sane_readahead(unsigne * In every read-ahead chunk, it selects one page and tag it with PG_readahead. * Later when the page with PG_readahead is to be read, the logic knows that * it's time to carry out the next read-ahead chunk in advance. - * + * * a read-ahead chunk - * +-----------------------------------------+ - * | # PG_readahead | - * +-----------------------------------------+ + * +-----------------------------------------+ + * | # PG_readahead | + * +-----------------------------------------+ * ^ When this page is read, we submit I/O for the next read-ahead. 
* * * Here are some variable names used frequently: * * |<------- la_size ------>| - * +-----------------------------------------+ - * | # | - * +-----------------------------------------+ + * +-----------------------------------------+ + * | # | + * +-----------------------------------------+ * ra_index -->|<---------------- ra_size -------------->| * */ -#define next_page(page) (list_entry((page)->lru.prev, struct page, lru)) -#define prev_page(page) (list_entry((page)->lru.next, struct page, lru)) +#define next_page(pg) (list_entry((pg)->lru.prev, struct page, lru)) +#define prev_page(pg) (list_entry((pg)->lru.next, struct page, lru)) /* * The nature of read-ahead allows most tests to fail or even be wrong. * Here we just do not bother to call get_page(), it's meaningless anyway. */ +static inline struct page *__find_page(struct address_space *mapping, + unsigned long offset) +{ + return radix_tree_lookup(&mapping->page_tree, offset); +} + struct page *find_page(struct address_space *mapping, unsigned long offset) { struct page *page; read_lock_irq(&mapping->tree_lock); - page = radix_tree_lookup(&mapping->page_tree, offset); + page = __find_page(mapping, offset); read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (page) + BUG_ON(page->index != offset); +#endif return page; } @@ -680,7 +890,7 @@ static int rescue_pages(struct page *pag BUG_ON(!nr_pages || !page); pgrescue = 0; - index = page->index; + index = page_index(page); mapping = page_mapping(page); dprintk("rescue_pages(ino=%lu, index=%lu nr=%lu)\n", @@ -693,7 +903,8 @@ static int rescue_pages(struct page *pag if (!PageLRU(page)) goto out_unlock; - while (page_mapping(page) == mapping && page->index == index) { + while (page_mapping(page) == mapping && + page_index(page) == index) { struct page *the_page = page; page = next_page(page); if (!PageActive(the_page) && @@ -701,7 +912,6 @@ static int rescue_pages(struct page *pag !PageLocked(the_page) && page_count(the_page) == 1) { list_move(&the_page->lru, &zone->inactive_list); - zone->nr_page_aging++; pgrescue++; } index++; @@ -718,8 +928,7 @@ static int rescue_pages(struct page *pag out_unlock: spin_unlock_irq(&zone->lru_lock); out: - inc_page_state(readahead_rescue); - mod_page_state(pgreadahead_rescue, pgrescue); + ra_account(0, RA_EVENT_READAHEAD_RESCUE, pgrescue); return nr_pages ? index : 0; } @@ -734,7 +943,7 @@ out: * | # | # | * +---------------------------+-------------------------------------------+ * ^ ^ ^ ^ - * la_index ra_index lookahead_index readahead_index + * la_index ra_index lookahead_index readahead_index */ /* @@ -754,16 +963,16 @@ static unsigned long nr_free_inactive(vo } /* - * The accumulated count of pages pushed into inactive_list(s). + * A much smoother analog to nr_page_aging. 
*/ -static unsigned long nr_page_aging(void) +static unsigned long nr_smooth_aging(void) { - unsigned int i; + unsigned long cpu; unsigned long sum = 0; - struct zone *zones = NODE_DATA(numa_node_id())->node_zones; + cpumask_t mask = node_to_cpumask(numa_node_id()); - for (i = 0; i < MAX_NR_ZONES; i++) - sum += zones[i].nr_page_aging; + for_each_cpu_mask(cpu, mask) + sum += per_cpu(smooth_aging, cpu); return sum; } @@ -772,7 +981,7 @@ static unsigned long nr_page_aging(void) * Set class of read-ahead */ static inline void set_ra_class(struct file_ra_state *ra, - enum file_ra_class ra_class) + enum ra_class ra_class) { ra->flags <<= RA_CLASS_SHIFT; ra->flags += ra_class; @@ -788,6 +997,11 @@ static inline int ra_cache_hit(struct fi return (ra->cache_hit >> (nr * 16)) & 0xFFFF; } +/* + * Something like: + * ra_cache_hit(ra, 1) += ra_cache_hit(ra, 0); + * ra_cache_hit(ra, 0) = 0; + */ static inline void ra_addup_cache_hit(struct file_ra_state *ra) { int n; @@ -838,12 +1052,17 @@ static inline void ra_state_init(struct static inline void ra_state_update(struct file_ra_state *ra, unsigned long ra_size, unsigned long la_size) { +#ifdef DEBUG_READAHEAD + unsigned long old_ra = ra->readahead_index - ra->ra_index; + if (ra_size < old_ra && ra_cache_hit(ra, 0)) + ra_account(ra, RA_EVENT_READAHEAD_SHRINK, old_ra - ra_size); +#endif ra_addup_cache_hit(ra); ra->ra_index = ra->readahead_index; ra->la_index = ra->lookahead_index; ra->readahead_index += ra_size; ra->lookahead_index = ra->readahead_index - la_size; - ra->nr_page_aging = nr_page_aging(); + ra->age = nr_smooth_aging(); } /* @@ -866,16 +1085,7 @@ static int ra_dispatch(struct file_ra_st unsigned long ra_size; unsigned long la_size; int actual; - enum file_ra_class ra_class; - static char *ra_class_name[] = { - "newfile", - "state", - "context", - "contexta", - "backward", - /* "around", */ - "random", - }; + enum ra_class ra_class; ra_class = (ra->flags & RA_CLASS_MASK); BUG_ON(ra_class == 0 || ra_class > RA_CLASS_END); @@ -885,10 +1095,8 @@ static int ra_dispatch(struct file_ra_st la_size = ra->readahead_index - ra->lookahead_index; /* Snap to EOF. */ - if (unlikely(ra->ra_index >= eof_index)) { - inc_page_state(readahead_end); + if (unlikely(ra->ra_index >= eof_index)) return 0; - } if (ra->readahead_index + ra_size / 2 > eof_index) { if (ra_class == RA_CLASS_CONTEXT_ACCELERATED && eof_index > ra->lookahead_index + 1) @@ -902,12 +1110,16 @@ static int ra_dispatch(struct file_ra_st actual = __do_page_cache_readahead(mapping, filp, ra->ra_index, ra_size, la_size); - if (!la_size && ra->readahead_index == eof_index) - ra_account_page(ra, readahead_eof, actual); - ra_account_page(ra, readahead, actual); + if (ra->readahead_index == eof_index) + ra_account(ra, RA_EVENT_READAHEAD_EOF, actual); + if (la_size) + ra_account(ra, RA_EVENT_LOOKAHEAD, la_size); + if (ra_size > actual) + ra_account(ra, RA_EVENT_IO_CACHE_HIT, ra_size - actual); + ra_account(ra, RA_EVENT_READAHEAD, actual); dprintk("readahead-%s(ino=%lu, index=%lu, ra=%lu+%lu-%lu) = %d\n", - ra_class_name[ra_class - 1], + ra_class_name[ra_class], mapping->host->i_ino, ra->la_index, ra->ra_index, ra_size, la_size, actual); @@ -958,7 +1170,7 @@ static inline int adjust_rala(unsigned l * It is returned to make the next read-ahead request. * 2. the remained space for the current chunk * It will be checked to ensure that the current chunk is safe. 
- * + * * The computation will be pretty accurate under heavy load, and will change * vastly with light load(small global_shift), so the grow speed of ra_size * must be limited, and a moderate large stream_shift must be insured. @@ -986,11 +1198,11 @@ static inline unsigned long compute_thra unsigned long ra_size; global_size = nr_free_inactive(); - global_shift = nr_page_aging() - ra->nr_page_aging; + global_shift = nr_smooth_aging() - ra->age; stream_shift = ra_cache_hit(ra, 0); ra_size = stream_shift * - global_size * readahead_ratio / 100 / global_shift; + global_size * readahead_ratio / (100 * global_shift); if (global_size > global_shift) *remain = stream_shift * @@ -1006,7 +1218,7 @@ static inline unsigned long compute_thra return ra_size; } -/* +/* * Main function for file_ra_state based read-ahead. */ static inline unsigned long @@ -1023,7 +1235,7 @@ state_based_readahead(struct address_spa ra_old = ra->readahead_index - ra->ra_index; ra_size = compute_thrashing_threshold(ra, &remain_space); - if (readahead_ratio < 80 && + if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO && remain_space <= la_size && la_size > 1) { rescue_pages(page, la_size); return 0; @@ -1039,7 +1251,6 @@ state_based_readahead(struct address_spa return ra_dispatch(ra, mapping, filp); } - /* * Page cache context based estimation of read-ahead/look-ahead size/index. * @@ -1048,11 +1259,11 @@ state_based_readahead(struct address_spa * the start point of next read-ahead. * * The estimation theory can be illustrated with figure: - * + * * chunk A chunk B chunk C head * * l01 l11 l12 l21 l22 - *| |-->|-->| |------>|-->| |------>| + *| |-->|-->| |------>|-->| |------>| *| +-------+ +-----------+ +-------------+ | *| | # | | # | | # | | *| +-------+ +-----------+ +-------------+ | @@ -1075,33 +1286,22 @@ state_based_readahead(struct address_spa * a lower estimation of the true thrashing-threshold. */ -#if PG_activate < PG_referenced -#error unexpected page flags order -#endif - -#define PAGE_REFCNT_1 (1 << PG_referenced) -#define PAGE_REFCNT_2 (1 << PG_activate) -#define PAGE_REFCNT_3 ((1 << PG_activate) | (1 << PG_referenced)) -#define PAGE_REFCNT_MASK PAGE_REFCNT_3 /* * STATUS REFERENCE COUNT TYPE - * __ - not in inactive list - * __ 0 fresh - * _R PAGE_REFCNT_1 stale - * A_ PAGE_REFCNT_2 disturbed once - * AR PAGE_REFCNT_3 disturbed twice + * A__ 0 not in inactive list + * ___ 0 fresh + * __R PAGE_REFCNT_1 stale + * _a_ PAGE_REFCNT_2 disturbed once + * _aR PAGE_REFCNT_3 disturbed twice + * + * A/a/R: Active / aCTIVATE / Referenced */ -static inline unsigned long __page_refcnt(struct page *page) -{ - return page->flags & PAGE_REFCNT_MASK; -} - -static inline unsigned long page_refcnt(struct page *page) +static inline unsigned long cold_page_refcnt(struct page *page) { if (!page || PageActive(page)) return 0; - return __page_refcnt(page); + return page_refcnt(page); } static inline char page_refcnt_symbol(struct page *page) @@ -1110,7 +1310,7 @@ static inline char page_refcnt_symbol(st return 'X'; if (PageActive(page)) return 'A'; - switch (__page_refcnt(page)) { + switch (page_refcnt(page)) { case 0: return '_'; case PAGE_REFCNT_1: @@ -1124,201 +1324,177 @@ static inline char page_refcnt_symbol(st } /* - * Look back and count history pages to estimate thrashing-threshold. - * - * Strategies - * - Sequential read that extends from index 0 - * The counted value may well be far under the true threshold, so return - * it unmodified for further process in adjust_rala_accelerated(). 
- * - Sequential read with a large history count - * Check 3 evenly spread pages to be sure there is no hole or many - * not-yet-accessed pages. This prevents unnecessary IO, and allows some - * almost sequential patterns to survive. - * - Return equal or smaller count; but ensure a reasonable minimal value. - * - * Optimization - * - The count will normally be min(nr_lookback, offset), unless either memory - * or read speed is low, or it is still in grow up phase. - * - A rigid implementation would be a simple loop to scan page by page - * backward, though this may be unnecessary and inefficient, so the - * stepping backward/forward scheme is used. - * - * FIXME: it seems ugly :( - */ -static int count_sequential_pages(struct address_space *mapping, - int refcnt, unsigned long *remain, - unsigned long offset, + * Count/estimate cache hits in range [first_index, last_index]. + * The estimation is simple and a bit optimistic. + */ +static int count_cache_hit(struct address_space *mapping, + unsigned long first_index, unsigned long last_index) +{ + static int steps[8] = {0, 4, 2, 6, 1, 3, 5, 7}; + struct page *page; + int size = last_index - first_index + 1; + int count = 0; + int i; + + read_lock_irq(&mapping->tree_lock); + + for (i = 0; i < 8;) { + page = __find_page(mapping, + first_index + size * steps[i++] / 8); + if (cold_page_refcnt(page) >= PAGE_REFCNT_1 && ++count >= 2) + break; + } + + read_unlock_irq(&mapping->tree_lock); + + return size * count / i; +} + +/* + * Look back and check history pages to estimate thrashing-threshold. + */ +static int query_page_cache(struct address_space *mapping, + unsigned long *remain, unsigned long offset, unsigned long ra_min, unsigned long ra_max) { int step; int count; unsigned long index; unsigned long nr_lookback; - struct page *page; - struct radix_tree_cache cache; + struct radix_tree_cache cache; - *remain = 0; - nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * - 100 / (readahead_ratio + 1); - if (nr_lookback > offset) - nr_lookback = offset; - if (nr_lookback > mapping->nrpages) - nr_lookback = mapping->nrpages; - - if (nr_lookback <= ra_min * 100 / (readahead_ratio + 1)) { - *remain = nr_lookback; - return ra_min; + /* + * Scan backward and check the near @ra_max pages. + * The count here determines ra_size. 
+ */ + read_lock_irq(&mapping->tree_lock); + index = radix_tree_lookup_head(&mapping->page_tree, offset, ra_max); + read_unlock_irq(&mapping->tree_lock); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (index <= offset) { + WARN_ON(!find_page(mapping, index)); + if (index + ra_max > offset) + WARN_ON(find_page(mapping, index - 1)); + } else { + BUG_ON(index > offset + 1); + WARN_ON(find_page(mapping, offset)); } +#endif - radix_tree_cache_init(&cache); - read_lock_irq(&mapping->tree_lock); + *remain = offset - index + 1; - /* check the far end first */ - index = offset - nr_lookback; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, index); - if (page_refcnt(page) >= refcnt) { - step = 1 + nr_lookback / 3; - if(nr_lookback > ra_min * 8) { - count = 1; - goto check_more; - } else { - *remain = nr_lookback; - goto out_unlock; - } + if (unlikely(*remain <= ra_min)) { + count = ra_min; + goto out; } - /* scan backward for non-present page */ - count = 0; /* just to make gcc happy */ - for(step = ra_min; step < nr_lookback; step *= 4) { - index = offset - step; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index); - if (!page) - goto check_more; - } - index = offset - nr_lookback; - page = NULL; + count = count_cache_hit(mapping, index, offset); + if (count < ra_min) + count = ra_min; + if (unlikely(count * 2 < offset - index)) + goto out; - /* scan forward and check some more pages */ -check_more: - for(;;) { - if (page && !*remain) - *remain = offset - index; - if (page_refcnt(page) < refcnt) { - count = 0; - step = (offset - index + 3) / 4; - } else if (++count >= 3 || step < ra_min) - break; - index += step; - if (index >= offset) + if (*remain < ra_max) + goto out; + + /* + * Check the far pages coarsely. + * The big count here helps increase la_size. + */ + nr_lookback = ra_max * (LOOKAHEAD_RATIO + 1) * + 100 / (readahead_ratio + 1); + if (nr_lookback > offset) + nr_lookback = offset; + + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (step = 2 * ra_max; step < nr_lookback; step += ra_max) { + struct radix_tree_node *node; + node = radix_tree_cache_lookup_node(&mapping->page_tree, + &cache, offset - step, 1); + if (!node) break; - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index); +#ifdef DEBUG_READAHEAD_RADIXTREE + if (node != radix_tree_lookup_node(&mapping->page_tree, + offset - step, 1)) { + read_unlock_irq(&mapping->tree_lock); + printk(KERN_ERR "check radix_tree_cache_lookup_node!\n"); + return 1; + } +#endif } -out_unlock: read_unlock_irq(&mapping->tree_lock); - count = 3 * step; - if (count > nr_lookback) - return nr_lookback; - - if (!*remain) - *remain = count; - - count = count * readahead_ratio / 100; - if (count < get_min_readahead(NULL)) - count = get_min_readahead(NULL); + /* + * For sequential read that extends from index 0, the counted value + * may well be far under the true threshold, so return it unmodified + * for further process in adjust_rala_accelerated(). + */ + if (step < offset) + count = step * readahead_ratio / 100; + else + count = offset; +out: return count; } /* - * Scan forward in inactive_list for the first non-present page. - * It takes advantage of the adjacency of pages in inactive_list. + * Scan backward in the file for the first non-present page. 
*/ -static unsigned long lru_scan_forward(struct page *page, int nr_pages) +static inline unsigned long first_absent_page_bw(struct address_space *mapping, + unsigned long index, unsigned long max_scan) { - unsigned long index = page->index; - struct address_space *mapping = page_mapping(page); - struct zone *zone; - - for(;;) { - zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - - if (!PageLRU(page)) - goto out; + struct radix_tree_cache cache; + struct page *page; + unsigned long origin; - do { + origin = index; + if (max_scan > index) + max_scan = index; + radix_tree_cache_init(&cache); + read_lock_irq(&mapping->tree_lock); + for (;;) { + page = radix_tree_cache_lookup(&mapping->page_tree, + &cache, --index); + if (page) { index++; - if (!--nr_pages) - goto out; - page = next_page(page); - } while (page_mapping(page) == mapping && page->index == index); - - spin_unlock_irq(&zone->lru_lock); - - page = find_page(mapping, index); - if (!page) - return index; + break; + } + if (origin - index > max_scan) + break; } -out: - spin_unlock_irq(&zone->lru_lock); - return nr_pages ? index : 0; + read_unlock_irq(&mapping->tree_lock); + + return index; } -/* Directly calling lru_scan_forward() would be slow. - * This function tries to avoid unnecessary scans for the most common cases: - * - Slow reads should scan forward directly; - * - Fast reads should step backward first; - * - Aggressive reads may well have max allowed look-ahead size. - */ -static unsigned long first_absent_page(struct address_space *mapping, - struct page *page, unsigned long index, - unsigned long ra_size, unsigned long ra_max) +/* + * Scan forward in the file for the first non-present page. + */ +static inline unsigned long first_absent_page(struct address_space *mapping, + unsigned long index, unsigned long max_scan) { - struct radix_tree_cache cache; - - if (ra_size < ra_max) - goto scan_forward; + unsigned long ra_index; - radix_tree_cache_init(&cache); read_lock_irq(&mapping->tree_lock); + ra_index = radix_tree_lookup_tail(&mapping->page_tree, + index + 1, max_scan); + read_unlock_irq(&mapping->tree_lock); - if (ra_size < LOOKAHEAD_RATIO * ra_max) - goto scan_backward; - - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_max); - if (page) { - read_unlock_irq(&mapping->tree_lock); - return 0; - } - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_max - 1); - if (page) { - read_unlock_irq(&mapping->tree_lock); - return index + ra_max; +#ifdef DEBUG_READAHEAD_RADIXTREE + BUG_ON(ra_index <= index); + if (index + max_scan > index) { + if (ra_index <= index + max_scan) + WARN_ON(find_page(mapping, ra_index)); + WARN_ON(!find_page(mapping, ra_index - 1)); } +#endif -scan_backward: - if (ra_size == index) - ra_size /= 4; + if (ra_index <= index + max_scan) + return ra_index; else - ra_size /= (LOOKAHEAD_RATIO * 2); - for(;; ra_size /= 2) { - page = radix_tree_cache_lookup(&mapping->page_tree, &cache, - index + ra_size); - if (page) - break; - if (!ra_size) - return index + 1; - } - read_unlock_irq(&mapping->tree_lock); - ra_size = ra_max; - -scan_forward: - return lru_scan_forward(page, ra_size + 1); + return 0; } /* @@ -1350,7 +1526,7 @@ static inline int adjust_rala_accelerate return 1; } -/* +/* * Main function for page context based read-ahead. 
*/ static inline int @@ -1364,50 +1540,44 @@ try_context_based_readahead(struct addre unsigned long ra_size; unsigned long la_size; unsigned long remain_pages; - unsigned long ret; - int refcnt; - /* NFSv3 daemons may process adjecent requests in parallel, + /* Where to start read-ahead? + * NFSv3 daemons may process adjecent requests in parallel, * leading to many locally disordered, globally sequential reads. - * So do not require nearby history pages to be accessed, present is - * enough. + * So do not require nearby history pages to be present or accessed. */ - if (!prev_page) - return 0; - - refcnt = page_refcnt(prev_page); - if (refcnt < PAGE_REFCNT_1) - refcnt = PAGE_REFCNT_1; - - ra_size = count_sequential_pages(mapping, refcnt, - &remain_pages, index, ra_min, ra_max); - - /* Where to start read-ahead? */ - if (!page) - ra_index = index; - else { - ra_index = first_absent_page( - mapping, page, index, ra_size, ra_max); + if (page) { + ra_index = first_absent_page(mapping, index, ra_max * 5 / 4); if (unlikely(!ra_index)) return -1; - } + } else if (!prev_page) { + ra_index = first_absent_page_bw(mapping, index, ra_min); + if (index - ra_index > ra_min) + return 0; + ra_min += index - ra_index; + index = ra_index; + } else + ra_index = index; + + ra_size = query_page_cache(mapping, &remain_pages, + index - 1, ra_min, ra_max); la_size = ra_index - index; - if (readahead_ratio < 80 && + if (readahead_ratio < VM_READAHEAD_PROTECT_RATIO && remain_pages <= la_size && la_size > 1) { rescue_pages(page, la_size); return -1; } if (ra_size == index) { - ret = adjust_rala_accelerated(ra_max, &ra_size, &la_size); + if (!adjust_rala_accelerated(ra_max, &ra_size, &la_size)) + return -1; set_ra_class(ra, RA_CLASS_CONTEXT_ACCELERATED); } else { - ret = adjust_rala(ra_max, &ra_size, &la_size); + if (!adjust_rala(ra_max, &ra_size, &la_size)) + return -1; set_ra_class(ra, RA_CLASS_CONTEXT); } - if (unlikely(!ret)) - return -1; ra_state_init(ra, index, ra_index); ra_state_update(ra, ra_size, la_size); @@ -1455,31 +1625,33 @@ newfile_readahead(struct address_space * */ static inline int try_read_backward(struct file_ra_state *ra, - unsigned long first_index, unsigned long last_index, - unsigned long ra_size, unsigned long ra_max) + unsigned long begin_index, unsigned long end_index, + unsigned long ra_size, + unsigned long ra_min, unsigned long ra_max) { - if (ra_size > ra_max) + if (ra_size > ra_max || end_index > ra->prev_page) return 0; if (ra_has_index(ra, ra->prev_page)) { + if (end_index > ra->la_index) + return 0; ra_size += 2 * ra_cache_hit(ra, 0); - last_index = ra->la_index; + end_index = ra->la_index; } else { - ra_size = 4 * ra_size; - last_index = ra->prev_page; + ra_size += ra_min; + end_index = ra->prev_page; } if (ra_size > ra_max) ra_size = ra_max; - if (last_index < first_index || - last_index > first_index + ra_size) + if (end_index > begin_index + ra_size) return 0; - first_index = last_index - ra_size; + begin_index = end_index - ra_size; set_ra_class(ra, RA_CLASS_BACKWARD); - ra_state_init(ra, first_index, first_index); + ra_state_init(ra, begin_index, begin_index); ra_state_update(ra, ra_size, 0); return 1; @@ -1488,7 +1660,7 @@ try_read_backward(struct file_ra_state * /* * If there is a previous sequential read, it is likely to be another * sequential read at the new position. - * Databases are known to have the seek-and-read-one-record pattern. + * Databases are known to have this seek-and-read-one-record pattern. 
*/ static inline int try_random_readahead(struct file_ra_state *ra, unsigned long index, @@ -1502,17 +1674,23 @@ try_random_readahead(struct file_ra_stat if (!ra_has_index(ra, ra->prev_page)) return 0; - if (index == ra->prev_page + 1) /* read after thrashing */ + if (index == ra->prev_page + 1) { /* read after thrashing */ ra_size = hit0; - else if (ra_size < hit1 && /* read after seeking */ + set_ra_class(ra, RA_CLASS_RANDOM_THRASHING); + ra_account(ra, RA_EVENT_READAHEAD_THRASHING, + ra->readahead_index - index); + } else if (ra_size < hit1 && /* read after seeking */ hit1 > hit2 / 2 && hit2 > hit3 / 2 && - hit3 > hit1 / 2) - ra_size = min(ra_max, hit1); - else + hit3 > hit1 / 2) { + ra_size = max(hit1, hit2); + set_ra_class(ra, RA_CLASS_RANDOM_SEEK); + } else return 0; - set_ra_class(ra, RA_CLASS_RANDOM); + if (ra_size > ra_max) + ra_size = ra_max; + ra_state_init(ra, index, index); ra_state_update(ra, ra_size, 0); @@ -1522,13 +1700,12 @@ try_random_readahead(struct file_ra_stat /* * ra_size is mainly determined by: * 1. sequential-start: min(KB(16 + mem_mb/16), KB(64)) - * 2. sequential-max: min(KB(64 + mem_mb*64), KB(2048)) + * 2. sequential-max: min(ra->ra_pages, KB(262140)) * 3. sequential: (thrashing-threshold) * readahead_ratio / 100 * * Table of concrete numbers for 4KB page size: * (inactive + free) (in MB): 4 8 16 32 64 128 256 512 1024 * initial ra_size (in KB): 16 16 16 16 20 24 32 48 64 - * max ra_size (in KB): 320 576 1088 2048 2048 2048 2048 2048 2048 */ static inline void get_readahead_bounds(struct file_ra_state *ra, unsigned long *ra_min, @@ -1538,31 +1715,54 @@ static inline void get_readahead_bounds( #define KB(size) (((size) * 1024) / PAGE_CACHE_SIZE) mem_mb = nr_free_inactive() * PAGE_CACHE_SIZE / 1024 / 1024; - *ra_max = min(min(KB(64 + mem_mb*64), KB(2048)), ra->ra_pages); + *ra_max = min(ra->ra_pages, KB(262140)); *ra_min = min(min(KB(VM_MIN_READAHEAD + mem_mb/16), KB(128)), *ra_max/2); #undef KB } -/* +/* + * Set a new look-ahead mark at @new_index. + */ +void renew_lookahead(struct address_space *mapping, + struct file_ra_state *ra, + unsigned long index, unsigned long new_index) +{ + struct page *page; + + if (index == ra->lookahead_index && + new_index >= ra->readahead_index) + return; + + page = find_page(mapping, new_index); + if (!page) + return; + + SetPageReadahead(page); + if (ra->lookahead_index == index) + ra->lookahead_index = new_index; +} + +/* * This is the entry point of the adaptive read-ahead logic. * * It is only called on two conditions: * 1. page == NULL * A cache miss happened, it can be either a random read or a sequential one. - * 2. page != NULL + * 2. page != NULL * There is a look-ahead mark(PG_readahead) from a previous sequential read. * It's time to do some checking and submit the next read-ahead IO. * - * That makes both methods happy, and lives in harmony with application managed - * read-aheads via fadvise() / madvise(). The cache hit problem is also - * eliminated naturally. + * That has the merits of: + * - makes all stateful/stateless methods happy; + * - eliminates the cache hit problem naturally; + * - lives in harmony with application managed read-aheads via fadvise/madvise. 
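
For a concrete feel of the bounds above, get_readahead_bounds() can be restated in user space. A small sketch assuming 4KB pages, with mem_mb standing for the (inactive + free) memory in megabytes as in the table; readahead_bounds() is illustrative, not the kernel function, and the call below assumes the common 128KB default window (ra_pages = 32) on a box with 1GB of inactive+free memory:

#include <stdio.h>

#define PAGE_CACHE_SIZE	4096UL
#define KB(size)	(((size) * 1024) / PAGE_CACHE_SIZE)
#define MIN(a, b)	((a) < (b) ? (a) : (b))

/* Illustrative restatement of get_readahead_bounds(). */
static void readahead_bounds(unsigned long ra_pages, unsigned long mem_mb,
			     unsigned long *ra_min, unsigned long *ra_max)
{
	*ra_max = MIN(ra_pages, KB(262140));
	*ra_min = MIN(MIN(KB(16 + mem_mb / 16), KB(128)), *ra_max / 2);
}

int main(void)
{
	unsigned long ra_min, ra_max;

	readahead_bounds(32, 1024, &ra_min, &ra_max);
	/* 1GB box: ra_min = 16 pages (64KB), ra_max = 32 pages (128KB) */
	printf("ra_min=%lu ra_max=%lu\n", ra_min, ra_max);
	return 0;
}
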
*/ unsigned long page_cache_readahead_adaptive(struct address_space *mapping, struct file_ra_state *ra, struct file *filp, struct page *prev_page, struct page *page, - unsigned long first_index, - unsigned long index, unsigned long last_index) + unsigned long begin_index, + unsigned long index, unsigned long end_index) { unsigned long size; unsigned long ra_min; @@ -1572,16 +1772,24 @@ page_cache_readahead_adaptive(struct add if (page) { if(!TestClearPageReadahead(page)) return 0; - if (bdi_read_congested(mapping->backing_dev_info)) + if (bdi_read_congested(mapping->backing_dev_info)) { + ra_account(ra, RA_EVENT_IO_CONGESTION, + end_index - index); return 0; + } + if (laptop_mode && laptop_spinned_down()) { + renew_lookahead(mapping, ra, index, index + 32); + return 0; + } } if (page) - ra_account(readahead_return, ra->flags & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_LOOKAHEAD_HIT, + ra->readahead_index - ra->lookahead_index); else if (index) - inc_page_state(cache_miss); + ra_account(ra, RA_EVENT_CACHE_MISS, end_index - begin_index); - size = last_index - index; + size = end_index - index; get_readahead_bounds(ra, &ra_min, &ra_max); /* readahead disabled? */ @@ -1594,7 +1802,7 @@ page_cache_readahead_adaptive(struct add * Start of file. */ if (index == 0) - return newfile_readahead(mapping, filp, ra, last_index, ra_min); + return newfile_readahead(mapping, filp, ra, end_index, ra_min); /* * State based sequential read-ahead. @@ -1603,20 +1811,18 @@ page_cache_readahead_adaptive(struct add index == ra->lookahead_index && (page || index == ra->readahead_index) && (ra_cache_hit_ok(ra) || - last_index - first_index >= ra_max)) + end_index - begin_index >= ra_max)) return state_based_readahead(mapping, filp, ra, page, ra_max); /* * Backward read-ahead. */ - if (try_read_backward(ra, first_index, last_index, size, ra_max)) + if (try_read_backward(ra, begin_index, end_index, size, ra_min, ra_max)) return ra_dispatch(ra, mapping, filp); - /* + /* * Context based sequential read-ahead. - */ - if (!prev_page) - prev_page = find_page(mapping, index - 1); + */ ret = try_context_based_readahead(mapping, ra, prev_page, page, index, ra_min, ra_max); if (ret > 0) @@ -1624,7 +1830,7 @@ page_cache_readahead_adaptive(struct add if (ret < 0) return 0; - /* No action on look ahead time ? */ + /* No action on look ahead time? */ if (page) return 0; @@ -1643,12 +1849,10 @@ page_cache_readahead_adaptive(struct add readit: size = __do_page_cache_readahead(mapping, filp, index, size, 0); - inc_page_state(readrandom); - mod_page_state(pgreadrandom, size); - + ra_account(ra, RA_EVENT_READRANDOM, size); dprintk("readrandom(ino=%lu, pages=%lu, index=%lu-%lu-%lu) = %lu\n", mapping->host->i_ino, mapping->nrpages, - first_index, index, last_index, size); + begin_index, index, end_index, size); return size; } @@ -1663,148 +1867,99 @@ void fastcall ra_access(struct file_ra_s (1 << PG_referenced))) return; - if (!ra_has_index(ra, page->index)) + if (ra_has_index(ra, page->index)) { + if (PageLocked(page)) + ra_account(ra, RA_EVENT_IO_BLOCK, + ra->readahead_index - page->index); + } else { + if (PageLocked(page)) + ra_account(0, RA_EVENT_IO_BLOCK, 1); return; + } ra->cache_hit++; if (page->index >= ra->ra_index) - ra_account(pgreadahead_hit, ra->flags & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_READAHEAD_HIT, 1); else - ra_account(pgreadahead_hit, - (ra->flags >> RA_CLASS_SHIFT) & RA_CLASS_MASK, 1); + ra_account(ra, RA_EVENT_READAHEAD_HIT, -1); } /* - * Detect and protect sequential read-ahead pages. 
- *
- * The safty guarantee provided by this function is only needed in file servers
- * with big readahead_ratio set.
+ * Detect and protect live read-ahead pages.
+ *
+ * This function provides a safety guarantee for file servers with big
+ * readahead_ratio(>=VM_READAHEAD_PROTECT_RATIO) set. The goal is to save all
+ * and only the sequential pages that are to be accessed in the near future.
 *
 * This function is called when pages in @page_list are to be freed,
- * it protects ra pages by moving them into @save_list.
+ * it protects live read-ahead pages by moving them into @save_list.
 *
 * The general idea is to classify pages of a file into random pages and groups
- * of sequential accessed pages. Random pages and leading segments of
- * sequential pages are left over, following sequential pages are saved.
+ * of sequential accessed pages. Random pages and dead sequential pages are
+ * left over, live sequential pages are saved.
+ *
+ * Live read-ahead pages are defined as sequential pages that have reading in
+ * progress. They are detected by a reference count pattern of:
+ *
+ *                        live head       live pages
+ * ra pages group -->  ------------___________________
+ *                              [ pages to save ]  (*)
 *
- * The algorithm must ensure:
+ * (*) for now, an extra page from the live head may also be saved.
+ *
+ * In practice, the group of pages is fragmented into chunks. To tell whether
+ * pages inside a chunk are alive, we must check:
+ *	1) Are there any live heads inside the chunk?
+ *	2) Are there any live heads in the group before the chunk?
+ *	3) Special case: a live head just sits on the boundary of the current chunk?
+ *
+ * The detailed rules employed must ensure:
 * - no page is pinned in inactive_list.
 * - no excessive pages are saved.
 *
- * chunk	- a list of pages belong to the same file
- * rs/ra pages	- a chunk of pages that was read/to be read sequentially
- *		  Detected by ascending index and (almost) non-descending
- *		  reference count. rs pages have greater reference count than
- *		  following ra pages. A page can be both rs/ra page, which
- *		  indicates there are two adjacent readers.
- * live ra pages - ra pages that have reading in progress
- *		  Detected by having leading rs pages(either in page_list or in
- *		  inactive_list), or limited ra pages(may be in another zone,
- *		  just had their rs pages dropped).
- * dead ra pages - ra pages that seems to have no imminent reader
- *		  Note that they are not necessarily dead: either the cost of
- *		  search the leading rs pages or the cost of keeping them in
- *		  memory is large, so they are abandoned.
- *		  Leading rs pages are detected and handled the same way.
- *
- * Live ra pages are saved, pure/leading rs pages and dead ra pages are left
- * over and eligible for free.
- * - * The rules apply to the following common cases: - * keep head back search chunk case - * Y ----____________|______________________ Normal - * ----------------|----__________________ Normal - * |----__________________ Normal + * A picture of common cases: + * back search chunk case + * -----___________|[____________________] Normal + * ----------------|----[________________] Normal + * |----[________________] Normal * ----------------|---------------------- Normal * |---------------------- Normal - * y ________________|______________________ cache miss - * |______________________ cache miss - * y ________________|_______--------_______ two readers - * Y ----____________|_______--------_______ two readers - * |_______--------_______ two readers - * |----_____------_______ two readers - * ----------------|----_____------_______ two readers - * _______---------|---------------_______ two readers - * Y ----___---------|---------------_______ two readers - * ________________|---------------_______ two readers - * Y ----____________|---------------_______ two readers - * Y ====------------|----__________________ two readers - * N |====-----------_______ two readers - * N |###======------------- three readers - * Y: saved by leading rs pages - * y: saved by limited leading ra pages - * N: to be activated anyway - * - * To make it run smooth and fast, ra request boundary must be reserved: - * - alloc pages of a chunk from one single zone - * - insert pages into lru at one time - * - make vmscan code aware of chunk boundaries - * - * Read backward pattern support is possible, in which case the pages are - * better pushed into lru in reverse order. + * ________________|______________________ ra miss + * |______________________ ra miss + * ________________|_______--------[_____] two readers + * ----____________|[______--------______] two readers + * |_______--------[_____] two readers + * |----[____------______] two readers + * ----------------|----[____------______] two readers + * _______---------|---------------[_____] two readers + * ----___---------|[--------------______] two readers + * ________________|---------------[_____] two readers + * ----____________|[--------------______] two readers + * ====------------|[---_________________] two readers + * |====[----------______] two readers + * |###======[-----------] three readers + * + * Read backward pattern support is possible, in which case the pages should be + * pushed into inactive_list in reverse order. + * + * The two special cases are awkwardly delt with for now. They will be all set + * when the timing information of recently evicted pages are available. + * Dead pages can also be purged earlier with the timing info. 
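
The reference count pattern described above boils down to a small scan: walk a chunk's refcounts, and the first drop that follows a flat run marks the live head; pages from there on are worth saving. A stand-alone sketch of that rule with hypothetical refcount values (find_live_head() condenses the n-counting in rescue_ra_pages() further below and is not the kernel code itself):

#include <stdio.h>

/*
 * Illustrative only: return the index of the live head in one chunk, or -1
 * if no live head is found.  The live head is taken as the last page of the
 * flat high-refcount run, so one extra page may be saved, as noted above.
 */
static int find_live_head(const unsigned int *refcnt, int nr_pages)
{
	int i, flat_run = 0;

	for (i = 1; i < nr_pages; i++) {
		if (refcnt[i] == refcnt[i - 1])
			flat_run++;		/* reading front extends */
		else if (refcnt[i] > refcnt[i - 1])
			flat_run = 0;		/* still ascending */
		else if (flat_run >= 1)
			return i - 1;		/* drop after a flat run: live head */
		else
			return -1;		/* immediate drop: treated as dead here */
	}
	return -1;				/* no drop found: no live head */
}

int main(void)
{
	/* a reading front of refcount 3 followed by untouched readahead pages */
	unsigned int chunk[] = { 3, 3, 3, 0, 0, 0, 0, 0 };
	int n = (int)(sizeof(chunk) / sizeof(chunk[0]));

	printf("live head at index %d\n", find_live_head(chunk, n));	/* -> 2 */
	return 0;
}
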
*/ -int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +static int save_chunk(struct page *head, struct page *live_head, + struct page *tail, struct list_head *save_list) { - struct address_space *mapping; - struct page *chunk_head; struct page *page; - unsigned long refcnt; - unsigned long index; - int ascend_count; - int ret = 0; - - page = list_to_page(page_list); - -next_chunk: - chunk_head = page; - mapping = page_mapping(page); - ascend_count = 0; - -next_page: - index = page->index; - refcnt = __page_refcnt(page); - page = next_page(page); - - if (&page->lru == page_list) - goto save_chunk; - - if (mapping == page_mapping(page) && page->index > index) { - if (refcnt < __page_refcnt(page)) - ascend_count++; - goto next_page; - } - -save_chunk: - if (mapping && !PageSwapCache(page) && - !page_mapped(page) && - ascend_count <= 3 && - (!refcnt || index >= chunk_head->index + 8)) - ret += save_chunk(chunk_head, page, save_list); - - if (&page->lru != page_list) - goto next_chunk; - - if (ret) - mod_page_state(pgreadahead_rescue, ret); - - return ret; -} - -int save_chunk(struct page *head, struct page *tail, - struct list_head *save_list) -{ - struct page *page; - struct page *next_page; - struct address_space *mapping = page_mapping(head); + struct address_space *mapping; struct radix_tree_cache cache; int i; - int keep_head; - unsigned long index = head->index; - unsigned long refcnt = __page_refcnt(head); + unsigned long index; + unsigned long refcnt; + #ifdef DEBUG_READAHEAD static char static_buf[PAGE_SIZE]; - static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; + static char *zone_names[] = {"DMA", "DMA32", "Normal", "HighMem"}; char *pat = static_buf; unsigned long pidx = PAGE_SIZE / 2; @@ -1815,63 +1970,113 @@ int save_chunk(struct page *head, struct } #endif - /* The leading pages are going to be activated anyway? */ - keep_head = 0; - if (refcnt > PAGE_REFCNT_1) - goto drop_head; - if (refcnt >= PAGE_REFCNT_1 && mapping_mapped(mapping)) - goto drop_head; - - /* Scan backward to see if leading pages should be saved. */ +#define LIVE_PAGE_SCAN (4 * MAX_RA_PAGES) + index = head->index; + refcnt = page_refcnt(head); + mapping = head->mapping; radix_tree_cache_init(&cache); + + BUG_ON(!mapping); /* QUESTION: in what case mapping will be NULL ? */ read_lock_irq(&mapping->tree_lock); - for (i = 2 * mapping->backing_dev_info->ra_pages; i >= 0; i--) { + + /* + * Common case test: + * Does the far end indicates a leading live head? + */ + index = radix_tree_lookup_head(&mapping->page_tree, + index, LIVE_PAGE_SCAN); + page = __find_page(mapping, index); + if (cold_page_refcnt(page) > refcnt) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) { + pat[--pidx] = '.'; + pat[--pidx] = '.'; + pat[--pidx] = '.'; + pat[--pidx] = page_refcnt_symbol(page); + pat[--pidx] = '|'; + } +#endif + live_head = head; + goto skip_scan_locked; + } + + /* + * Special case 1: + * If @head is a live head, rescue_ra_pages() will not detect it. + * Check it here. 
+ */ + index = head->index; + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, --index); + if (!page || PageActive(page)) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) + pat[--pidx] = page_refcnt_symbol(page); +#endif + goto skip_scan_locked; + } + if (refcnt > page_refcnt(next_page(head)) && + page_refcnt(page) > page_refcnt(next_page(head))) { +#ifdef DEBUG_READAHEAD + if ((readahead_ratio & 3) == 3) + pat[--pidx] = page_refcnt_symbol(page); +#endif + live_head = head; + goto skip_scan_locked; + } + + /* + * Scan backward to see if the whole chunk should be saved. + * It can be costly. But can be made rare in future. + */ + for (i = LIVE_PAGE_SCAN; i >= 0; i--) { page = radix_tree_cache_lookup(&mapping->page_tree, &cache, --index); #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3 && pidx) pat[--pidx] = page_refcnt_symbol(page); #endif - /* Having limited leading ra pages is required now. It will - * be less important if ra request boundaries are reserved. - */ - if (!page) { - if (i > mapping->backing_dev_info->ra_pages && - index != head->index - 1 && - !__page_refcnt(head)) - keep_head = 1; + + if (!page) break; - } /* Avoid being pinned by active page. */ - if (PageActive(page)) + if (unlikely(PageActive(page))) break; - /* A trick to speed things up, must be placed after the - * active page test. This check may be removed when chunk - * boundaries are reserved. - */ - if ((index & 63) == 63 && !__page_refcnt(head) && - i > mapping->backing_dev_info->ra_pages && - radix_tree_cache_count(&cache) < - index - radix_tree_cache_first_index(&cache)) { -#ifdef DEBUG_READAHEAD - if ((readahead_ratio & 3) == 3 && pidx) - pat[--pidx] = '|'; -#endif - keep_head = 1; + if (page_refcnt(page) > refcnt) { /* So we are alive! */ + live_head = head; break; } - if (__page_refcnt(page) > refcnt) { /* so they are live pages */ - keep_head = 1; - break; - } - refcnt = __page_refcnt(page); + refcnt = page_refcnt(page); + } + +skip_scan_locked: + /* + * Special case 2: + * Save one extra page if it is a live head of the following chunk. + * Just to be safe. It protects the rare situation when the reader + * is just crossing the chunk boundary, and the following chunk is not + * far away from tail of inactive_list. + */ + if (live_head != head) { + struct page *last_page = prev_page(tail); + page = radix_tree_cache_lookup(&mapping->page_tree, &cache, + last_page->index + 1); + if (page && !live_head) { + refcnt = page_refcnt(last_page); + if (page_refcnt(page) >= refcnt) + page = radix_tree_cache_lookup( + &mapping->page_tree, &cache, + last_page->index + 2); + if (page && page_refcnt(page) < refcnt) + live_head = last_page; + } else if (!page && live_head) + live_head = next_page(live_head); } + read_unlock_irq(&mapping->tree_lock); -drop_head: #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3) { for (i = 0; pidx < PAGE_SIZE / 2;) @@ -1879,48 +2084,50 @@ drop_head: pat[i++] = '|'; for (page = head; page != tail; page = next_page(page)) { pidx = page->index; + if (page == live_head) + pat[i++] = '['; pat[i++] = page_refcnt_symbol(page); - if (i >= PAGE_SIZE - 1) + BUG_ON(PageAnon(page)); + BUG_ON(PageSwapCache(page)); + /* BUG_ON(page_mapped(page)); */ + if (i >= PAGE_SIZE - 2) break; } + if (live_head) + pat[i++] = ']'; pat[i] = 0; pat[PAGE_SIZE - 1] = 0; } #endif - /* Drop non-descending leading pages. */ - page = head; - if (!keep_head) { - refcnt = __page_refcnt(page); - while (page != tail && /* never dereference tail! 
*/ - refcnt <= __page_refcnt(page)) { - refcnt = __page_refcnt(page); - page = next_page(page); + /* + * Now save the alive pages. + */ + i = 0; + if (live_head) { + for (; live_head != tail;) { /* never dereference tail! */ + page = next_page(live_head); + if (!PageActivate(live_head)) { + if (!page_refcnt(live_head)) + __get_cpu_var(smooth_aging)++; + i++; + list_move(&live_head->lru, save_list); + } + live_head = page; } - } - /* Save the remaining pages. */ - for (i = 0; page != tail;) { - next_page = next_page(page); - if (!PageActivate(page)) { - i++; - list_move(&page->lru, save_list); - } - page = next_page; + if (i) + ra_account(0, RA_EVENT_READAHEAD_RESCUE, i); } - if (i) - inc_page_state(readahead_rescue); - #ifdef DEBUG_READAHEAD if ((readahead_ratio & 3) == 3) { ddprintk("save_chunk(ino=%lu, idx=%lu-%lu-%lu, %s@%s:%s)" - " %s, save %d\n", + " = %d\n", mapping->host->i_ino, index, head->index, pidx, mapping_mapped(mapping) ? "mmap" : "file", - zone_names[page_zonenum(head)], pat, - keep_head ? "keephead" : "drophead", i); + zone_names[page_zonenum(head)], pat, i); if (pat != static_buf) free_page((unsigned long)pat); } @@ -1928,3 +2135,69 @@ drop_head: return i; } + +int rescue_ra_pages(struct list_head *page_list, struct list_head *save_list) +{ + struct address_space *mapping; + struct page *chunk_head; + struct page *live_head; + struct page *page; + unsigned long refcnt; + int n; + int ret = 0; + + page = list_to_page(page_list); + +next_chunk: + chunk_head = page; + live_head = NULL; + mapping = page->mapping; + n = 0; + +next_rs_page: + refcnt = page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + if (refcnt == page_refcnt(page)) + n++; + else if (refcnt < page_refcnt(page)) + n = 0; + else if (n < 1) + n = INT_MIN; + else + goto got_live_head; + + goto next_rs_page; + +got_live_head: + n = 0; + live_head = prev_page(page); + +next_page: + if (refcnt < page_refcnt(page)) + n++; + refcnt = page_refcnt(page); + page = next_page(page); + + if (mapping != page->mapping || &page->lru == page_list) + goto save_chunk; + + goto next_page; + +save_chunk: + if (mapping && !PageAnon(chunk_head) && + !PageSwapCache(chunk_head) && + /* !page_mapped(chunk_head) && */ + n <= 3 && + (!refcnt || + prev_page(page)->index >= chunk_head->index + 5)) + ret += save_chunk(chunk_head, live_head, page, save_list); + + if (&page->lru != page_list) + goto next_chunk; + + return ret; +} diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/swap.c linux-2.6.14-ck2/mm/swap.c --- linux-2.6.14-ck1/mm/swap.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/swap.c 2005-11-03 00:02:46.000000000 +1100 @@ -29,7 +29,6 @@ #include #include #include -#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -96,8 +95,6 @@ int rotate_reclaimable_page(struct page return 0; } -extern int readahead_ratio; - /* * FIXME: speed this up? */ @@ -115,25 +112,23 @@ void fastcall activate_page(struct page spin_unlock_irq(&zone->lru_lock); } +DECLARE_PER_CPU(unsigned long, smooth_aging); + /* * Mark a page as having seen activity. 
* * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced + * inactive,referenced -> activate,unreferenced + * activate,unreferenced -> activate,referenced */ void fastcall mark_page_accessed(struct page *page) { - if (!PageActive(page) && !PageActivate(page) && - PageReferenced(page) && PageLRU(page)) { - if (readahead_ratio > 9 || (readahead_ratio & 1)) { - page_zone(page)->nr_page_aging++; - SetPageActivate(page); - } else - activate_page(page); + if (!PageActivate(page) && PageReferenced(page) && PageLRU(page)) { + SetPageActivate(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); + __get_cpu_var(smooth_aging)++; } } @@ -306,7 +301,6 @@ void __pagevec_lru_add(struct pagevec *p if (zone) spin_unlock_irq(&zone->lru_lock); zone = pagezone; - update_page_age(zone); spin_lock_irq(&zone->lru_lock); } if (TestSetPageLRU(page)) diff -Naurp --exclude-from=/home/con/kernel/dontdiff linux-2.6.14-ck1/mm/vmscan.c linux-2.6.14-ck2/mm/vmscan.c --- linux-2.6.14-ck1/mm/vmscan.c 2005-11-03 00:02:32.000000000 +1100 +++ linux-2.6.14-ck2/mm/vmscan.c 2005-11-03 00:02:46.000000000 +1100 @@ -373,6 +373,7 @@ static pageout_t pageout(struct page *pa } extern int readahead_ratio; +DECLARE_PER_CPU(unsigned long, smooth_aging); /* * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed @@ -382,12 +383,13 @@ static int shrink_list(struct list_head LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; + int pgkeep = 0; int reclaimed = 0; cond_resched(); - if (readahead_ratio >= 80) - rescue_ra_pages(page_list, &ret_pages); + if (readahead_ratio >= VM_READAHEAD_PROTECT_RATIO) + pgkeep += rescue_ra_pages(page_list, &ret_pages); pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { @@ -416,6 +418,7 @@ static int shrink_list(struct list_head if (PageActivate(page)) { ClearPageActivate(page); + ClearPageReferenced(page); goto activate_locked; } @@ -423,6 +426,8 @@ static int shrink_list(struct list_head /* In active use or really unfreeable? Activate it. 
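
Taken together with the PageActivate handling in shrink_list() above, the transitions amount to a deferred-activation scheme: the first access only sets the referenced bit, the second flags the page for activation, and the actual move to the active list waits until reclaim scans the page. A compact model of those transitions, with plain flags standing in for the real page bits (page_model and both helpers are illustrative, not kernel code):

#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for the page flags involved. */
struct page_model {
	bool referenced;	/* PG_referenced */
	bool activate;		/* PG_activate   */
	bool active;		/* on active list */
};

/* Models mark_page_accessed() as patched above: activation is only
 * requested here; it happens later, at reclaim time. */
static void mark_accessed(struct page_model *p)
{
	if (!p->activate && p->referenced) {
		p->activate = true;
		p->referenced = false;
	} else if (!p->referenced) {
		p->referenced = true;
	}
}

/* Models the PageActivate check in shrink_list(): the deferred move. */
static void reclaim_visit(struct page_model *p)
{
	if (p->activate) {
		p->activate = false;
		p->referenced = false;
		p->active = true;	/* goto activate_locked */
	}
}

int main(void)
{
	struct page_model p = { false, false, false };

	mark_accessed(&p);	/* inactive,unreferenced -> inactive,referenced   */
	mark_accessed(&p);	/* inactive,referenced   -> activate,unreferenced */
	reclaim_visit(&p);	/* activated only once reclaim scans it */
	printf("active=%d\n", p.active);
	return 0;
}
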
*/ if (referenced && page_mapping_inuse(page)) goto activate_locked; + if (!referenced) + __get_cpu_var(smooth_aging)++; #ifdef CONFIG_SWAP /* @@ -564,11 +569,13 @@ keep_locked: keep: list_add(&page->lru, &ret_pages); BUG_ON(PageLRU(page)); + pgkeep++; } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); + mod_page_state(pgkeepcold, pgkeep - pgactivate); sc->nr_reclaimed += reclaimed; return reclaimed; } @@ -652,6 +659,7 @@ static void shrink_cache(struct zone *zo goto done; max_scan -= nr_scan; + update_page_age(zone, nr_scan); if (current_is_kswapd()) mod_page_state_zone(zone, pgscan_kswapd, nr_scan); else @@ -774,6 +782,7 @@ refill_inactive_zone(struct zone *zone, list_add(&page->lru, &l_active); continue; } + __get_cpu_var(smooth_aging)++; } list_add(&page->lru, &l_inactive); } @@ -792,7 +801,6 @@ refill_inactive_zone(struct zone *zone, pgmoved++; if (!pagevec_add(&pvec, page)) { zone->nr_inactive += pgmoved; - zone->nr_page_aging += pgmoved; spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -802,7 +810,6 @@ refill_inactive_zone(struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - zone->nr_page_aging += pgmoved; zone->nr_inactive += pgmoved; pgdeactivate += pgmoved; if (buffer_heads_over_limit) { @@ -834,6 +841,7 @@ refill_inactive_zone(struct zone *zone, mod_page_state_zone(zone, pgrefill, pgscanned); mod_page_state(pgdeactivate, pgdeactivate); + mod_page_state(pgkeephot, pgmoved); } /* @@ -885,13 +893,46 @@ shrink_zone(struct zone *zone, struct sc } } - update_page_age(zone); throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); } /* + * Helper functions to adjust nice level of kswapd, based on the priority of + * the task (p) that called it. If it is already higher priority we do not + * demote its nice level since it is still working on behalf of a higher + * priority task. With kernel threads we leave it at nice 0. + * + * We don't ever run kswapd real time, so if a real time task calls kswapd we + * set it to highest SCHED_NORMAL priority. + */ +static int effective_sc_prio(struct task_struct *p) +{ + if (likely(p->mm)) { + if (rt_task(p)) + return -20; + if (batch_task(p)) + return 19; + return task_nice(p); + } + return 0; +} + +static void set_kswapd_nice(task_t *kswapd, task_t *p, int active) +{ + long nice = effective_sc_prio(p); + + if (task_nice(kswapd) > nice || !active) + set_user_nice(kswapd, nice); +} + +static int sc_priority(struct task_struct *p) +{ + return (DEF_PRIORITY + (DEF_PRIORITY * effective_sc_prio(p) / 40)); +} + +/* * This is the direct reclaim path, for page-allocating processes. We only * try to reclaim pages from zones which will satisfy the caller's allocation * request. @@ -945,7 +986,8 @@ shrink_caches(struct zone **zones, struc * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
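
The arithmetic in sc_priority() deserves a worked example. With DEF_PRIORITY at its 2.6.14 value of 12, a nice +19 batch caller starts reclaim at priority 12 + 12*19/40 = 17 and so begins with very light scanning, while a real-time caller is treated as nice -20 and starts at 12 - 6 = 6, scanning far more aggressively from the first pass. A tiny sketch of that mapping (the effective nice values are assumed inputs, not read from a real task):

#include <stdio.h>

#define DEF_PRIORITY	12	/* as in 2.6.14 mm/vmscan.c */

/* Illustrative mapping from a caller's effective nice value (-20..19)
 * to the starting scan priority used by try_to_free_pages()/kswapd. */
static int scan_priority_for(int effective_nice)
{
	return DEF_PRIORITY + DEF_PRIORITY * effective_nice / 40;
}

int main(void)
{
	printf("rt caller     -> %d\n", scan_priority_for(-20));	/* 6  */
	printf("nice 0 caller -> %d\n", scan_priority_for(0));		/* 12 */
	printf("batch caller  -> %d\n", scan_priority_for(19));		/* 17 */
	return 0;
}
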
*/ -int try_to_free_pages(struct zone **zones, unsigned int gfp_mask) +int try_to_free_pages(struct zone **zones, unsigned int gfp_mask, + struct task_struct *p) { int priority; int ret = 0; @@ -953,7 +995,10 @@ int try_to_free_pages(struct zone **zone struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; unsigned long lru_pages = 0; - int i; + int i, scan_priority = DEF_PRIORITY; + + if (p) + scan_priority = sc_priority(p); delay_prefetch(); @@ -969,11 +1014,11 @@ int try_to_free_pages(struct zone **zone if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = DEF_PRIORITY; + zone->temp_priority = scan_priority; lru_pages += zone->nr_active + zone->nr_inactive; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; sc.nr_reclaimed = 0; @@ -1005,7 +1050,7 @@ int try_to_free_pages(struct zone **zone } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc.nr_scanned && priority < scan_priority - 2) blk_congestion_wait(WRITE, HZ/10); } out: @@ -1048,13 +1093,15 @@ out: static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) { int to_free = nr_pages; - int all_zones_ok; + int all_zones_ok = 0; int priority; - int i; + int i, scan_priority; int total_scanned, total_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; + scan_priority = sc_priority(pgdat->kswapd); + loop_again: total_scanned = 0; total_reclaimed = 0; @@ -1068,10 +1115,10 @@ loop_again: for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; - zone->temp_priority = DEF_PRIORITY; + zone->temp_priority = scan_priority; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (priority = scan_priority; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ int begin_zone = -1; unsigned long lru_pages = 0; @@ -1087,13 +1134,11 @@ loop_again: struct zone *zone = pgdat->node_zones + i; unsigned long watermark; - update_page_age(zone); - if (zone->present_pages == 0) continue; if (zone->all_unreclaimable && - priority != DEF_PRIORITY) + priority != scan_priority) continue; /* @@ -1103,7 +1148,7 @@ loop_again: */ watermark = zone->pages_high + (zone->pages_high * priority / - DEF_PRIORITY); + scan_priority); if (!zone_watermark_ok(zone, order, watermark, 0, 0, 0)) { @@ -1155,13 +1200,13 @@ loop_again: if (zone->present_pages == 0) continue; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != scan_priority) continue; if (nr_pages == 0) { /* Not software suspend */ unsigned long watermark = zone->pages_high + (zone->pages_high * priority / - DEF_PRIORITY); + scan_priority); if (!zone_watermark_ok(zone, order, watermark, end_zone, 0, 0)) all_zones_ok = 0; @@ -1204,7 +1249,7 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) + if (total_scanned && priority < scan_priority - 2) blk_congestion_wait(WRITE, HZ/10); /* @@ -1294,6 +1339,7 @@ static int kswapd(void *p) */ order = new_order; } else { + set_user_nice(tsk, 0); schedule(); order = pgdat->kswapd_max_order; } @@ -1307,21 +1353,27 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. 
*/ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, struct task_struct *p) { pg_data_t *pgdat; + int active; if (zone->present_pages == 0) return; pgdat = zone->zone_pgdat; + if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) return; + if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) return; - if (!waitqueue_active(&pgdat->kswapd_wait)) + + active = waitqueue_active(&pgdat->kswapd_wait); + set_kswapd_nice(pgdat->kswapd, p, active); + if (!active) return; wake_up_interruptible(&pgdat->kswapd_wait); }
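
The wakeup path above ties the pieces together: an idle kswapd adopts the waker's niceness, a busy kswapd is only ever made more urgent (numerically lower nice), never lazier, and kswapd resets itself to nice 0 before going back to sleep (the set_user_nice(tsk, 0) in kswapd() above). A short model of that hand-off (wake_kswapd_model() is illustrative; its condition mirrors set_kswapd_nice()):

#include <stdio.h>

/* Current nice level of the modelled kswapd thread. */
static int kswapd_nice = 0;

/* Models set_kswapd_nice(): take the waker's nice level if kswapd is
 * idle, or if the waker is more urgent than kswapd's current level. */
static void wake_kswapd_model(int waker_nice, int active)
{
	if (kswapd_nice > waker_nice || !active)
		kswapd_nice = waker_nice;
}

int main(void)
{
	wake_kswapd_model(19, 0);	/* idle, nice +19 waker -> 19 */
	printf("after batch wakeup:   %d\n", kswapd_nice);

	wake_kswapd_model(0, 1);	/* busy, nice 0 waker   -> 0  */
	printf("after normal wakeup:  %d\n", kswapd_nice);

	wake_kswapd_model(19, 1);	/* busy, nice +19 waker -> stays 0 */
	printf("after another batch wakeup: %d\n", kswapd_nice);

	kswapd_nice = 0;		/* kswapd sleeps: reset to nice 0 */
	return 0;
}
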