--- linux-2.6-npiggin/fs/buffer.c | 2 linux-2.6-npiggin/fs/dcache.c | 118 ++ linux-2.6-npiggin/fs/dquot.c | 6 linux-2.6-npiggin/fs/exec.c | 3 linux-2.6-npiggin/fs/fs-writeback.c | 7 linux-2.6-npiggin/fs/hfs/inode.c | 23 linux-2.6-npiggin/fs/hfsplus/inode.c | 23 linux-2.6-npiggin/fs/inode.c | 92 +- linux-2.6-npiggin/fs/proc/array.c | 5 linux-2.6-npiggin/include/asm-arm/system.h | 30 linux-2.6-npiggin/include/asm-ia64/system.h | 10 linux-2.6-npiggin/include/asm-mips/system.h | 10 linux-2.6-npiggin/include/asm-s390/system.h | 5 linux-2.6-npiggin/include/asm-sparc/system.h | 4 linux-2.6-npiggin/include/asm-sparc64/system.h | 14 linux-2.6-npiggin/include/linux/dcache.h | 3 linux-2.6-npiggin/include/linux/fs.h | 5 linux-2.6-npiggin/include/linux/gfp.h | 1 linux-2.6-npiggin/include/linux/init_task.h | 6 linux-2.6-npiggin/include/linux/mm.h | 19 linux-2.6-npiggin/include/linux/mm_inline.h | 34 linux-2.6-npiggin/include/linux/mmzone.h | 33 linux-2.6-npiggin/include/linux/page-flags.h | 58 - linux-2.6-npiggin/include/linux/rmap.h | 4 linux-2.6-npiggin/include/linux/sched.h | 21 linux-2.6-npiggin/include/linux/swap.h | 13 linux-2.6-npiggin/include/linux/sysctl.h | 2 linux-2.6-npiggin/include/linux/writeback.h | 2 linux-2.6-npiggin/kernel/sched.c | 1063 +++++++++---------------- linux-2.6-npiggin/kernel/sysctl.c | 43 - linux-2.6-npiggin/mm/filemap.c | 6 linux-2.6-npiggin/mm/hugetlb.c | 9 linux-2.6-npiggin/mm/memory.c | 7 linux-2.6-npiggin/mm/oom_kill.c | 7 linux-2.6-npiggin/mm/page-writeback.c | 3 linux-2.6-npiggin/mm/page_alloc.c | 101 +- linux-2.6-npiggin/mm/rmap.c | 49 - linux-2.6-npiggin/mm/shmem.c | 6 linux-2.6-npiggin/mm/swap.c | 89 -- linux-2.6-npiggin/mm/swap_state.c | 3 linux-2.6-npiggin/mm/swapfile.c | 6 linux-2.6-npiggin/mm/vmscan.c | 712 +++++++++------- 42 files changed, 1312 insertions(+), 1345 deletions(-) diff -puN fs/buffer.c~rollup fs/buffer.c --- linux-2.6/fs/buffer.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/buffer.c 2004-08-20 18:15:23.000000000 +1000 @@ -594,7 +594,7 @@ static void free_more_memory(void) for_each_pgdat(pgdat) { zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS, 0); + try_to_free_pages(zones, GFP_NOFS, 0, 0); } } diff -puN fs/dcache.c~rollup fs/dcache.c --- linux-2.6/fs/dcache.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/dcache.c 2004-08-20 18:15:23.000000000 +1000 @@ -34,7 +34,7 @@ /* #define DCACHE_DEBUG 1 */ -int sysctl_vfs_cache_pressure = 100; +int sysctl_vfs_cache_cost = 16; spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; @@ -60,6 +60,7 @@ static unsigned int d_hash_mask; static unsigned int d_hash_shift; static struct hlist_head *dentry_hashtable; static LIST_HEAD(dentry_unused); +static int zone_shrinker; /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { @@ -86,6 +87,22 @@ static void d_free(struct dentry *dentry call_rcu(&dentry->d_rcu, d_callback); } +static void dentry_add_lru(struct dentry *dentry) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker); + list_add(&dentry->d_lru, &zs->lru); + zs->nr++; +} + +static void dentry_del_lru(struct dentry *dentry) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker); + list_del(&dentry->d_lru); + zs->nr--; +} + /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. 
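The hunks above replace the single global unused-dentry accounting with a per-zone scheme: every dentry is charged to the zone its memory lives in via get_zone_shrinker(page_zone(virt_to_page(dentry))), and the per-zone count later feeds shrink_dcache_memory()'s "zs->nr / sysctl_vfs_cache_cost" estimate of how much is reclaimable. Below is a minimal standalone sketch of that bookkeeping, not patch code: the list helpers are cut-down stand-ins for <linux/list.h>, and the zone_shrinker is passed in directly instead of being looked up from the dentry's page.

/*
 * Minimal userspace sketch of the per-zone dentry accounting introduced
 * above.  Simplifications: tiny list helpers instead of <linux/list.h>,
 * and the zone_shrinker is a parameter rather than the result of
 * get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker).
 */
#include <stdio.h>

struct list_head {
	struct list_head *prev, *next;
};

static void INIT_LIST_HEAD(struct list_head *head)
{
	head->prev = head->next = head;
}

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

/* the two fields of struct zone_shrinker that the add/del paths touch */
struct zone_shrinker {
	struct list_head lru;		/* this zone's LRU of dentries */
	unsigned long nr;		/* how many are on it */
};

struct dentry {
	struct list_head d_lru;
};

static int sysctl_vfs_cache_cost = 16;	/* replaces sysctl_vfs_cache_pressure */

static void dentry_add_lru(struct zone_shrinker *zs, struct dentry *dentry)
{
	list_add(&dentry->d_lru, &zs->lru);
	zs->nr++;
}

static void dentry_del_lru(struct zone_shrinker *zs, struct dentry *dentry)
{
	list_del(&dentry->d_lru);
	zs->nr--;
}

/* what shrink_dcache_memory() reports as reclaimable for this zone */
static long shrink_report(struct zone_shrinker *zs)
{
	return zs->nr / sysctl_vfs_cache_cost;
}

int main(void)
{
	struct zone_shrinker zs = { .nr = 0 };
	struct dentry d[32];
	int i;

	INIT_LIST_HEAD(&zs.lru);
	for (i = 0; i < 32; i++)
		dentry_add_lru(&zs, &d[i]);	/* d_alloc() side */
	dentry_del_lru(&zs, &d[0]);		/* dput()/kill_it side */
	printf("%lu dentries on this zone's LRU, %ld reported reclaimable\n",
			zs.nr, shrink_report(&zs));
	return 0;
}

With 31 dentries left on the zone's LRU and the new default cost of 16 this prints one reclaimable unit, which is the granularity the zone shrinker hands back to the scanner.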
@@ -155,7 +172,7 @@ repeat: spin_unlock(&dcache_lock); return; } - + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -166,9 +183,9 @@ repeat: /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); + dentry->d_flags |= DCACHE_REFERENCED; + if (list_empty(&dentry->d_unused)) { + list_add(&dentry->d_unused, &dentry_unused); dentry_stat.nr_unused++; } spin_unlock(&dentry->d_lock); @@ -181,11 +198,12 @@ unhash_it: kill_it: { struct dentry *parent; - /* If dentry was on d_lru list + /* If dentry was on d_unused list * delete it from there */ - if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); + dentry_del_lru(dentry); + if (!list_empty(&dentry->d_unused)) { + list_del(&dentry->d_unused); dentry_stat.nr_unused--; } list_del(&dentry->d_child); @@ -263,9 +281,9 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - if (!list_empty(&dentry->d_lru)) { + if (!list_empty(&dentry->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&dentry->d_lru); + list_del_init(&dentry->d_unused); } return dentry; } @@ -350,6 +368,7 @@ static inline void prune_one_dentry(stru { struct dentry * parent; + dentry_del_lru(dentry); __d_drop(dentry); list_del(&dentry->d_child); dentry_stat.nr_dentry--; /* For d_free, below */ @@ -392,7 +411,39 @@ static void prune_dcache(int count) list_del_init(tmp); prefetch(dentry_unused.prev); dentry_stat.nr_unused--; + dentry = list_entry(tmp, struct dentry, d_unused); + + spin_lock(&dentry->d_lock); + /* + * We found an inuse dentry which was not removed from + * dentry_unused because of laziness during lookup. Do not free + * it - just keep it off the dentry_unused list. + */ + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + continue; + } + if (dentry->d_flags & DCACHE_REFERENCED) + dentry->d_flags &= ~DCACHE_REFERENCED; + prune_one_dentry(dentry); + } + spin_unlock(&dcache_lock); +} + +static void prune_dcache_lru(struct list_head *list, unsigned long count) +{ + spin_lock(&dcache_lock); + for (; count ; count--) { + struct dentry *dentry; + struct list_head *tmp; + + tmp = list->prev; + if (tmp == list) + break; + list_del(tmp); + prefetch(list->prev); dentry = list_entry(tmp, struct dentry, d_lru); + list_add(&dentry->d_lru, list); spin_lock(&dentry->d_lock); /* @@ -401,22 +452,27 @@ static void prune_dcache(int count) * it - just keep it off the dentry_unused list. */ if (atomic_read(&dentry->d_count)) { + if (!list_empty(&dentry->d_unused)) { + list_del_init(&dentry->d_unused); + dentry_stat.nr_unused--; + } spin_unlock(&dentry->d_lock); continue; } /* If the dentry was recently referenced, don't free it. */ if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; spin_unlock(&dentry->d_lock); continue; } + list_del_init(&dentry->d_unused); + dentry_stat.nr_unused--; prune_one_dentry(dentry); } spin_unlock(&dcache_lock); } + /* * Shrink the dcache for the specified super block. 
* This allows us to unmount a device without disturbing @@ -453,7 +509,7 @@ void shrink_dcache_sb(struct super_block while (next != &dentry_unused) { tmp = next; next = tmp->next; - dentry = list_entry(tmp, struct dentry, d_lru); + dentry = list_entry(tmp, struct dentry, d_unused); if (dentry->d_sb != sb) continue; list_del(tmp); @@ -468,7 +524,7 @@ repeat: while (next != &dentry_unused) { tmp = next; next = tmp->next; - dentry = list_entry(tmp, struct dentry, d_lru); + dentry = list_entry(tmp, struct dentry, d_unused); if (dentry->d_sb != sb) continue; dentry_stat.nr_unused--; @@ -558,16 +614,16 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; - if (!list_empty(&dentry->d_lru)) { + if (!list_empty(&dentry->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&dentry->d_lru); + list_del_init(&dentry->d_unused); } /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add(&dentry->d_unused, dentry_unused.prev); dentry_stat.nr_unused++; found++; } @@ -633,9 +689,9 @@ void shrink_dcache_anon(struct hlist_hea spin_lock(&dcache_lock); hlist_for_each(lp, head) { struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { + if (!list_empty(&this->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&this->d_lru); + list_del_init(&this->d_unused); } /* @@ -643,7 +699,7 @@ void shrink_dcache_anon(struct hlist_hea * of the unused list for prune_dcache */ if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); + list_add_tail(&this->d_unused, &dentry_unused); dentry_stat.nr_unused++; found++; } @@ -665,14 +721,16 @@ void shrink_dcache_anon(struct hlist_hea * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, unsigned int gfp_mask) +static long shrink_dcache_memory(struct zone_shrinker *zs, + unsigned long nr, + unsigned int gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr); + prune_dcache_lru(&zs->lru, nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return zs->nr / sysctl_vfs_cache_cost; } /** @@ -702,7 +760,7 @@ struct dentry *d_alloc(struct dentry * p } } else { dname = dentry->d_iname; - } + } dentry->d_name.name = dname; dentry->d_name.len = name->len; @@ -723,6 +781,7 @@ struct dentry *d_alloc(struct dentry * p dentry->d_bucket = NULL; INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_unused); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_LIST_HEAD(&dentry->d_alias); @@ -734,6 +793,7 @@ struct dentry *d_alloc(struct dentry * p } spin_lock(&dcache_lock); + dentry_add_lru(dentry); if (parent) list_add(&dentry->d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; @@ -838,7 +898,7 @@ struct dentry * d_alloc_anon(struct inod return NULL; tmp->d_parent = tmp; /* make sure dput doesn't croak */ - + spin_lock(&dcache_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { /* A directory can only have one dentry. 
@@ -976,7 +1036,7 @@ struct dentry * __d_lookup(struct dentry struct hlist_node *node; rcu_read_lock(); - + hlist_for_each_rcu(node, head) { struct dentry *dentry; struct qstr *qstr; @@ -1597,8 +1657,10 @@ static void __init dcache_init(unsigned 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, NULL, NULL); - - set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); + + zone_shrinker = set_zone_shrinker(shrink_dcache_memory, DEFAULT_SEEKS); + if (zone_shrinker < 0) + BUG(); } /* SLAB cache for __getname() consumers */ diff -puN fs/dquot.c~rollup fs/dquot.c --- linux-2.6/fs/dquot.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/dquot.c 2004-08-20 18:15:23.000000000 +1000 @@ -115,7 +115,7 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is following: - * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * i_sem > dqonoff_sem > iprune_rwsem > journal_lock > dqptr_sem > * > dquot->dq_lock > dqio_sem * i_sem on quota files is special (it's below dqio_sem) */ @@ -734,11 +734,11 @@ static void drop_dquot_ref(struct super_ /* We need to be guarded against prune_icache to reach all the * inodes - otherwise some can be on the local list of prune_icache */ - down(&iprune_sem); + down_write(&iprune_rwsem); down_write(&sb_dqopt(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); up_write(&sb_dqopt(sb)->dqptr_sem); - up(&iprune_sem); + up_write(&iprune_rwsem); put_dquot_list(&tofree_head); } diff -puN fs/exec.c~rollup fs/exec.c --- linux-2.6/fs/exec.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/exec.c 2004-08-20 18:15:22.000000000 +1000 @@ -321,7 +321,8 @@ void install_arg_page(struct vm_area_str goto out; } mm->rss++; - lru_cache_add_active(page); + lru_cache_add(page); + mark_page_accessed(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); diff -puN fs/fs-writeback.c~rollup fs/fs-writeback.c --- linux-2.6/fs/fs-writeback.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/fs-writeback.c 2004-08-20 18:15:23.000000000 +1000 @@ -225,8 +225,7 @@ __sync_single_inode(struct inode *inode, /* * The inode is clean, unused */ - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + inode_add_unused(inode); } } wake_up_inode(inode); @@ -457,9 +456,7 @@ void sync_inodes_sb(struct super_block * unsigned long nr_dirty = read_page_state(nr_dirty); unsigned long nr_unstable = read_page_state(nr_unstable); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; + wbc.nr_to_write = nr_dirty + nr_unstable + inodes_stat.nr_inodes; wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ spin_lock(&inode_lock); sync_sb_inodes(sb, &wbc); diff -puN fs/hfs/inode.c~rollup fs/hfs/inode.c --- linux-2.6/fs/hfs/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/hfs/inode.c 2004-08-20 18:15:22.000000000 +1000 @@ -67,19 +67,20 @@ int hfs_releasepage(struct page *page, i nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); - if (!node) - ; - else if (atomic_read(&node->refcnt)) - res = 0; - else for (i = 0; i < tree->pages_per_bnode; i++) { - if (PageActive(node->page[i])) { + if (node) { + if (atomic_read(&node->refcnt)) res = 0; - break; + else for (i = 0; i < tree->pages_per_bnode; i++) { + if (PageActiveMapped(node->page[i]) || + 
PageActiveUnmapped(node->page[i])) { + res = 0; + break; + } + } + if (res) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); } - } - if (res && node) { - hfs_bnode_unhash(node); - hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { diff -puN fs/hfsplus/inode.c~rollup fs/hfsplus/inode.c --- linux-2.6/fs/hfsplus/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/hfsplus/inode.c 2004-08-20 18:15:22.000000000 +1000 @@ -67,19 +67,20 @@ int hfsplus_releasepage(struct page *pag nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); - if (!node) - ; - else if (atomic_read(&node->refcnt)) - res = 0; - else for (i = 0; i < tree->pages_per_bnode; i++) { - if (PageActive(node->page[i])) { + if (node) { + if (atomic_read(&node->refcnt)) res = 0; - break; + else for (i = 0; i < tree->pages_per_bnode; i++) { + if (PageActiveMapped(node->page[i]) || + PageActiveUnmapped(node->page[i])) { + res = 0; + break; + } + } + if (res) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); } - } - if (res && node) { - hfs_bnode_unhash(node); - hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { diff -puN fs/inode.c~rollup fs/inode.c --- linux-2.6/fs/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/inode.c 2004-08-20 18:15:23.000000000 +1000 @@ -69,9 +69,8 @@ static unsigned int i_hash_shift; * A "dirty" list is maintained for each super block, * allowing for low-overhead inode sync() operations. */ - +static int zone_shrinker; LIST_HEAD(inode_in_use); -LIST_HEAD(inode_unused); static struct hlist_head *inode_hashtable; /* @@ -91,7 +90,7 @@ EXPORT_SYMBOL(inode_lock); * from its final dispose_list, the struct super_block they refer to * (for inode->i_sb->s_op) may already have been freed and reused. */ -DECLARE_MUTEX(iprune_sem); +DECLARE_RWSEM(iprune_rwsem); /* * Statistics gathering.. 
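Note the lock conversion just above: iprune_sem becomes iprune_rwsem, so the pruning path (prune_icache_lru(), further down in this file's diff) only needs the lock shared and several zones can be pruned concurrently, while invalidate_inodes() and drop_dquot_ref() keep full exclusion with down_write(). A small userspace model of that pattern follows; pthread rwlocks stand in for the kernel rwsem and the function names are borrowed purely for illustration (build with -lpthread).

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iprune_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* several of these may run at once, one per zone being shrunk */
static void *prune_icache_lru(void *zone)
{
	pthread_rwlock_rdlock(&iprune_rwsem);	/* was: down(&iprune_sem) */
	printf("pruning zone %ld\n", (long)zone);
	pthread_rwlock_unlock(&iprune_rwsem);
	return NULL;
}

/* still fully serialised against all pruners */
static void invalidate_inodes(void)
{
	pthread_rwlock_wrlock(&iprune_rwsem);	/* was: down(&iprune_sem) */
	printf("invalidate_inodes runs alone\n");
	pthread_rwlock_unlock(&iprune_rwsem);
}

int main(void)
{
	pthread_t t[2];
	long i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, prune_icache_lru, (void *)i);
	for (i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	invalidate_inodes();
	return 0;
}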
@@ -220,6 +219,24 @@ static void init_once(void * foo, kmem_c inode_init_once(inode); } +void inode_add_unused(struct inode *inode) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(inode)), zone_shrinker); + list_add(&inode->i_list, &zs->lru); + zs->nr++; +} + +static void inode_del_unused(struct inode *inode) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(inode)), zone_shrinker); + list_del_init(&inode->i_list); + BUG_ON(zs->nr == 0); + zs->nr--; +} + + /* * inode_lock must be held */ @@ -230,9 +247,10 @@ void __iget(struct inode * inode) return; } atomic_inc(&inode->i_count); - if (!(inode->i_state & (I_DIRTY|I_LOCK))) - list_move(&inode->i_list, &inode_in_use); - inodes_stat.nr_unused--; + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + inode_del_unused(inode); + list_add(&inode->i_list, &inode_in_use); + } } EXPORT_SYMBOL(__iget); @@ -302,7 +320,7 @@ static void dispose_list(struct list_hea static int invalidate_list(struct list_head *head, struct list_head *dispose) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0; next = head->next; for (;;) { @@ -317,15 +335,13 @@ static int invalidate_list(struct list_h if (!atomic_read(&inode->i_count)) { hlist_del_init(&inode->i_hash); list_del(&inode->i_sb_list); - list_move(&inode->i_list, dispose); + inode_del_unused(inode); + list_add(&inode->i_list, dispose); inode->i_state |= I_FREEING; - count++; continue; } busy = 1; } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; return busy; } @@ -350,13 +366,13 @@ int invalidate_inodes(struct super_block int busy; LIST_HEAD(throw_away); - down(&iprune_sem); + down_write(&iprune_rwsem); spin_lock(&inode_lock); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); - up(&iprune_sem); + up_write(&iprune_rwsem); return busy; } @@ -416,25 +432,26 @@ static int can_unuse(struct inode *inode * If the inode has metadata buffers attached to mapping->private_list then * try to remove them. 
*/ -static void prune_icache(int nr_to_scan) +static void prune_icache_lru(struct list_head *list, unsigned long nr_to_scan) { LIST_HEAD(freeable); - int nr_pruned = 0; int nr_scanned; unsigned long reap = 0; - down(&iprune_sem); + down_read(&iprune_rwsem); spin_lock(&inode_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; + struct list_head *tmp; - if (list_empty(&inode_unused)) + tmp = list->prev; + if (tmp == list) break; - - inode = list_entry(inode_unused.prev, struct inode, i_list); + prefetch(tmp->prev); + inode = list_entry(tmp, struct inode, i_list); if (inode->i_state || atomic_read(&inode->i_count)) { - list_move(&inode->i_list, &inode_unused); + list_move(&inode->i_list, list); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { @@ -445,7 +462,7 @@ static void prune_icache(int nr_to_scan) iput(inode); spin_lock(&inode_lock); - if (inode != list_entry(inode_unused.next, + if (inode != list_entry(list->prev, struct inode, i_list)) continue; /* wrong inode or list_empty */ if (!can_unuse(inode)) @@ -453,15 +470,14 @@ static void prune_icache(int nr_to_scan) } hlist_del_init(&inode->i_hash); list_del_init(&inode->i_sb_list); - list_move(&inode->i_list, &freeable); + inode_del_unused(inode); + list_add(&inode->i_list, &freeable); inode->i_state |= I_FREEING; - nr_pruned++; } - inodes_stat.nr_unused -= nr_pruned; spin_unlock(&inode_lock); dispose_list(&freeable); - up(&iprune_sem); + up_read(&iprune_rwsem); if (current_is_kswapd()) mod_page_state(kswapd_inodesteal, reap); @@ -478,7 +494,9 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, unsigned int gfp_mask) +static long shrink_icache_memory(struct zone_shrinker *zs, + unsigned long nr, + unsigned int gfp_mask) { if (nr) { /* @@ -488,9 +506,9 @@ static int shrink_icache_memory(int nr, */ if (!(gfp_mask & __GFP_FS)) return -1; - prune_icache(nr); + prune_icache_lru(&zs->lru, nr); } - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return zs->nr / sysctl_vfs_cache_cost; } static void __wait_on_freeing_inode(struct inode *inode); @@ -1033,18 +1051,20 @@ void generic_forget_inode(struct inode * struct super_block *sb = inode->i_sb; if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_LOCK))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + inode_add_unused(inode); + } + spin_unlock(&inode_lock); if (!sb || (sb->s_flags & MS_ACTIVE)) return; write_inode_now(inode, 1); spin_lock(&inode_lock); - inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); - } - list_del_init(&inode->i_list); + inode_del_unused(inode); + } else + list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; @@ -1369,7 +1389,9 @@ void __init inode_init(unsigned long mem /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, SLAB_PANIC, init_once, NULL); - set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); + zone_shrinker = set_zone_shrinker(shrink_icache_memory, DEFAULT_SEEKS); + if (zone_shrinker < 0) + BUG(); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff -puN fs/proc/array.c~rollup fs/proc/array.c --- linux-2.6/fs/proc/array.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ 
linux-2.6-npiggin/fs/proc/array.c 2004-08-20 18:15:23.000000000 +1000 @@ -159,7 +159,8 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -167,7 +168,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->sleep_time, p->total_time, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, diff -puN include/asm-arm/system.h~rollup include/asm-arm/system.h --- linux-2.6/include/asm-arm/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-arm/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -137,34 +137,12 @@ extern unsigned int user_debug; #define set_wmb(var, value) do { var = value; wmb(); } while (0) #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t"); -#ifdef CONFIG_SMP /* - * Define our own context switch locking. This allows us to enable - * interrupts over the context switch, otherwise we end up with high - * interrupt latency. The real problem area is switch_mm() which may - * do a full cache flush. + * switch_mm() may do a full cache flush over the context switch, + * so enable interrupts over the context switch to avoid high + * latency. */ -#define prepare_arch_switch(rq,next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock_irq(&(rq)->lock); \ -} while (0) - -#define finish_arch_switch(rq,prev) \ - spin_unlock(&(prev)->switch_lock) - -#define task_running(rq,p) \ - ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) -#else -/* - * Our UP-case is more simple, but we assume knowledge of how - * spin_unlock_irq() and friends are implemented. This avoids - * us needlessly decrementing and incrementing the preempt count. - */ -#define prepare_arch_switch(rq,next) local_irq_enable() -#define finish_arch_switch(rq,prev) spin_unlock(&(rq)->lock) -#define task_running(rq,p) ((rq)->curr == (p)) -#endif +#define __ARCH_WANT_INTERRUPTS_ON_CTXSW /* * switch_to(prev, next) should switch from task `prev' to `next' diff -puN include/asm-ia64/system.h~rollup include/asm-ia64/system.h --- linux-2.6/include/asm-ia64/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-ia64/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -183,8 +183,6 @@ do { \ #ifdef __KERNEL__ -#define prepare_to_switch() do { } while(0) - #ifdef CONFIG_IA32_SUPPORT # define IS_IA32_PROCESS(regs) (ia64_psr(regs)->is != 0) #else @@ -274,13 +272,7 @@ extern void ia64_load_extra (struct task * of that CPU which will not be released, because there we wait for the * tasklist_lock to become available. 
*/ -#define prepare_arch_switch(rq, next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ -} while (0) -#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) -#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) +#define __ARCH_WANT_UNLOCKED_CTXSW #define ia64_platform_is(x) (strcmp(x, platform_name) == 0) diff -puN include/asm-mips/system.h~rollup include/asm-mips/system.h --- linux-2.6/include/asm-mips/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-mips/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -488,15 +488,9 @@ static __inline__ int con_is_present(voi } /* - * Taken from include/asm-ia64/system.h; prevents deadlock on SMP + * See include/asm-ia64/system.h; prevents deadlock on SMP * systems. */ -#define prepare_arch_switch(rq, next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ -} while (0) -#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) -#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) +#define __ARCH_WANT_UNLOCKED_CTXSW #endif /* _ASM_SYSTEM_H */ diff -puN include/asm-s390/system.h~rollup include/asm-s390/system.h --- linux-2.6/include/asm-s390/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-s390/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -103,11 +103,8 @@ static inline void restore_access_regs(u prev = __switch_to(prev,next); \ } while (0) -#define prepare_arch_switch(rq, next) do { } while(0) -#define task_running(rq, p) ((rq)->curr == (p)) -#define finish_arch_switch(rq, prev) do { \ +#define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ - spin_unlock_irq(&(rq)->lock); \ } while (0) #define nop() __asm__ __volatile__ ("nop") diff -puN include/asm-sparc/system.h~rollup include/asm-sparc/system.h --- linux-2.6/include/asm-sparc/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-sparc/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -101,7 +101,7 @@ extern void fpsave(unsigned long *fpregs * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work) * XXX WTF is the above comment? Found in late teen 2.4.x. */ -#define prepare_arch_switch(rq, next) do { \ +#define prepare_arch_switch(next) do { \ __asm__ __volatile__( \ ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \ "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ @@ -109,8 +109,6 @@ extern void fpsave(unsigned long *fpregs "save %sp, -0x40, %sp\n\t" \ "restore; restore; restore; restore; restore; restore; restore"); \ } while(0) -#define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -#define task_running(rq, p) ((rq)->curr == (p)) /* Much care has gone into this code, do not touch it. 
* diff -puN include/asm-sparc64/system.h~rollup include/asm-sparc64/system.h --- linux-2.6/include/asm-sparc64/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-sparc64/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -139,19 +139,13 @@ extern void __flushw_user(void); #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_arch_switch(rq, next) \ -do { spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ +/* Don't hold the runqueue lock over context switch */ +#define __ARCH_WANT_UNLOCKED_CTXSW +#define prepare_arch_switch(next) \ +do { \ flushw_all(); \ } while (0) -#define finish_arch_switch(rq, prev) \ -do { spin_unlock_irq(&(prev)->switch_lock); \ -} while (0) - -#define task_running(rq, p) \ - ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) - /* See what happens when you design the chip correctly? * * We tell gcc we clobber all non-fixed-usage registers except diff -puN include/linux/dcache.h~rollup include/linux/dcache.h --- linux-2.6/include/linux/dcache.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/dcache.h 2004-08-20 18:15:23.000000000 +1000 @@ -95,6 +95,7 @@ struct dentry { struct qstr d_name; struct list_head d_lru; /* LRU list */ + struct list_head d_unused; /* unused list */ struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ @@ -313,7 +314,7 @@ static inline int d_mountpoint(struct de extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); -extern int sysctl_vfs_cache_pressure; +extern int sysctl_vfs_cache_cost; #endif /* __KERNEL__ */ diff -puN include/linux/fs.h~rollup include/linux/fs.h --- linux-2.6/include/linux/fs.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/fs.h 2004-08-20 18:15:23.000000000 +1000 @@ -56,8 +56,7 @@ extern struct files_stat_struct files_st struct inodes_stat_t { int nr_inodes; - int nr_unused; - int dummy[5]; + int dummy[6]; }; extern struct inodes_stat_t inodes_stat; @@ -1477,7 +1476,7 @@ extern void destroy_inode(struct inode * extern struct inode *new_inode(struct super_block *); extern int remove_suid(struct dentry *); extern void remove_dquot_ref(struct super_block *, int, struct list_head *); -extern struct semaphore iprune_sem; +extern struct rw_semaphore iprune_rwsem; extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); diff -puN include/linux/gfp.h~rollup include/linux/gfp.h --- linux-2.6/include/linux/gfp.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/gfp.h 2004-08-20 18:15:23.000000000 +1000 @@ -6,6 +6,7 @@ #include #include +extern int vm_free_local_harder; struct vm_area_struct; /* diff -puN include/linux/init_task.h~rollup include/linux/init_task.h --- linux-2.6/include/linux/init_task.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/init_task.h 2004-08-20 18:15:23.000000000 +1000 @@ -73,14 +73,13 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = 
HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -112,7 +111,6 @@ extern struct group_info init_groups; .blocked = {{0}}, \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ .proc_lock = SPIN_LOCK_UNLOCKED, \ - .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ .private_pages_count = 0, \ diff -puN include/linux/mm.h~rollup include/linux/mm.h --- linux-2.6/include/linux/mm.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mm.h 2004-08-20 18:15:23.000000000 +1000 @@ -615,6 +615,25 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); +struct zone_shrinker; +typedef long (*zone_shrinker_fn)(struct zone_shrinker *zs, + unsigned long nr_to_scan, + unsigned int gfp_mask); +struct zone_shrinker { + struct list_head lru; + unsigned long nr; + zone_shrinker_fn shrinker; + unsigned long nr_scan; + int seeks; + + int idx; + struct list_head list; +}; + +int set_zone_shrinker(zone_shrinker_fn, int); +struct zone_shrinker *get_zone_shrinker(struct zone *, int); +void remove_zone_shrinker(int); + /* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all diff -puN include/linux/mm_inline.h~rollup include/linux/mm_inline.h --- linux-2.6/include/linux/mm_inline.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mm_inline.h 2004-08-20 18:15:22.000000000 +1000 @@ -1,9 +1,16 @@ static inline void -add_page_to_active_list(struct zone *zone, struct page *page) +add_page_to_active_mapped_list(struct zone *zone, struct page *page) { - list_add(&page->lru, &zone->active_list); - zone->nr_active++; + list_add(&page->lru, &zone->active_mapped_list); + zone->nr_active_mapped++; +} + +static inline void +add_page_to_active_unmapped_list(struct zone *zone, struct page *page) +{ + list_add(&page->lru, &zone->active_unmapped_list); + zone->nr_active_unmapped++; } static inline void @@ -14,10 +21,17 @@ add_page_to_inactive_list(struct zone *z } static inline void -del_page_from_active_list(struct zone *zone, struct page *page) +del_page_from_active_mapped_list(struct zone *zone, struct page *page) +{ + list_del(&page->lru); + zone->nr_active_mapped--; +} + +static inline void +del_page_from_active_unmapped_list(struct zone *zone, struct page *page) { list_del(&page->lru); - zone->nr_active--; + zone->nr_active_unmapped--; } static inline void @@ -31,10 +45,14 @@ static inline void del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); - if (PageActive(page)) { - ClearPageActive(page); - zone->nr_active--; + if (PageActiveMapped(page)) { + ClearPageActiveMapped(page); + zone->nr_active_mapped--; + } else if (PageActiveUnmapped(page)) { + ClearPageActiveUnmapped(page); + zone->nr_active_unmapped--; } else { + ClearPageUsedOnce(page); zone->nr_inactive--; } } diff -puN include/linux/mmzone.h~rollup include/linux/mmzone.h --- linux-2.6/include/linux/mmzone.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mmzone.h 2004-08-20 18:15:23.000000000 +1000 @@ -130,36 +130,23 @@ struct zone { ZONE_PADDING(_pad1_) - spinlock_t lru_lock; - struct list_head active_list; + spinlock_t lru_lock; + struct list_head active_mapped_list; + struct list_head active_unmapped_list; struct list_head inactive_list; - unsigned 
long nr_scan_active; + unsigned long nr_scan_active_mapped; + unsigned long nr_scan_active_unmapped; unsigned long nr_scan_inactive; - unsigned long nr_active; + unsigned long nr_dirty_inactive; + unsigned long nr_active_mapped; + unsigned long nr_active_unmapped; unsigned long nr_inactive; int all_unreclaimable; /* All pages pinned */ unsigned long pages_scanned; /* since last reclaim */ - ZONE_PADDING(_pad2_) + struct list_head zone_shrinker_list; - /* - * prev_priority holds the scanning priority for this zone. It is - * defined as the scanning priority at which we achieved our reclaim - * target at the previous try_to_free_pages() or balance_pgdat() - * invokation. - * - * We use prev_priority as a measure of how much stress page reclaim is - * under - it drives the swappiness decision: whether to unmap mapped - * pages. - * - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. - * - * Access to both these fields is quite racy even on uniprocessor. But - * it is expected to average out OK. - */ - int temp_priority; - int prev_priority; + ZONE_PADDING(_pad2_) /* * free areas of different sizes diff -puN include/linux/page-flags.h~rollup include/linux/page-flags.h --- linux-2.6/include/linux/page-flags.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/page-flags.h 2004-08-20 18:15:22.000000000 +1000 @@ -58,22 +58,25 @@ #define PG_dirty 4 #define PG_lru 5 -#define PG_active 6 -#define PG_slab 7 /* slab debug (Suparna wants this) */ +#define PG_active_mapped 6 +#define PG_active_unmapped 7 -#define PG_highmem 8 -#define PG_checked 9 /* kill me in 2.5.. */ -#define PG_arch_1 10 -#define PG_reserved 11 - -#define PG_private 12 /* Has something at ->private */ -#define PG_writeback 13 /* Page is under writeback */ -#define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_compound 15 /* Part of a compound page */ - -#define PG_swapcache 16 /* Swap page: swp_entry_t in private */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ +#define PG_slab 8 /* slab debug (Suparna wants this) */ +#define PG_highmem 9 +#define PG_checked 10 /* kill me in 2.5.. 
*/ +#define PG_arch_1 11 + +#define PG_reserved 12 +#define PG_private 13 /* Has something at ->private */ +#define PG_writeback 14 /* Page is under writeback */ +#define PG_nosave 15 /* Used for system suspend/resume */ + +#define PG_compound 16 /* Part of a compound page */ +#define PG_swapcache 17 /* Swap page: swp_entry_t in private */ +#define PG_mappedtodisk 18 /* Has blocks allocated on-disk */ +#define PG_reclaim 19 /* To be reclaimed asap */ + +#define PG_usedonce 20 /* LRU page has been touched once */ /* @@ -97,10 +100,11 @@ struct page_state { unsigned long pgpgout; /* Disk writes */ unsigned long pswpin; /* swap reads */ unsigned long pswpout; /* swap writes */ - unsigned long pgalloc_high; /* page allocations */ + unsigned long pgalloc_high; /* page allocations */ unsigned long pgalloc_normal; unsigned long pgalloc_dma; + unsigned long pgalloc_remote; unsigned long pgfree; /* page freeings */ unsigned long pgactivate; /* pages moved inactive->active */ unsigned long pgdeactivate; /* pages moved active->inactive */ @@ -208,11 +212,17 @@ extern unsigned long __read_page_state(u #define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) #define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define PageActiveMapped(page) test_bit(PG_active_mapped, &(page)->flags) +#define SetPageActiveMapped(page) set_bit(PG_active_mapped, &(page)->flags) +#define ClearPageActiveMapped(page) clear_bit(PG_active_mapped, &(page)->flags) +#define TestClearPageActiveMapped(page) test_and_clear_bit(PG_active_mapped, &(page)->flags) +#define TestSetPageActiveMapped(page) test_and_set_bit(PG_active_mapped, &(page)->flags) + +#define PageActiveUnmapped(page) test_bit(PG_active_unmapped, &(page)->flags) +#define SetPageActiveUnmapped(page) set_bit(PG_active_unmapped, &(page)->flags) +#define ClearPageActiveUnmapped(page) clear_bit(PG_active_unmapped, &(page)->flags) +#define TestClearPageActiveUnmapped(page) test_and_clear_bit(PG_active_unmapped, &(page)->flags) +#define TestSetPageActiveUnmapped(page) test_and_set_bit(PG_active_unmapped, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) @@ -290,6 +300,12 @@ extern unsigned long __read_page_state(u #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageUsedOnce(page) test_bit(PG_usedonce, &(page)->flags) +#define SetPageUsedOnce(page) set_bit(PG_usedonce, &(page)->flags) +#define TestSetPageUsedOnce(page) test_and_set_bit(PG_usedonce, &(page)->flags) +#define ClearPageUsedOnce(page) clear_bit(PG_usedonce, &(page)->flags) +#define TestClearPageUsedOnce(page) test_and_clear_bit(PG_usedonce, &(page)->flags) + #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) #define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) diff -puN include/linux/rmap.h~rollup include/linux/rmap.h --- linux-2.6/include/linux/rmap.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/rmap.h 2004-08-20 18:15:21.000000000 +1000 @@ -88,7 +88,7 @@ static inline 
void page_dup_rmap(struct /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked); +void page_gather(struct page *, int is_locked, int *referenced, int *dirty); int try_to_unmap(struct page *); /* @@ -102,7 +102,7 @@ unsigned long page_address_in_vma(struct #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l) TestClearPageReferenced(page) +#define page_gather(page,l,r,d) TestClearPageReferenced(page) #define try_to_unmap(page) SWAP_FAIL #endif /* CONFIG_MMU */ diff -puN include/linux/sched.h~rollup include/linux/sched.h --- linux-2.6/include/linux/sched.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sched.h 2004-08-20 18:15:23.000000000 +1000 @@ -313,6 +313,11 @@ struct signal_struct { unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; }; +/* Context switch must be unlocked if interrupts are to be enabled */ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +# define __ARCH_WANT_UNLOCKED_CTXSW +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are @@ -329,7 +334,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) @@ -447,18 +452,22 @@ struct task_struct { int lock_depth; /* Lock depth */ +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + int oncpu; +#endif int prio, static_prio; struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; + /* Scheduler variables follow. kernel/sched.c */ + unsigned long array_sequence; unsigned long long timestamp; - int activated; + int used_slice; + + unsigned long total_time, sleep_time; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -566,8 +575,6 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; -/* context-switch lock */ - spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; diff -puN include/linux/swap.h~rollup include/linux/swap.h --- linux-2.6/include/linux/swap.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/swap.h 2004-08-20 18:15:22.000000000 +1000 @@ -164,17 +164,20 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); -extern void FASTCALL(lru_cache_add_active(struct page *)); -extern void FASTCALL(activate_page(struct page *)); -extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); +/* Mark a page as having seen activity. 
*/ +#define mark_page_accessed(page) \ +do { \ + SetPageReferenced(page); \ +} while (0) + /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int try_to_free_pages(struct zone **, unsigned int, unsigned int, int); extern int shrink_all_memory(int); -extern int vm_swappiness; +extern int vm_mapped_page_cost; #ifdef CONFIG_MMU /* linux/mm/shmem.c */ diff -puN include/linux/sysctl.h~rollup include/linux/sysctl.h --- linux-2.6/include/linux/sysctl.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sysctl.h 2004-08-20 18:15:23.000000000 +1000 @@ -134,6 +134,7 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */ }; @@ -167,6 +168,7 @@ enum VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ + VM_FREE_LOCAL_HARDER=28, }; diff -puN include/linux/writeback.h~rollup include/linux/writeback.h --- linux-2.6/include/linux/writeback.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/writeback.h 2004-08-20 18:15:23.000000000 +1000 @@ -8,7 +8,7 @@ struct backing_dev_info; extern spinlock_t inode_lock; extern struct list_head inode_in_use; -extern struct list_head inode_unused; +extern void inode_add_unused(struct inode *inode); /* * Yes, writeback.h requires sched.h diff -puN kernel/sched.c~rollup kernel/sched.c --- linux-2.6/kernel/sched.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/kernel/sched.c 2004-08-20 18:15:23.000000000 +1000 @@ -49,139 +49,74 @@ #include -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. 
- */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. + */ +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) +#define MIN_TIMESLICE 1 -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is + * will moderate dicard freak events (eg. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +/* + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. 
+ */ +#define MIN_HISTORY (MAX_SLEEP/8) -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * The scheduler classifies a process as performing one of the following + * activities */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) +#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) -static unsigned int task_timeslice(task_t *p) -{ - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); -} #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) enum idle_type @@ -203,6 +138,7 @@ struct sched_domain; typedef struct runqueue runqueue_t; struct prio_array { + int min_prio; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -226,16 +162,17 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif + unsigned long array_sequence; + unsigned long nr_uninterruptible; unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + struct sched_domain *sd; /* For active balancing */ @@ -302,7 +239,6 @@ static DEFINE_PER_CPU(struct runqueue, r #define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ #define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ #define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -328,7 +264,6 @@ struct sched_domain { unsigned int imbalance_pct; /* No balance until over watermark */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ int flags; /* See SD_* */ /* Runtime fields. 
*/ @@ -368,12 +303,10 @@ struct sched_domain { .imbalance_pct = 110, \ .cache_hot_time = 0, \ .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ .flags = SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ + | SD_WAKE_IDLE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -389,9 +322,8 @@ struct sched_domain { .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ + .cache_hot_time = (5*1000/2), \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .flags = SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ @@ -411,9 +343,8 @@ struct sched_domain { .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ + .cache_hot_time = (10*1000), \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .flags = SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ @@ -433,14 +364,71 @@ struct sched_domain { #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; #endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -565,20 +553,6 @@ struct file_operations proc_schedstat_op # define schedstat_add(rq, field, amt) do { } while (0); #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. 
- */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -703,8 +677,18 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, prio_array_t *array) { + struct list_head *entry = array->queue + p->prio; sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; @@ -723,44 +707,123 @@ static inline void enqueue_task_head(str p->array = array; } +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * Both properties are important to certain workloads. + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. + */ +static int task_timeslice(task_t *p, runqueue_t *rq) +{ + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). 
The priority is just a simple linear function + * based on sleep_avg and static_prio. */ -static int effective_prio(task_t *p) +static int task_priority(task_t *p) { + unsigned long sleep_avg; int bonus, prio; if (rt_task(p)) return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; + return MAX_RT_PRIO; if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; + return MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) { - enqueue_task(p, rq->active); + enqueue_task(p, array); rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } } /* @@ -772,80 +835,6 @@ static inline void __activate_idle_task( rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. 
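
task_priority() then maps that sleep average linearly onto the timesharing range, with at most (MAX_USER_PRIO + 1)/3, i.e. about 13, levels of bonus. A quick table, assuming the usual MAX_RT_PRIO=100/MAX_PRIO=140 layout and the same illustrative SLEEP_FACTOR as the sketch above; with those numbers a nice-0 CPU hog ends up around prio 130 while a nice-0 heavy sleeper gets about 117.

#include <stdio.h>

#define MAX_RT_PRIO    100
#define MAX_PRIO       140
#define MAX_USER_PRIO  (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p)   ((p) - MAX_RT_PRIO)
#define SLEEP_FACTOR   1024

/* RT tasks keep their priority in the patch; only the timesharing path
 * is reproduced here. */
static int task_priority(int static_prio, unsigned long sleep_avg)
{
    int bonus, prio;

    prio = USER_PRIO(static_prio) + 10;
    bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
            / SLEEP_FACTOR;
    prio = MAX_RT_PRIO + prio - bonus;

    if (prio < MAX_RT_PRIO)
        return MAX_RT_PRIO;
    if (prio > MAX_PRIO - 1)
        return MAX_PRIO - 1;
    return prio;
}

int main(void)
{
    int nice[] = { -10, 0, 10 };
    unsigned long avg[] = { 0, SLEEP_FACTOR / 2, SLEEP_FACTOR };

    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
            printf("nice %3d, sleep_avg %4lu -> prio %d\n",
                   nice[i], avg[j], task_priority(120 + nice[i], avg[j]));
    return 0;
}
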
- */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -854,9 +843,10 @@ static void recalc_task_prio(task_t *p, */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + prio_array_t *array; - now = sched_clock(); + now = clock_us(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -865,44 +855,34 @@ static void activate_task(task_t *p, run + rq->timestamp_last_tick; } #endif - - recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } + + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); - __activate_task(p, rq); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; + + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + p->array_sequence = rq->array_sequence; rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; dequeue_task(p, p->array); p->array = NULL; } @@ -1226,28 +1206,14 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } + success = 1; out_running: @@ -1261,7 +1227,7 @@ out: int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1282,6 +1248,9 @@ static int find_idlest_cpu(struct task_s */ void fastcall sched_fork(task_t *p) { + unsigned long sleep_avg; + runqueue_t *rq; + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. 
This guarantees that @@ -1291,46 +1260,52 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + + p->used_slice = 0; local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); } /* @@ -1344,57 +1319,55 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; + runqueue_t *rq; + prio_array_t *array; + + BUG_ON(p->state != TASK_RUNNING); + + p->prio = task_priority(p); + p->timestamp = clock_us(); rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); this_cpu = smp_processor_id(); - - BUG_ON(p->state != TASK_RUNNING); + cpu = task_cpu(p); schedstat_inc(rq, wunt_cnt); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. 
- */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->interactive_credit = 0; - - p->prio = effective_prio(p); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + p->used_slice = 0; + array = rq->expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->used_slice = total - ts; + } if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { + if (p->prio >= current->prio) { p->prio = current->prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; rq->nr_running++; - } + } else + __activate_task(p, rq, array); + set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP } else { - this_rq = cpu_rq(this_cpu); + runqueue_t *this_rq = this_rq(); /* * Not the local CPU - must adjust timestamp. This should @@ -1402,52 +1375,18 @@ void fastcall wake_up_new_task(task_t * */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; - __activate_task(p, rq); + __activate_task(p, rq, array); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); schedstat_inc(rq, wunt_moved); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); +#endif } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. 
- */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1483,7 +1422,8 @@ static void finish_task_switch(task_t *p * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) { @@ -1500,7 +1440,10 @@ static void finish_task_switch(task_t *p asmlinkage void schedule_tail(task_t *prev) { finish_task_switch(prev); - +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1759,6 +1702,10 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -2062,7 +2009,6 @@ static int load_balance(int this_cpu, ru unsigned long imbalance; int nr_moved; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2097,12 +2043,11 @@ static int load_balance(int this_cpu, ru * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); } - spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -2136,8 +2081,6 @@ static int load_balance(int this_cpu, ru return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2342,42 +2285,11 @@ static inline void idle_balance(int cpu, } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) -{ -#ifdef CONFIG_SCHED_SMT - /* - * If an SMT sibling task has been put to sleep for priority - * reasons reschedule the idle task to see if it can now run. - */ - if (rq->nr_running) { - resched_task(rq->idle); - return 1; - } -#endif - return 0; -} - DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. 
* We call it with interrupts disabled. * @@ -2386,12 +2298,16 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ void scheduler_tick(int user_ticks, int sys_ticks) { + enum idle_type cpu_status; int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; + int ts; - rq->timestamp_last_tick = sched_clock(); +#ifdef CONFIG_SMP + rq->timestamp_last_tick = clock_us(); +#endif if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2410,11 +2326,11 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, IDLE); - return; + cpu_status = IDLE; + goto out; } + cpu_status = NOT_IDLE; + if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; else @@ -2422,168 +2338,24 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + if (unlikely(p->used_slice == -1)) goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. 
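
scheduler_tick() below now only charges ticks against task_timeslice(), and task_timeslice() (earlier in this diff) scales the slice by how far the task's priority sits from the best priority already waiting in the expired array. The sketch tabulates that scaling for a nice-0 task; HZ, MIN_TIMESLICE and the base value are stand-ins (the patch adds a base_timeslice sysctl, whose default is not visible in these hunks), only the formula is taken from the patch. The effect is that a task gets its full slice only while nothing better is queued behind it.

#include <stdio.h>

#define HZ              1000
#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p)    ((p) - MAX_RT_PRIO)
#define BASE_TIMESLICE  400             /* assumed base value               */
#define MIN_TIMESLICE   1               /* assumed, in jiffies              */

static int task_timeslice(int prio, int expired_min_prio)
{
    int idx, delta, base, timeslice;

    idx = prio < expired_min_prio ? prio : expired_min_prio;
    delta = prio - idx;
    base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
    base = base * (MAX_USER_PRIO + 1) / (delta + 2);

    base = base * 40 / (70 - USER_PRIO(idx));
    base = base * 40 / (70 - USER_PRIO(idx));

    timeslice = base >> 10;             /* -> milliseconds                  */
    timeslice = timeslice * HZ / 1000;  /* -> jiffies                       */
    return timeslice < MIN_TIMESLICE ? MIN_TIMESLICE : timeslice;
}

int main(void)
{
    /* A nice-0 task (prio 120) against better and better expired tasks.    */
    for (int min_prio = 120; min_prio >= 110; min_prio -= 2)
        printf("expired min_prio %d -> timeslice %d jiffies\n",
               min_prio, task_timeslice(120, min_prio));
    return 0;
}
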
- */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } - } -out_unlock: - spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); -} - -#ifdef CONFIG_SCHED_SMT -static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) -{ - int i; - struct sched_domain *sd = rq->sd; - cpumask_t sibling_map; - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return; - - cpus_and(sibling_map, sd->span, cpu_online_map); - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq; - - if (i == cpu) - continue; - - smt_rq = cpu_rq(i); + if (unlikely(p->policy == SCHED_FIFO)) + goto out; - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->used_slice >= ts)) { + p->used_slice = -1; + set_tsk_need_resched(p); } -} - -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) -{ - struct sched_domain *sd = rq->sd; - cpumask_t sibling_map; - int ret = 0, i; - - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return 0; - - cpus_and(sibling_map, sd->span, cpu_online_map); - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq; - task_t *smt_curr; - - if (i == cpu) - continue; - smt_rq = cpu_rq(i); - smt_curr = smt_rq->curr; - - /* - * If a user task with lower static priority than the - * running task on the SMT sibling is trying to schedule, - * delay it till there is proportionately less timeslice - * left of the sibling task to prevent a lower priority - * task from using an unfair proportion of the - * physical cpu's resources. -ck - */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; - - /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. - */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); - } - return ret; -} -#else -static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) -{ +out: + rebalance_tick(cpu, rq, NOT_IDLE); } -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) -{ - return 0; -} -#endif - /* * schedule() is the main scheduler function. */ @@ -2603,11 +2375,10 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. 
*/ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); - dump_stack(); - } + if (unlikely(in_atomic()) && + likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + printk(KERN_ERR "bad: scheduling while atomic!\n"); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -2627,19 +2398,10 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -2653,17 +2415,39 @@ need_resched: if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; - else + else { deactivate_task(prev, rq); + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + goto no_check_expired; + } } + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + if (prev->policy == SCHED_RR) { + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); goto switch_tasks; } } @@ -2674,11 +2458,11 @@ need_resched: * Switch the active and expired arrays. 
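
The array_sequence counter bumped in this path is what lets activate_task() decide whether a waking task may keep the unused part of its slice: deactivate_task() stamps the sleeper, and any active/expired switch (or an emptied runqueue) advances the runqueue's counter in between. Reduced to plain integers, purely as an illustration of that handshake:

#include <stdio.h>

struct rq   { unsigned long array_sequence; };
struct task { unsigned long array_sequence; int used_slice; };

static void sleep_task(struct task *p, struct rq *rq)
{
    p->array_sequence = rq->array_sequence;     /* deactivate_task()        */
}

static void wake_task(struct task *p, struct rq *rq)
{
    if (rq->array_sequence != p->array_sequence)
        p->used_slice = 0;                      /* missed an array switch   */
}

int main(void)
{
    struct rq rq = { 0 };
    struct task p = { 0, 37 };                  /* 37 ticks already used    */

    sleep_task(&p, &rq);
    wake_task(&p, &rq);
    printf("no switch while asleep: used_slice = %d\n", p.used_slice);

    sleep_task(&p, &rq);
    rq.array_sequence++;                        /* schedule() swapped arrays */
    wake_task(&p, &rq);
    printf("switch happened:        used_slice = %d\n", p.used_slice);
    return 0;
}
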
*/ schedstat_inc(rq, sched_switch); + rq->array_sequence++; rq->active = rq->expired; rq->expired = array; + rq->expired->min_prio = MAX_PRIO; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; } else schedstat_inc(rq, sched_noswitch); @@ -2686,37 +2470,11 @@ need_resched: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (dependent_sleeper(cpu, rq, next)) { - schedstat_inc(rq, sched_goidle); - next = rq->idle; - goto switch_tasks; - } - - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -2724,10 +2482,10 @@ switch_tasks: rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); prev = context_switch(rq, prev, next); barrier(); - finish_task_switch(prev); } else spin_unlock_irq(&rq->lock); @@ -3204,12 +2962,12 @@ static int setscheduler(pid_t pid, int p array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3432,37 +3190,31 @@ out_unlock: */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - - dequeue_task(current, array); - enqueue_task(current, target); - /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); schedule(); @@ -3579,6 +3331,8 @@ long sys_sched_rr_get_interval(pid_t pid int retval = -EINVAL; struct timespec t; task_t *p; + unsigned long flags; + runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -3593,8 +3347,9 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 
0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3707,15 +3462,17 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -4521,7 +4278,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; @@ -4535,11 +4291,12 @@ void __init sched_init(void) for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->min_prio = MAX_PRIO; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } } diff -puN kernel/sysctl.c~rollup kernel/sysctl.c --- linux-2.6/kernel/sysctl.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/kernel/sysctl.c 2004-08-20 18:15:23.000000000 +1000 @@ -65,6 +65,9 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) int unknown_nmi_panic; @@ -641,12 +644,25 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, + { .ctl_name = 0 } }; /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. 
*/ static int zero; +static int one = 1; static int one_hundred = 100; @@ -723,15 +739,27 @@ static ctl_table vm_table[] = { }, { .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .procname = "mapped_page_cost", + .data = &vm_mapped_page_cost, + .maxlen = sizeof(vm_mapped_page_cost), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &zero, + .extra1 = &one, .extra2 = &one_hundred, }, + { + .ctl_name = VM_FREE_LOCAL_HARDER, + .procname = "free_local_harder", + .data = &vm_free_local_harder, + .maxlen = sizeof(vm_free_local_harder), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, @@ -802,9 +830,9 @@ static ctl_table vm_table[] = { }, { .ctl_name = VM_VFS_CACHE_PRESSURE, - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), + .procname = "vfs_cache_cost", + .data = &sysctl_vfs_cache_cost, + .maxlen = sizeof(sysctl_vfs_cache_cost), .mode = 0644, .proc_handler = &proc_dointvec, .strategy = &sysctl_intvec, @@ -932,6 +960,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { .ctl_name = 0 } }; diff -puN mm/filemap.c~rollup mm/filemap.c --- linux-2.6/mm/filemap.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/mm/filemap.c 2004-08-20 18:15:21.000000000 +1000 @@ -764,11 +764,7 @@ page_ok: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - /* - * Mark the page accessed if we read the beginning. - */ - if (!offset) - mark_page_accessed(page); + mark_page_accessed(page); /* * Ok, we have the page, and it's up-to-date, so diff -puN mm/hugetlb.c~rollup mm/hugetlb.c --- linux-2.6/mm/hugetlb.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/mm/hugetlb.c 2004-08-20 18:15:22.000000000 +1000 @@ -130,9 +130,12 @@ static void update_and_free_page(struct nr_huge_pages--; nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~( + 1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active_mapped | 1 << PG_active_unmapped | + 1 << PG_reserved | 1 << PG_private | + 1 << PG_writeback); set_page_count(&page[i], 0); } set_page_count(page, 1); diff -puN mm/memory.c~rollup mm/memory.c --- linux-2.6/mm/memory.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/memory.c 2004-08-20 18:15:22.000000000 +1000 @@ -1121,7 +1121,8 @@ static int do_wp_page(struct mm_struct * else page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - lru_cache_add_active(new_page); + lru_cache_add(new_page); + mark_page_accessed(new_page); page_add_anon_rmap(new_page, vma, address); /* Free the old page.. 
*/ @@ -1468,7 +1469,7 @@ do_anonymous_page(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); - lru_cache_add_active(page); + lru_cache_add(page); mark_page_accessed(page); page_add_anon_rmap(page, vma, addr); } @@ -1580,7 +1581,7 @@ retry: entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); if (anon) { - lru_cache_add_active(new_page); + lru_cache_add(new_page); page_add_anon_rmap(new_page, vma, address); } else page_add_file_rmap(new_page); diff -puN mm/oom_kill.c~rollup mm/oom_kill.c --- linux-2.6/mm/oom_kill.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/oom_kill.c 2004-08-20 18:15:23.000000000 +1000 @@ -144,11 +144,10 @@ static void __oom_kill_task(task_t *p) printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ diff -puN mm/page-writeback.c~rollup mm/page-writeback.c --- linux-2.6/mm/page-writeback.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/page-writeback.c 2004-08-20 18:15:23.000000000 +1000 @@ -377,8 +377,7 @@ static void wb_kupdate(unsigned long arg oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; start_jif = jiffies; next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; - nr_to_write = wbs.nr_dirty + wbs.nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + nr_to_write = wbs.nr_dirty + wbs.nr_unstable + inodes_stat.nr_inodes; while (nr_to_write > 0) { wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; diff -puN mm/page_alloc.c~rollup mm/page_alloc.c --- linux-2.6/mm/page_alloc.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/page_alloc.c 2004-08-20 18:15:23.000000000 +1000 @@ -87,7 +87,7 @@ static void bad_page(const char *functio page->flags &= ~(1 << PG_private | 1 << PG_locked | 1 << PG_lru | - 1 << PG_active | + 1 << PG_active_mapped | 1 << PG_dirty | 1 << PG_swapcache | 1 << PG_writeback); @@ -226,7 +226,8 @@ static inline void free_pages_check(cons 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_active | + 1 << PG_active_mapped | + 1 << PG_active_unmapped | 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | @@ -259,8 +260,6 @@ free_pages_bulk(struct zone *zone, int c base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); - zone->all_unreclaimable = 0; - zone->pages_scanned = 0; while (!list_empty(list) && count--) { page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ @@ -347,7 +346,8 @@ static void prep_new_page(struct page *p 1 << PG_private | 1 << PG_locked | 1 << PG_lru | - 1 << PG_active | + 1 << PG_active_mapped | + 1 << PG_active_unmapped | 1 << PG_dirty | 1 << PG_reclaim | 1 << PG_swapcache | @@ -664,6 +664,8 @@ buffered_rmqueue(struct zone *zone, int if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); + if (numa_node_id() != zone->zone_pgdat->node_id) + mod_page_state(pgalloc_remote, 1 << order); prep_new_page(page, order); if (order && (gfp_flags & 
__GFP_COMP)) prep_compound_page(page, order); @@ -671,6 +673,8 @@ buffered_rmqueue(struct zone *zone, int return page; } +int vm_free_local_harder = 1; + /* * This is the 'heart' of the zoned buddy allocator. * @@ -699,7 +703,6 @@ __alloc_pages(unsigned int gfp_mask, uns struct task_struct *p = current; int i; int alloc_type; - int do_retry; int can_try_harder; might_sleep_if(wait); @@ -726,6 +729,38 @@ __alloc_pages(unsigned int gfp_mask, uns alloc_type = zone_idx(zones[0]); + if (!vm_free_local_harder || + (p->flags & (PF_MEMALLOC | PF_MEMDIE)) || !wait) + goto no_local_harder; + + /* Go through the zonelist, looking for a local zone with enough free */ + if (zones[0]->zone_pgdat->node_id == numa_node_id()) { + for (i = 0; (z = zones[i]) != NULL; i++) { + if (z->zone_pgdat->node_id != numa_node_id()) + break; + + min = z->pages_high + (1<protection[alloc_type]; + + if (z->free_pages < min) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + try_to_free_pages(zones, gfp_mask, order, 1); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + } + +no_local_harder: /* Go through the zonelist once, looking for a zone with enough free */ for (i = 0; (z = zones[i]) != NULL; i++) { min = z->pages_low + (1<protection[alloc_type]; @@ -782,7 +817,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - try_to_free_pages(zones, gfp_mask, order); + try_to_free_pages(zones, gfp_mask, order, 0); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -811,16 +846,11 @@ rebalance: * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order * <= 3, but that may not be true in other implementations. */ - do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) - do_retry = 1; + goto rebalance; if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); - goto rebalance; + goto rebalance; } nopage: @@ -1070,7 +1100,7 @@ void get_zone_counts(unsigned long *acti *inactive = 0; *free = 0; for_each_zone(zone) { - *active += zone->nr_active; + *active += zone->nr_active_mapped + zone->nr_active_unmapped; *inactive += zone->nr_inactive; *free += zone->free_pages; } @@ -1188,7 +1218,7 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone->nr_active), + K(zone->nr_active_mapped + zone->nr_active_unmapped), K(zone->nr_inactive), K(zone->present_pages) ); @@ -1586,8 +1616,6 @@ static void __init free_area_init_core(s zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* * The per-cpu-pages pools are set to around 1000th of the * size of the zone. 
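
With free_local_harder set, the __alloc_pages() change above adds one extra pass: zones on the allocating node are tried against the higher pages_high watermark and, failing that, a single round of node-local direct reclaim runs before remote zones are considered at the usual watermarks. A toy model of just that decision follows; the zone sizes, watermarks and reclaim amount are invented, and the real code also folds in the (1 << order) and per-zone protection[] terms. The knob itself is the free_local_harder entry added to vm_table earlier in the diff.

#include <stdio.h>

struct zone { const char *name; int node; long free, pages_low, pages_high; };

static struct zone zonelist[] = {       /* local zones first, as in a zonelist */
    { "node0/Normal", 0,  80, 100, 300 },
    { "node1/Normal", 1, 900, 100, 300 },
};
#define NZONES (sizeof(zonelist) / sizeof(zonelist[0]))

/* Stands in for the try_to_free_pages(zones, ..., 1) call restricted to
 * the local node; the amount reclaimed is obviously made up.             */
static void reclaim_local(int node)
{
    for (unsigned int i = 0; i < NZONES; i++)
        if (zonelist[i].node == node)
            zonelist[i].free += 250;
}

static struct zone *alloc_page_on(int node, int free_local_harder)
{
    if (free_local_harder) {
        for (int pass = 0; pass < 2; pass++) {
            for (unsigned int i = 0; i < NZONES; i++) {
                struct zone *z = &zonelist[i];
                if (z->node != node)
                    break;              /* stop at the first remote zone    */
                if (z->free >= z->pages_high)
                    return z;           /* local zone is comfortably free   */
            }
            if (pass == 0)
                reclaim_local(node);    /* one local direct-reclaim pass    */
        }
    }
    for (unsigned int i = 0; i < NZONES; i++)   /* normal fallback path     */
        if (zonelist[i].free >= zonelist[i].pages_low)
            return &zonelist[i];
    return NULL;
}

int main(void)
{
    printf("free_local_harder=0: from %s\n", alloc_page_on(0, 0)->name);
    printf("free_local_harder=1: from %s\n", alloc_page_on(0, 1)->name);
    return 0;
}
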
But no more than 1/4 of a meg - there's @@ -1621,12 +1649,17 @@ static void __init free_area_init_core(s } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); - INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->active_mapped_list); + INIT_LIST_HEAD(&zone->active_unmapped_list); INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; + zone->nr_scan_active_mapped = 0; + zone->nr_scan_active_unmapped = 0; zone->nr_scan_inactive = 0; - zone->nr_active = 0; + zone->nr_dirty_inactive = 0; + zone->nr_active_mapped = 0; + zone->nr_active_unmapped = 0; zone->nr_inactive = 0; + INIT_LIST_HEAD(&zone->zone_shrinker_list); if (!size) continue; @@ -1776,10 +1809,11 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", "pgalloc_dma", + "pgalloc_remote", "pgfree", "pgactivate", "pgdeactivate", @@ -1990,13 +2024,18 @@ static void setup_per_zone_pages_min(voi } for_each_zone(zone) { + unsigned long tmp; spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; if (is_highmem(zone)) { /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. */ int min_pages; @@ -2007,15 +2046,15 @@ static void setup_per_zone_pages_min(voi min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* + * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; + zone->pages_min = tmp; } - zone->pages_low = zone->pages_min * 2; - zone->pages_high = zone->pages_min * 3; + zone->pages_low = zone->pages_min + tmp; + zone->pages_high = zone->pages_low + tmp; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff -puN mm/rmap.c~rollup mm/rmap.c --- linux-2.6/mm/rmap.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/rmap.c 2004-08-20 18:15:21.000000000 +1000 @@ -252,15 +252,15 @@ unsigned long page_address_in_vma(struct * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. 
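
The setup_per_zone_pages_min() change above spaces the watermarks by the zone's proportional share (tmp) instead of by multiples of pages_min, so capping a highmem zone's pages_min no longer squeezes the low/high band that drives asynchronous reclaim. Worked through with invented zone sizes; the highmem cap below assumes the usual present/1024 clamped to 32..128, which is only partly visible in this hunk.

#include <stdio.h>

int main(void)
{
    long pages_min = 1024;              /* global target, e.g. min_free_kbytes */
    struct { const char *name; long present; int highmem; } zones[] = {
        { "DMA",      4096,  0 },
        { "Normal", 225280,  0 },
        { "HighMem",262144,  1 },
    };
    long lowmem_pages = 4096 + 225280;  /* highmem excluded from the split  */

    for (int i = 0; i < 3; i++) {
        long tmp = pages_min * zones[i].present / lowmem_pages;
        long min = tmp;

        if (zones[i].highmem) {         /* cap pages_min, as in the patch   */
            min = zones[i].present / 1024;
            if (min < 32)
                min = 32;
            if (min > 128)
                min = 128;
        }
        printf("%-8s min=%5ld low=%5ld high=%5ld\n",
               zones[i].name, min, min + tmp, min + 2 * tmp);
    }
    return 0;
}
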
*/ -static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) +static void page_gather_one(struct page *page, + struct vm_area_struct *vma, unsigned int *mapcount, + int *referenced, int *dirty) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int referenced = 0; if (!mm->rss) goto out; @@ -286,7 +286,10 @@ static int page_referenced_one(struct pa goto out_unmap; if (ptep_clear_flush_young(vma, address, pte)) - referenced++; + (*referenced)++; + + if (pte_dirty(*pte)) + (*dirty)++; if (mm != current->mm && has_swap_token(mm)) referenced++; @@ -301,28 +304,27 @@ out_unmap: out_unlock: spin_unlock(&mm->page_table_lock); out: - return referenced; + ; } -static int page_referenced_anon(struct page *page) +static inline void +page_gather_anon(struct page *page, int *referenced, int *dirty) { unsigned int mapcount; struct anon_vma *anon_vma; struct vm_area_struct *vma; - int referenced = 0; anon_vma = page_lock_anon_vma(page); if (!anon_vma) - return referenced; + return; mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - referenced += page_referenced_one(page, vma, &mapcount); + page_gather_one(page, vma, &mapcount, referenced, dirty); if (!mapcount) break; } spin_unlock(&anon_vma->lock); - return referenced; } /** @@ -336,14 +338,14 @@ static int page_referenced_anon(struct p * * This function is only called from page_referenced for object-based pages. */ -static int page_referenced_file(struct page *page) +static inline void +page_gather_file(struct page *page, int *referenced, int *dirty) { unsigned int mapcount; struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; struct prio_tree_iter iter; - int referenced = 0; /* * The caller's checks on page->mapping and !PageAnon have made @@ -371,16 +373,15 @@ static int page_referenced_file(struct p vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; + (*referenced)++; break; } - referenced += page_referenced_one(page, vma, &mapcount); + page_gather_one(page, vma, &mapcount, referenced, dirty); if (!mapcount) break; } spin_unlock(&mapping->i_mmap_lock); - return referenced; } /** @@ -391,29 +392,29 @@ static int page_referenced_file(struct p * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked) +void page_gather(struct page *page, int is_locked, int *referenced, int *dirty) { - int referenced = 0; + *referenced = 0; + *dirty = 0; if (page_test_and_clear_young(page)) - referenced++; + (*referenced)++; if (TestClearPageReferenced(page)) - referenced++; + (*referenced)++; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page); + page_gather_anon(page, referenced, dirty); else if (is_locked) - referenced += page_referenced_file(page); + page_gather_file(page, referenced, dirty); else if (TestSetPageLocked(page)) - referenced++; + (*referenced)++; else if (page->mapping) { - referenced += page_referenced_file(page); + page_gather_file(page, referenced, dirty); unlock_page(page); } } - return referenced; } /** diff -puN mm/shmem.c~rollup mm/shmem.c --- linux-2.6/mm/shmem.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/shmem.c 2004-08-20 18:15:21.000000000 +1000 @@ -1431,11 +1431,7 @@ static void do_shmem_file_read(struct fi */ if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - /* - * Mark the page accessed if we read the beginning. - */ - if (!offset) - mark_page_accessed(page); + mark_page_accessed(page); } /* diff -puN mm/swap.c~rollup mm/swap.c --- linux-2.6/mm/swap.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swap.c 2004-08-20 18:15:22.000000000 +1000 @@ -78,14 +78,18 @@ int rotate_reclaimable_page(struct page return 1; if (PageDirty(page)) return 1; - if (PageActive(page)) + if (PageActiveMapped(page)) + return 1; + if (PageActiveUnmapped(page)) return 1; if (!PageLRU(page)) return 1; zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(page) && !PageActive(page)) { + if (PageLRU(page) + && !PageActiveMapped(page) && !PageActiveUnmapped(page)) { + list_del(&page->lru); list_add_tail(&page->lru, &zone->inactive_list); inc_page_state(pgrotated); @@ -96,48 +100,11 @@ int rotate_reclaimable_page(struct page return 0; } -/* - * FIXME: speed this up? - */ -void fastcall activate_page(struct page *page) -{ - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); - SetPageActive(page); - add_page_to_active_list(zone, page); - inc_page_state(pgactivate); - } - spin_unlock_irq(&zone->lru_lock); -} - -/* - * Mark a page as having seen activity. 
- * - * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced - */ -void fastcall mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { - activate_page(page); - ClearPageReferenced(page); - } else if (!PageReferenced(page)) { - SetPageReferenced(page); - } -} - -EXPORT_SYMBOL(mark_page_accessed); - /** * lru_cache_add: add a page to the page lists * @page: the page to add */ static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { @@ -149,25 +116,12 @@ void fastcall lru_cache_add(struct page put_cpu_var(lru_add_pvecs); } -void fastcall lru_cache_add_active(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); - - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); -} - void lru_add_drain(void) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); put_cpu_var(lru_add_pvecs); } @@ -304,6 +258,7 @@ void __pagevec_lru_add(struct pagevec *p } if (TestSetPageLRU(page)) BUG(); + ClearPageUsedOnce(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -314,33 +269,6 @@ void __pagevec_lru_add(struct pagevec *p EXPORT_SYMBOL(__pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) -{ - int i; - struct zone *zone = NULL; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if (TestSetPageLRU(page)) - BUG(); - if (TestSetPageActive(page)) - BUG(); - add_page_to_active_list(zone, page); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); -} - /* * Try to drop buffers from the pages in a pagevec */ @@ -422,9 +350,6 @@ static void lru_drain_cache(unsigned int /* CPU is dead, so no locking needed. */ if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); } /* Drop the CPU's cached committed space back into the central pool. */ diff -puN mm/swap_state.c~rollup mm/swap_state.c --- linux-2.6/mm/swap_state.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swap_state.c 2004-08-20 18:15:22.000000000 +1000 @@ -375,7 +375,8 @@ struct page *read_swap_cache_async(swp_e /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add(new_page); + mark_page_accessed(new_page); swap_readpage(NULL, new_page); return new_page; } diff -puN mm/swapfile.c~rollup mm/swapfile.c --- linux-2.6/mm/swapfile.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swapfile.c 2004-08-20 18:15:21.000000000 +1000 @@ -469,10 +469,10 @@ static unsigned long unuse_pmd(struct vm pte_unmap(pte); /* - * Move the page to the active list so it is not - * immediately swapped out again after swapon. + * Touch the page so it is not immediately swapped + * out again after swapon. 
*/ - activate_page(page); + mark_page_accessed(page); /* add 1 since address may be 0 */ return 1 + offset + address; diff -puN mm/vmscan.c~rollup mm/vmscan.c --- linux-2.6/mm/vmscan.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/vmscan.c 2004-08-20 18:15:23.000000000 +1000 @@ -58,6 +58,12 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; + /* Incremented by the number of congested pages that we encountered */ + unsigned long nr_congested; + + /* Number of dirty pages we're putting on the inactive list */ + unsigned long nr_dirty_inactive; + /* Incremented by the number of pages reclaimed */ unsigned long nr_reclaimed; @@ -66,12 +72,13 @@ struct scan_control { /* How many pages shrink_cache() should reclaim */ int nr_to_reclaim; - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; + /* Are all zones in the current scan unreclaimable? */ + int all_unreclaimable; /* This context's GFP mask */ unsigned int gfp_mask; + int preserve_active; /* Don't eat into the active list */ int may_writepage; }; @@ -117,10 +124,9 @@ struct shrinker { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 1 .. 100. Higher means less swappy. */ -int vm_swappiness = 60; -static long total_memory; +int vm_mapped_page_cost = 32; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -130,16 +136,16 @@ static DECLARE_RWSEM(shrinker_rwsem); */ struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) { - struct shrinker *shrinker; + struct shrinker *shrinker; - shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); - if (shrinker) { - shrinker->shrinker = theshrinker; - shrinker->seeks = seeks; - shrinker->nr = 0; - down_write(&shrinker_rwsem); - list_add(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); + shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); + if (shrinker) { + shrinker->shrinker = theshrinker; + shrinker->seeks = seeks; + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); } return shrinker; } @@ -157,6 +163,81 @@ void remove_shrinker(struct shrinker *sh } EXPORT_SYMBOL(remove_shrinker); +static unsigned int zone_shrinker_idx; + +/* + * Add a shrinker callback to be called from the vm + */ +int set_zone_shrinker(zone_shrinker_fn fn, int seeks) +{ + int idx; + struct zone_shrinker *zs; + struct zone *zone; + + down_write(&shrinker_rwsem); + idx = zone_shrinker_idx++; + + for_each_zone(zone) { + zs = kmalloc(sizeof(*zs), GFP_KERNEL); + if (!zs) { + up_write(&shrinker_rwsem); + remove_zone_shrinker(idx); + return -ENOMEM; + } + INIT_LIST_HEAD(&zs->lru); + zs->shrinker = fn; + zs->seeks = seeks; + zs->nr = 0; + zs->idx = idx; + spin_lock_irq(&zone->lru_lock); + list_add(&zs->list, &zone->zone_shrinker_list); + spin_unlock_irq(&zone->lru_lock); + } + up_write(&shrinker_rwsem); + return idx; +} +EXPORT_SYMBOL(set_zone_shrinker); + +struct zone_shrinker *get_zone_shrinker(struct zone *zone, int idx) +{ + struct zone_shrinker *zs; + struct zone_shrinker *ret = NULL; + + spin_lock_irq(&zone->lru_lock); + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { + if (zs->idx == idx) { + ret = zs; + break; + } + } + spin_unlock_irq(&zone->lru_lock); + return ret; +} +EXPORT_SYMBOL(get_zone_shrinker); + +/* + * Remove one + */ +void remove_zone_shrinker(int idx) +{ + struct zone *zone; + + down_write(&shrinker_rwsem); + for_each_zone(zone) { + struct zone_shrinker 
*zs; + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { + if (zs->idx == idx) { + spin_lock_irq(&zone->lru_lock); + list_del(&zs->list); + spin_unlock_irq(&zone->lru_lock); + kfree(zs); + } + } + } + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(remove_zone_shrinker); + #define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches @@ -175,36 +256,36 @@ EXPORT_SYMBOL(remove_shrinker); * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. */ -static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, - unsigned long lru_pages) +static int shrink_slab(struct zone *zone, unsigned long scanned, unsigned long lru_pages, unsigned int gfp_mask) { + struct zone_shrinker *zs; struct shrinker *shrinker; if (scanned == 0) - return 0; + scanned = 1; if (!down_read_trylock(&shrinker_rwsem)) return 0; - list_for_each_entry(shrinker, &shrinker_list, list) { + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - delta = (4 * scanned) / shrinker->seeks; - delta *= (*shrinker->shrinker)(0, gfp_mask); - do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) - shrinker->nr = LONG_MAX; /* It wrapped! */ + delta = (4 * scanned) / zs->seeks; + delta *= (*zs->shrinker)(zs, 0, gfp_mask); + do_div(delta, zone->nr_inactive + zone->nr_active_mapped + zone->nr_active_unmapped + 1); + zs->nr_scan += delta; + if (zs->nr_scan < 0) + zs->nr_scan = LONG_MAX; /* It wrapped! */ - total_scan = shrinker->nr; - shrinker->nr = 0; + total_scan = zs->nr_scan; + zs->nr_scan = 0; while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; int shrink_ret; - shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + shrink_ret = (*zs->shrinker)(zs, this_scan, gfp_mask); if (shrink_ret == -1) break; mod_page_state(slabs_scanned, this_scan); @@ -213,31 +294,43 @@ static int shrink_slab(unsigned long sca cond_resched(); } - shrinker->nr += total_scan; + zs->nr_scan += total_scan; } - up_read(&shrinker_rwsem); - return 0; -} -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta = 0; + unsigned long nr_slab; + unsigned long total_scan; - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; + nr_slab = (*shrinker->shrinker)(0, gfp_mask); + if (nr_slab > shrinker->nr) { + delta = (scanned / shrinker->seeks) + 1; + delta *= (nr_slab - shrinker->nr); + do_div(delta, lru_pages + 1); + } + shrinker->nr += delta + 1; - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; + total_scan = shrinker->nr; + shrinker->nr = 0; - mapping = page_mapping(page); - if (!mapping) - return 0; + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; + int shrink_ret; + + shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + if (shrink_ret == -1) + break; + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; - /* File is mmap'd by somebody? 
*/ - return mapping_mapped(mapping); + cond_resched(); + } + + shrinker->nr += total_scan; + } + + up_read(&shrinker_rwsem); + return 0; } static inline int is_page_cache_freeable(struct page *page) @@ -245,13 +338,17 @@ static inline int is_page_cache_freeable return page_count(page) - !!PagePrivate(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, struct scan_control *sc) { + int congested = bdi_write_congested(bdi); + if (congested) + sc->nr_congested++; + if (current_is_kswapd()) return 1; if (current_is_pdflush()) /* This is unlikely, but why not... */ return 1; - if (!bdi_write_congested(bdi)) + if (!congested) return 1; if (bdi == current->backing_dev_info) return 1; @@ -284,9 +381,10 @@ static void handle_write_error(struct ad } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping, + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -311,7 +409,7 @@ static pageout_t pageout(struct page *pa return PAGE_KEEP; if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -359,7 +457,7 @@ static int shrink_list(struct list_head struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; + int referenced, dirty, mapped; page = lru_to_page(page_list); list_del(&page->lru); @@ -367,20 +465,34 @@ static int shrink_list(struct list_head if (TestSetPageLocked(page)) goto keep; - BUG_ON(PageActive(page)); + BUG_ON(PageActiveMapped(page) || PageActiveUnmapped(page)); if (PageWriteback(page)) goto keep_locked; + mapped = page_mapped(page); sc->nr_scanned++; - /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) - sc->nr_scanned++; - - referenced = page_referenced(page, 1); - /* In active use or really unfreeable? Activate it. */ - if (referenced && page_mapping_inuse(page)) - goto activate_locked; + /* Increase the slab pressure for mapped pages */ + if (mapped) + sc->nr_scanned += vm_mapped_page_cost; + + page_gather(page, 1, &referenced, &dirty); + /* Has been referenced. Activate it. */ + if (referenced) { + /* + * Has been referenced.
Activate used twice or + * mapped pages, otherwise give it another chance + * on the inactive list + */ + if (TestSetPageUsedOnce(page) || mapped) + goto activate_locked; + if (dirty) { + set_page_dirty(page); + sc->nr_dirty_inactive++; + } + sc->nr_scanned--; /* Don't count pages' first round */ + goto keep_locked; + } #ifdef CONFIG_SWAP /* @@ -413,16 +525,16 @@ static int shrink_list(struct list_head } if (PageDirty(page)) { - if (referenced) - goto keep_locked; - if (!may_enter_fs) - goto keep_locked; - if (laptop_mode && !sc->may_writepage) + if (!may_enter_fs || + (laptop_mode && !sc->may_writepage)) { + sc->nr_dirty_inactive++; goto keep_locked; + } /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { + switch(pageout(page, mapping, sc)) { case PAGE_KEEP: + sc->nr_dirty_inactive++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -484,7 +596,7 @@ static int shrink_list(struct list_head /* * The non-racy check for busy page. It is critical to check * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. (pagecache + us == 2) + * not in use by anybody. (pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { write_unlock_irq(&mapping->tree_lock); @@ -514,7 +626,10 @@ free_it: continue; activate_locked: - SetPageActive(page); + if (page_mapped(page)) + SetPageActiveMapped(page); + else + SetPageActiveUnmapped(page); pgactivate++; keep_locked: unlock_page(page); @@ -579,13 +694,14 @@ static void shrink_cache(struct zone *zo nr_taken++; } zone->nr_inactive -= nr_taken; - zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - max_scan -= nr_scan; + if (nr_taken == 0) { + spin_lock_irq(&zone->lru_lock); + continue; + } + if (current_is_kswapd()) mod_page_state_zone(zone, pgscan_kswapd, nr_scan); else @@ -605,9 +721,13 @@ static void shrink_cache(struct zone *zo if (TestSetPageLRU(page)) BUG(); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else + if (PageActiveMapped(page)) { + ClearPageUsedOnce(page); + add_page_to_active_mapped_list(zone, page); + } else if (PageActiveUnmapped(page)) { + ClearPageUsedOnce(page); + add_page_to_active_unmapped_list(zone, page); + } else add_page_to_inactive_list(zone, page); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -615,9 +735,8 @@ static void shrink_cache(struct zone *zo spin_lock_irq(&zone->lru_lock); } } - } + } spin_unlock_irq(&zone->lru_lock); -done: pagevec_release(&pvec); } @@ -639,9 +758,9 @@ done: * But we had to alter page->flags anyway. 
*/ static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +shrink_active_list(struct zone *zone, struct list_head *list, unsigned long *nr_list_pages, struct scan_control *sc) { - int pgmoved; + int pgmoved, pgmoved_unmapped; int pgdeactivate = 0; int pgscanned = 0; int nr_pages = sc->nr_to_scan; @@ -650,17 +769,14 @@ refill_inactive_zone(struct zone *zone, LIST_HEAD(l_active); /* Pages to go onto the active_list */ struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - long mapped_ratio; - long distress; - long swap_tendency; lru_add_drain(); pgmoved = 0; + spin_lock_irq(&zone->lru_lock); - while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { - page = lru_to_page(&zone->active_list); - prefetchw_prev_lru_page(page, &zone->active_list, flags); + while (pgscanned < nr_pages && !list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); if (!TestClearPageLRU(page)) BUG(); list_del(&page->lru); @@ -673,58 +789,37 @@ refill_inactive_zone(struct zone *zone, */ __put_page(page); SetPageLRU(page); - list_add(&page->lru, &zone->active_list); + list_add(&page->lru, list); } else { list_add(&page->lru, &l_hold); pgmoved++; } pgscanned++; } - zone->nr_active -= pgmoved; + *nr_list_pages -= pgmoved; + zone->pages_scanned += pgmoved; spin_unlock_irq(&zone->lru_lock); - /* - * `distress' is a measure of how much trouble we're having reclaiming - * pages. 0 -> no problems. 100 -> great trouble. - */ - distress = 100 >> zone->prev_priority; - - /* - * The point of this algorithm is to decide when to start reclaiming - * mapped memory instead of just pagecache. Work out how much memory - * is mapped. - */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; - - /* - * Now decide how much we really want to unmap some pages. The mapped - * ratio is downgraded - just because there's a lot of mapped memory - * doesn't necessarily mean that page reclaim isn't succeeding. - * - * The distress ratio is important - we don't want to start going oom. - * - * A 100% value of vm_swappiness overrides this algorithm altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; - - /* - * Now use this metric to decide whether to start moving mapped memory - * onto the inactive list. 
- */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - while (!list_empty(&l_hold)) { + int referenced, dirty; + page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0)) { - list_add(&page->lru, &l_active); - continue; - } + + if ((total_swap_pages == 0 && PageAnon(page))) { + list_add(&page->lru, &l_active); + continue; + } + page_gather(page, 0, &referenced, &dirty); + if (referenced) { + list_add(&page->lru, &l_active); + continue; + } + if (dirty) { + set_page_dirty(page); + sc->nr_dirty_inactive++; } + list_add(&page->lru, &l_inactive); } @@ -736,7 +831,8 @@ refill_inactive_zone(struct zone *zone, prefetchw_prev_lru_page(page, &l_inactive, flags); if (TestSetPageLRU(page)) BUG(); - if (!TestClearPageActive(page)) + if (!TestClearPageActiveMapped(page) + && !TestClearPageActiveUnmapped(page)) BUG(); list_move(&page->lru, &zone->inactive_list); pgmoved++; @@ -760,23 +856,37 @@ refill_inactive_zone(struct zone *zone, } pgmoved = 0; + pgmoved_unmapped = 0; while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); if (TestSetPageLRU(page)) BUG(); - BUG_ON(!PageActive(page)); - list_move(&page->lru, &zone->active_list); - pgmoved++; + if(!TestClearPageActiveMapped(page) + && !TestClearPageActiveUnmapped(page)) + BUG(); + if (page_mapped(page)) { + SetPageActiveMapped(page); + list_move(&page->lru, &zone->active_mapped_list); + pgmoved++; + } else { + SetPageActiveUnmapped(page); + list_move(&page->lru, &zone->active_unmapped_list); + pgmoved_unmapped++; + } + if (!pagevec_add(&pvec, page)) { - zone->nr_active += pgmoved; + zone->nr_active_mapped += pgmoved; pgmoved = 0; + zone->nr_active_unmapped += pgmoved_unmapped; + pgmoved_unmapped = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } - zone->nr_active += pgmoved; + zone->nr_active_mapped += pgmoved; + zone->nr_active_unmapped += pgmoved_unmapped; spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -784,52 +894,121 @@ refill_inactive_zone(struct zone *zone, mod_page_state(pgdeactivate, pgdeactivate); } +#define SCAN_MASK 0x00000fff +#define SCAN_SHIFT 7 + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(struct zone *zone, struct scan_control *sc) { + unsigned long long tmp; + unsigned long scan_active, scan_active_mapped, scan_active_unmapped; + unsigned long scan_inactive; unsigned long nr_active; - unsigned long nr_inactive; + int count; + + if (sc->preserve_active) { + if (zone->nr_inactive * 8 >= + zone->nr_active_mapped + zone->nr_active_unmapped) + sc->all_unreclaimable = 0; + } else if (!zone->all_unreclaimable) + sc->all_unreclaimable = 0; + if (zone->all_unreclaimable) { + scan_inactive = zone->nr_inactive; + scan_active_mapped = 1; + scan_active_unmapped = vm_mapped_page_cost; + goto scan; + } + + nr_active = zone->nr_active_mapped + zone->nr_active_unmapped; + scan_inactive = (nr_active + zone->nr_inactive); + + if (nr_active >= (zone->nr_inactive + 1) && !sc->preserve_active) { + /* + * Add one to `nr_to_scan' just to make sure that the kernel + * will slowly sift through the active list. 
+ */ + if (nr_active >= 4*(zone->nr_inactive*2 + 1)) { + /* Don't scan more than 4 times inactive list scan */ + scan_active = 4*scan_inactive; + } else { + /* Cast to long long so the multiply doesn't overflow */ + tmp = (unsigned long long)scan_inactive * nr_active; + do_div(tmp, zone->nr_inactive*2 + 1); + scan_active = (unsigned long)tmp; + } + scan_active *= 2; + + tmp = scan_active * zone->nr_active_mapped; + do_div(tmp, nr_active + 1); + scan_active_mapped = ((unsigned long)tmp + 1) + / vm_mapped_page_cost; + scan_active_unmapped = scan_active - tmp + 1; + } else { + /* Don't scan the active list if the inactive list is large */ + scan_active_mapped = zone->nr_active_mapped / 32; + scan_active_unmapped = zone->nr_active_unmapped * vm_mapped_page_cost / 32; + } + +scan: + /* zero this before scanning */ + sc->nr_dirty_inactive = 0; + sc->nr_to_scan = SWAP_CLUSTER_MAX; + + count = (zone->nr_scan_active_mapped + scan_active_mapped); + zone->nr_scan_active_mapped = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + count -= SWAP_CLUSTER_MAX; + shrink_active_list(zone, &zone->active_mapped_list, + &zone->nr_active_mapped, sc); + } + + count = (zone->nr_scan_active_unmapped + scan_active_unmapped); + zone->nr_scan_active_unmapped = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + count -= SWAP_CLUSTER_MAX; + shrink_active_list(zone, &zone->active_unmapped_list, + &zone->nr_active_unmapped, sc); + } + + count = (zone->nr_scan_inactive + scan_inactive); + zone->nr_scan_inactive = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + if (sc->nr_to_reclaim <= 0) + break; + count -= SWAP_CLUSTER_MAX; + shrink_cache(zone, sc); + } /* - * Add one to `nr_to_scan' just to make sure that the kernel will - * slowly sift through the active list. + * Try to write back as many pages as the number of dirty ones + * we're adding to the inactive list. This tends to cause slow + * streaming writers to write data to the disk smoothly, at the + * dirtying rate, which is nice. But that's undesirable in + * laptop mode, where we *want* lumpy writeout. So in laptop + * mode, write out the whole world. */ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; - nr_active = zone->nr_scan_active; - if (nr_active >= SWAP_CLUSTER_MAX) - zone->nr_scan_active = 0; - else - nr_active = 0; - - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= SWAP_CLUSTER_MAX) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - sc->nr_to_reclaim = SWAP_CLUSTER_MAX; - - while (nr_active || nr_inactive) { - if (nr_active) { - sc->nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); - } - - if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; - } + zone->nr_dirty_inactive += sc->nr_dirty_inactive; + count = zone->nr_dirty_inactive; + if (count > zone->nr_inactive / 2 + || (!(laptop_mode && !sc->may_writepage) + && count > SWAP_CLUSTER_MAX)) { + zone->nr_dirty_inactive = 0; + wakeup_bdflush(laptop_mode ? 
0 : count*2); + sc->may_writepage = 1; } + + if (sc->nr_reclaimed) { + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + } + if (zone->pages_scanned > zone->present_pages) + zone->all_unreclaimable = 1; } /* @@ -849,24 +1028,25 @@ shrink_zone(struct zone *zone, struct sc * scan then give up on it. */ static void -shrink_caches(struct zone **zones, struct scan_control *sc) +shrink_caches(struct zone **zones, struct scan_control *sc, unsigned long lru_pages) { + struct reclaim_state *reclaim_state = current->reclaim_state; int i; + sc->all_unreclaimable = 1; for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; - - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) - continue; /* Let kswapd poll it */ - + if (sc->preserve_active && zone->zone_pgdat->node_id != numa_node_id()) + break; shrink_zone(zone, sc); + shrink_slab(zone, sc->nr_scanned, lru_pages, sc->gfp_mask); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } } } - + /* * This is the main entry point to direct page reclaim. * @@ -881,67 +1061,54 @@ shrink_caches(struct zone **zones, struc * allocation attempt will fail. */ int try_to_free_pages(struct zone **zones, - unsigned int gfp_mask, unsigned int order) + unsigned int gfp_mask, unsigned int order, int local) { - int priority; int ret = 0; int total_scanned = 0, total_reclaimed = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; unsigned long lru_pages = 0; int i; + sc.nr_to_reclaim = SWAP_CLUSTER_MAX; sc.gfp_mask = gfp_mask; sc.may_writepage = 0; + sc.preserve_active = local; inc_page_state(allocstall); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - - zone->temp_priority = DEF_PRIORITY; - lru_pages += zone->nr_active + zone->nr_inactive; + if (local && zone->zone_pgdat->node_id != numa_node_id()) + break; + lru_pages += zone->nr_active_mapped + + zone->nr_active_unmapped + zone->nr_inactive; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (;;) { sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_congested = 0; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = priority; - shrink_caches(zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); - if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) { - ret = 1; - goto out; - } + shrink_caches(zones, &sc, lru_pages); total_scanned += sc.nr_scanned; total_reclaimed += sc.nr_reclaimed; - - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) { - wakeup_bdflush(laptop_mode ? 
0 : total_scanned); - sc.may_writepage = 1; + if (total_reclaimed >= SWAP_CLUSTER_MAX) { + ret = 1; + goto out; } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc.all_unreclaimable) + break; + if (sc.nr_congested * 10 > sc.nr_scanned) { + if (local) + break; blk_congestion_wait(WRITE, HZ/10); + } } - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY) && !local) out_of_memory(gfp_mask); out: - for (i = 0; zones[i] != 0; i++) - zones[i]->prev_priority = zones[i]->temp_priority; return ret; } @@ -972,8 +1139,8 @@ out: */ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) { + int all_zones_ok; int to_free = nr_pages; - int priority; int i; int total_scanned = 0, total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -981,92 +1148,62 @@ static int balance_pgdat(pg_data_t *pgda sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; + sc.preserve_active = 0; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } - - for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int all_zones_ok = 1; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + for (;;) { unsigned long lru_pages = 0; + int first_low = 0; + all_zones_ok = 1; - if (nr_pages == 0) { - /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (zone->all_unreclaimable && - priority != DEF_PRIORITY) - continue; + sc.nr_scanned = 0; + sc.nr_congested = 0; + sc.nr_reclaimed = 0; + sc.all_unreclaimable = 1; - if (zone->free_pages <= zone->pages_high) { - end_zone = i; - goto scan; - } - } - goto out; - } else { - end_zone = pgdat->nr_zones - 1; - } -scan: - for (i = 0; i <= end_zone; i++) { + for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - - lru_pages += zone->nr_active + zone->nr_inactive; + if (is_highmem(zone)) + continue; + lru_pages += zone->nr_active_mapped + + zone->nr_active_unmapped + + zone->nr_inactive; } - /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. 
- */ - for (i = 0; i <= end_zone; i++) { + /* Scan in the highmem->dma direction */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) - continue; - if (nr_pages == 0) { /* Not software suspend */ - if (zone->free_pages <= zone->pages_high) - all_zones_ok = 0; - } - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; + unsigned long pgfree = zone->free_pages; + unsigned long pghigh = zone->pages_high; + + /* + * This satisfies the "incremental min" or + * lower zone protection logic in the allocator + */ + if (first_low > i) + pghigh += zone->protection[first_low]; + if (pgfree >= pghigh) + continue; + if (first_low < i) + first_low = i; + + all_zones_ok = 0; + sc.nr_to_reclaim = pghigh - pgfree; + } else + sc.nr_to_reclaim = INT_MAX; /* Software susp */ + shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + shrink_slab(zone, sc.nr_scanned, GFP_KERNEL, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; - if (zone->all_unreclaimable) - continue; - if (zone->pages_scanned > zone->present_pages * 2) - zone->all_unreclaimable = 1; - /* - * If we've done a decent amount of scanning and - * the reclaim ratio is low, start doing writepage - * even in laptop mode - */ - if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) - sc.may_writepage = 1; } + total_reclaimed += sc.nr_reclaimed; + total_scanned += sc.nr_scanned; + if (nr_pages && to_free > total_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) @@ -1075,21 +1212,17 @@ scan: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) + if (sc.all_unreclaimable) + schedule_timeout(HZ/10); + else if (sc.nr_congested * 10 > sc.nr_scanned) blk_congestion_wait(WRITE, HZ/10); } -out: - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = zone->temp_priority; - } return total_reclaimed; } /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1213,7 +1346,6 @@ static int __init kswapd_init(void) for_each_pgdat(pgdat) pgdat->kswapd = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); - total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; } _
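
For illustration only, not part of the patch: a minimal out-of-tree user of the per-zone shrinker interface added in mm/vmscan.c might look like the sketch below. It assumes the declarations introduced here (struct zone_shrinker with its lru/nr fields, set_zone_shrinker(), get_zone_shrinker(), remove_zone_shrinker()) are visible through linux/mm.h, and that the callback prototype matches the way shrink_slab() invokes it, i.e. (*zs->shrinker)(zs, nr_to_scan, gfp_mask) returning the number of objects left on that zone's list, or -1 to abort. The "foo" names are invented, and a real cache would serialize access to the per-zone lists with its own lock, which is omitted here for brevity.

#include <linux/init.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo_object {
	struct list_head lru;		/* links the object into zs->lru */
	/* ... cached data ... */
};

static int foo_shrinker_idx;

/*
 * Called from shrink_slab(): with nr_to_scan == 0 it only reports the
 * pool size for this zone, otherwise it frees up to nr_to_scan objects
 * from the cold end of the zone-local LRU.
 */
static int foo_shrink(struct zone_shrinker *zs, int nr_to_scan,
			unsigned int gfp_mask)
{
	while (nr_to_scan-- > 0 && !list_empty(&zs->lru)) {
		struct foo_object *obj;

		obj = list_entry(zs->lru.prev, struct foo_object, lru);
		list_del(&obj->lru);
		zs->nr--;
		kfree(obj);
	}
	return zs->nr;
}

/* New objects go onto the LRU of the zone their memory came from. */
static void foo_cache_object(struct foo_object *obj)
{
	struct zone_shrinker *zs;

	zs = get_zone_shrinker(page_zone(virt_to_page(obj)),
				foo_shrinker_idx);
	list_add(&obj->lru, &zs->lru);
	zs->nr++;
}

static int __init foo_init(void)
{
	foo_shrinker_idx = set_zone_shrinker(foo_shrink, DEFAULT_SEEKS);
	return foo_shrinker_idx < 0 ? foo_shrinker_idx : 0;
}

static void __exit foo_exit(void)
{
	remove_zone_shrinker(foo_shrinker_idx);
}

module_init(foo_init);
module_exit(foo_exit);

Keeping the objects on per-zone lists is what lets the new shrink_slab() apply pressure to such a cache in proportion to the LRU size of the particular zone being reclaimed, rather than against one global pool.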
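As a second illustration, the fixed-point batching that shrink_zone() now uses for its scan counts can be written out on its own. This is a stand-alone mirror of the arithmetic only, with SWAP_CLUSTER_MAX taken as 32 as defined in this tree; the helper name and the *carry parameter (standing in for zone->nr_scan_*) are invented for the example.

#define SCAN_MASK		0x00000fff
#define SCAN_SHIFT		7
#define SWAP_CLUSTER_MAX	32

/*
 * Scan credit accumulates in *carry; the low SCAN_MASK bits are kept
 * for the next invocation, the rest is scaled down by SCAN_SHIFT and
 * paid out as whole SWAP_CLUSTER_MAX-page batches, each corresponding
 * to one shrink_active_list()/shrink_cache() call in shrink_zone().
 */
static int nr_scan_batches(unsigned long *carry, unsigned long scan)
{
	unsigned long count = *carry + scan;
	int batches = 0;

	*carry = count & SCAN_MASK;	/* fractional work, saved in the zone */
	count >>= SCAN_SHIFT;		/* scale the request down */
	while (count >= SWAP_CLUSTER_MAX) {
		count -= SWAP_CLUSTER_MAX;
		batches++;		/* one list-shrinking pass each */
	}
	return batches;
}

With these constants a list is not touched until roughly SWAP_CLUSTER_MAX << SCAN_SHIFT (4096) units of scan credit have built up, so lightly pressured lists are left alone rather than being nibbled at on every reclaim pass.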