--- linux-2.6-npiggin/fs/buffer.c | 2 linux-2.6-npiggin/fs/dcache.c | 118 ++ linux-2.6-npiggin/fs/dquot.c | 6 linux-2.6-npiggin/fs/exec.c | 3 linux-2.6-npiggin/fs/fs-writeback.c | 7 linux-2.6-npiggin/fs/hfs/inode.c | 23 linux-2.6-npiggin/fs/hfsplus/inode.c | 23 linux-2.6-npiggin/fs/inode.c | 92 +- linux-2.6-npiggin/fs/proc/array.c | 5 linux-2.6-npiggin/include/asm-arm/system.h | 30 linux-2.6-npiggin/include/asm-ia64/system.h | 10 linux-2.6-npiggin/include/asm-mips/system.h | 10 linux-2.6-npiggin/include/asm-s390/system.h | 5 linux-2.6-npiggin/include/asm-sparc/system.h | 4 linux-2.6-npiggin/include/asm-sparc64/system.h | 14 linux-2.6-npiggin/include/linux/dcache.h | 3 linux-2.6-npiggin/include/linux/fs.h | 5 linux-2.6-npiggin/include/linux/gfp.h | 1 linux-2.6-npiggin/include/linux/init_task.h | 6 linux-2.6-npiggin/include/linux/mm.h | 19 linux-2.6-npiggin/include/linux/mm_inline.h | 34 linux-2.6-npiggin/include/linux/mmzone.h | 33 linux-2.6-npiggin/include/linux/page-flags.h | 58 - linux-2.6-npiggin/include/linux/rmap.h | 4 linux-2.6-npiggin/include/linux/sched.h | 21 linux-2.6-npiggin/include/linux/swap.h | 13 linux-2.6-npiggin/include/linux/sysctl.h | 2 linux-2.6-npiggin/include/linux/writeback.h | 2 linux-2.6-npiggin/kernel/sched.c | 1063 +++++++++---------------- linux-2.6-npiggin/kernel/sysctl.c | 43 - linux-2.6-npiggin/mm/filemap.c | 6 linux-2.6-npiggin/mm/hugetlb.c | 9 linux-2.6-npiggin/mm/memory.c | 7 linux-2.6-npiggin/mm/oom_kill.c | 7 linux-2.6-npiggin/mm/page-writeback.c | 3 linux-2.6-npiggin/mm/page_alloc.c | 101 +- linux-2.6-npiggin/mm/rmap.c | 49 - linux-2.6-npiggin/mm/shmem.c | 6 linux-2.6-npiggin/mm/swap.c | 89 -- linux-2.6-npiggin/mm/swap_state.c | 3 linux-2.6-npiggin/mm/swapfile.c | 6 linux-2.6-npiggin/mm/vmscan.c | 712 +++++++++------- 42 files changed, 1312 insertions(+), 1345 deletions(-) diff -puN fs/buffer.c~rollup fs/buffer.c --- linux-2.6/fs/buffer.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/buffer.c 2004-08-20 18:15:23.000000000 +1000 @@ -594,7 +594,7 @@ static void free_more_memory(void) for_each_pgdat(pgdat) { zones = pgdat->node_zonelists[GFP_NOFS&GFP_ZONEMASK].zones; if (*zones) - try_to_free_pages(zones, GFP_NOFS, 0); + try_to_free_pages(zones, GFP_NOFS, 0, 0); } } diff -puN fs/dcache.c~rollup fs/dcache.c --- linux-2.6/fs/dcache.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/dcache.c 2004-08-20 18:15:23.000000000 +1000 @@ -34,7 +34,7 @@ /* #define DCACHE_DEBUG 1 */ -int sysctl_vfs_cache_pressure = 100; +int sysctl_vfs_cache_cost = 16; spinlock_t dcache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; @@ -60,6 +60,7 @@ static unsigned int d_hash_mask; static unsigned int d_hash_shift; static struct hlist_head *dentry_hashtable; static LIST_HEAD(dentry_unused); +static int zone_shrinker; /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { @@ -86,6 +87,22 @@ static void d_free(struct dentry *dentry call_rcu(&dentry->d_rcu, d_callback); } +static void dentry_add_lru(struct dentry *dentry) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker); + list_add(&dentry->d_lru, &zs->lru); + zs->nr++; +} + +static void dentry_del_lru(struct dentry *dentry) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker); + list_del(&dentry->d_lru); + zs->nr--; +} + /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. 
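The hunks above replace the single global unused-dentry accounting with a per-zone scheme: every dentry is charged to the zone its memory lives in via get_zone_shrinker(page_zone(virt_to_page(dentry))), and the per-zone count later feeds shrink_dcache_memory()'s "zs->nr / sysctl_vfs_cache_cost" estimate of how much is reclaimable. Below is a minimal standalone sketch of that bookkeeping, not patch code: the list helpers are cut-down stand-ins for <linux/list.h>, and the zone_shrinker is passed in directly instead of being looked up from the dentry's page.

/*
 * Minimal userspace sketch of the per-zone dentry accounting introduced
 * above.  Simplifications: tiny list helpers instead of <linux/list.h>,
 * and the zone_shrinker is a parameter rather than the result of
 * get_zone_shrinker(page_zone(virt_to_page(dentry)), zone_shrinker).
 */
#include <stdio.h>

struct list_head {
	struct list_head *prev, *next;
};

static void INIT_LIST_HEAD(struct list_head *head)
{
	head->prev = head->next = head;
}

static void list_add(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

/* the two fields of struct zone_shrinker that the add/del paths touch */
struct zone_shrinker {
	struct list_head lru;		/* this zone's LRU of dentries */
	unsigned long nr;		/* how many are on it */
};

struct dentry {
	struct list_head d_lru;
};

static int sysctl_vfs_cache_cost = 16;	/* replaces sysctl_vfs_cache_pressure */

static void dentry_add_lru(struct zone_shrinker *zs, struct dentry *dentry)
{
	list_add(&dentry->d_lru, &zs->lru);
	zs->nr++;
}

static void dentry_del_lru(struct zone_shrinker *zs, struct dentry *dentry)
{
	list_del(&dentry->d_lru);
	zs->nr--;
}

/* what shrink_dcache_memory() reports as reclaimable for this zone */
static long shrink_report(struct zone_shrinker *zs)
{
	return zs->nr / sysctl_vfs_cache_cost;
}

int main(void)
{
	struct zone_shrinker zs = { .nr = 0 };
	struct dentry d[32];
	int i;

	INIT_LIST_HEAD(&zs.lru);
	for (i = 0; i < 32; i++)
		dentry_add_lru(&zs, &d[i]);	/* d_alloc() side */
	dentry_del_lru(&zs, &d[0]);		/* dput()/kill_it side */
	printf("%lu dentries on this zone's LRU, %ld reported reclaimable\n",
			zs.nr, shrink_report(&zs));
	return 0;
}

With 31 dentries left on the zone's LRU and the new default cost of 16 this prints one reclaimable unit, which is the granularity the zone shrinker hands back to the scanner.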
@@ -155,7 +172,7 @@ repeat: spin_unlock(&dcache_lock); return; } - + /* * AV: ->d_delete() is _NOT_ allowed to block now. */ @@ -166,9 +183,9 @@ repeat: /* Unreachable? Get rid of it */ if (d_unhashed(dentry)) goto kill_it; - if (list_empty(&dentry->d_lru)) { - dentry->d_flags |= DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); + dentry->d_flags |= DCACHE_REFERENCED; + if (list_empty(&dentry->d_unused)) { + list_add(&dentry->d_unused, &dentry_unused); dentry_stat.nr_unused++; } spin_unlock(&dentry->d_lock); @@ -181,11 +198,12 @@ unhash_it: kill_it: { struct dentry *parent; - /* If dentry was on d_lru list + /* If dentry was on d_unused list * delete it from there */ - if (!list_empty(&dentry->d_lru)) { - list_del(&dentry->d_lru); + dentry_del_lru(dentry); + if (!list_empty(&dentry->d_unused)) { + list_del(&dentry->d_unused); dentry_stat.nr_unused--; } list_del(&dentry->d_child); @@ -263,9 +281,9 @@ int d_invalidate(struct dentry * dentry) static inline struct dentry * __dget_locked(struct dentry *dentry) { atomic_inc(&dentry->d_count); - if (!list_empty(&dentry->d_lru)) { + if (!list_empty(&dentry->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&dentry->d_lru); + list_del_init(&dentry->d_unused); } return dentry; } @@ -350,6 +368,7 @@ static inline void prune_one_dentry(stru { struct dentry * parent; + dentry_del_lru(dentry); __d_drop(dentry); list_del(&dentry->d_child); dentry_stat.nr_dentry--; /* For d_free, below */ @@ -392,7 +411,39 @@ static void prune_dcache(int count) list_del_init(tmp); prefetch(dentry_unused.prev); dentry_stat.nr_unused--; + dentry = list_entry(tmp, struct dentry, d_unused); + + spin_lock(&dentry->d_lock); + /* + * We found an inuse dentry which was not removed from + * dentry_unused because of laziness during lookup. Do not free + * it - just keep it off the dentry_unused list. + */ + if (atomic_read(&dentry->d_count)) { + spin_unlock(&dentry->d_lock); + continue; + } + if (dentry->d_flags & DCACHE_REFERENCED) + dentry->d_flags &= ~DCACHE_REFERENCED; + prune_one_dentry(dentry); + } + spin_unlock(&dcache_lock); +} + +static void prune_dcache_lru(struct list_head *list, unsigned long count) +{ + spin_lock(&dcache_lock); + for (; count ; count--) { + struct dentry *dentry; + struct list_head *tmp; + + tmp = list->prev; + if (tmp == list) + break; + list_del(tmp); + prefetch(list->prev); dentry = list_entry(tmp, struct dentry, d_lru); + list_add(&dentry->d_lru, list); spin_lock(&dentry->d_lock); /* @@ -401,22 +452,27 @@ static void prune_dcache(int count) * it - just keep it off the dentry_unused list. */ if (atomic_read(&dentry->d_count)) { + if (!list_empty(&dentry->d_unused)) { + list_del_init(&dentry->d_unused); + dentry_stat.nr_unused--; + } spin_unlock(&dentry->d_lock); continue; } /* If the dentry was recently referenced, don't free it. */ if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; - list_add(&dentry->d_lru, &dentry_unused); - dentry_stat.nr_unused++; spin_unlock(&dentry->d_lock); continue; } + list_del_init(&dentry->d_unused); + dentry_stat.nr_unused--; prune_one_dentry(dentry); } spin_unlock(&dcache_lock); } + /* * Shrink the dcache for the specified super block. 
* This allows us to unmount a device without disturbing @@ -453,7 +509,7 @@ void shrink_dcache_sb(struct super_block while (next != &dentry_unused) { tmp = next; next = tmp->next; - dentry = list_entry(tmp, struct dentry, d_lru); + dentry = list_entry(tmp, struct dentry, d_unused); if (dentry->d_sb != sb) continue; list_del(tmp); @@ -468,7 +524,7 @@ repeat: while (next != &dentry_unused) { tmp = next; next = tmp->next; - dentry = list_entry(tmp, struct dentry, d_lru); + dentry = list_entry(tmp, struct dentry, d_unused); if (dentry->d_sb != sb) continue; dentry_stat.nr_unused--; @@ -558,16 +614,16 @@ resume: struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; - if (!list_empty(&dentry->d_lru)) { + if (!list_empty(&dentry->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&dentry->d_lru); + list_del_init(&dentry->d_unused); } /* * move only zero ref count dentries to the end * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add(&dentry->d_unused, dentry_unused.prev); dentry_stat.nr_unused++; found++; } @@ -633,9 +689,9 @@ void shrink_dcache_anon(struct hlist_hea spin_lock(&dcache_lock); hlist_for_each(lp, head) { struct dentry *this = hlist_entry(lp, struct dentry, d_hash); - if (!list_empty(&this->d_lru)) { + if (!list_empty(&this->d_unused)) { dentry_stat.nr_unused--; - list_del_init(&this->d_lru); + list_del_init(&this->d_unused); } /* @@ -643,7 +699,7 @@ void shrink_dcache_anon(struct hlist_hea * of the unused list for prune_dcache */ if (!atomic_read(&this->d_count)) { - list_add_tail(&this->d_lru, &dentry_unused); + list_add_tail(&this->d_unused, &dentry_unused); dentry_stat.nr_unused++; found++; } @@ -665,14 +721,16 @@ void shrink_dcache_anon(struct hlist_hea * * In this case we return -1 to tell the caller that we baled. */ -static int shrink_dcache_memory(int nr, unsigned int gfp_mask) +static long shrink_dcache_memory(struct zone_shrinker *zs, + unsigned long nr, + unsigned int gfp_mask) { if (nr) { if (!(gfp_mask & __GFP_FS)) return -1; - prune_dcache(nr); + prune_dcache_lru(&zs->lru, nr); } - return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return zs->nr / sysctl_vfs_cache_cost; } /** @@ -702,7 +760,7 @@ struct dentry *d_alloc(struct dentry * p } } else { dname = dentry->d_iname; - } + } dentry->d_name.name = dname; dentry->d_name.len = name->len; @@ -723,6 +781,7 @@ struct dentry *d_alloc(struct dentry * p dentry->d_bucket = NULL; INIT_HLIST_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); + INIT_LIST_HEAD(&dentry->d_unused); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_LIST_HEAD(&dentry->d_alias); @@ -734,6 +793,7 @@ struct dentry *d_alloc(struct dentry * p } spin_lock(&dcache_lock); + dentry_add_lru(dentry); if (parent) list_add(&dentry->d_child, &parent->d_subdirs); dentry_stat.nr_dentry++; @@ -838,7 +898,7 @@ struct dentry * d_alloc_anon(struct inod return NULL; tmp->d_parent = tmp; /* make sure dput doesn't croak */ - + spin_lock(&dcache_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { /* A directory can only have one dentry. 
@@ -976,7 +1036,7 @@ struct dentry * __d_lookup(struct dentry struct hlist_node *node; rcu_read_lock(); - + hlist_for_each_rcu(node, head) { struct dentry *dentry; struct qstr *qstr; @@ -1597,8 +1657,10 @@ static void __init dcache_init(unsigned 0, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC, NULL, NULL); - - set_shrinker(DEFAULT_SEEKS, shrink_dcache_memory); + + zone_shrinker = set_zone_shrinker(shrink_dcache_memory, DEFAULT_SEEKS); + if (zone_shrinker < 0) + BUG(); } /* SLAB cache for __getname() consumers */ diff -puN fs/dquot.c~rollup fs/dquot.c --- linux-2.6/fs/dquot.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/dquot.c 2004-08-20 18:15:23.000000000 +1000 @@ -115,7 +115,7 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is following: - * i_sem > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > + * i_sem > dqonoff_sem > iprune_rwsem > journal_lock > dqptr_sem > * > dquot->dq_lock > dqio_sem * i_sem on quota files is special (it's below dqio_sem) */ @@ -734,11 +734,11 @@ static void drop_dquot_ref(struct super_ /* We need to be guarded against prune_icache to reach all the * inodes - otherwise some can be on the local list of prune_icache */ - down(&iprune_sem); + down_write(&iprune_rwsem); down_write(&sb_dqopt(sb)->dqptr_sem); remove_dquot_ref(sb, type, &tofree_head); up_write(&sb_dqopt(sb)->dqptr_sem); - up(&iprune_sem); + up_write(&iprune_rwsem); put_dquot_list(&tofree_head); } diff -puN fs/exec.c~rollup fs/exec.c --- linux-2.6/fs/exec.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/exec.c 2004-08-20 18:15:22.000000000 +1000 @@ -321,7 +321,8 @@ void install_arg_page(struct vm_area_str goto out; } mm->rss++; - lru_cache_add_active(page); + lru_cache_add(page); + mark_page_accessed(page); set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte( page, vma->vm_page_prot)))); page_add_anon_rmap(page, vma, address); diff -puN fs/fs-writeback.c~rollup fs/fs-writeback.c --- linux-2.6/fs/fs-writeback.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/fs-writeback.c 2004-08-20 18:15:23.000000000 +1000 @@ -225,8 +225,7 @@ __sync_single_inode(struct inode *inode, /* * The inode is clean, unused */ - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + inode_add_unused(inode); } } wake_up_inode(inode); @@ -457,9 +456,7 @@ void sync_inodes_sb(struct super_block * unsigned long nr_dirty = read_page_state(nr_dirty); unsigned long nr_unstable = read_page_state(nr_unstable); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; + wbc.nr_to_write = nr_dirty + nr_unstable + inodes_stat.nr_inodes; wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ spin_lock(&inode_lock); sync_sb_inodes(sb, &wbc); diff -puN fs/hfs/inode.c~rollup fs/hfs/inode.c --- linux-2.6/fs/hfs/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/hfs/inode.c 2004-08-20 18:15:22.000000000 +1000 @@ -67,19 +67,20 @@ int hfs_releasepage(struct page *page, i nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); - if (!node) - ; - else if (atomic_read(&node->refcnt)) - res = 0; - else for (i = 0; i < tree->pages_per_bnode; i++) { - if (PageActive(node->page[i])) { + if (node) { + if (atomic_read(&node->refcnt)) res = 0; - break; + else for (i = 0; i < tree->pages_per_bnode; i++) { + if (PageActiveMapped(node->page[i]) || + 
PageActiveUnmapped(node->page[i])) { + res = 0; + break; + } + } + if (res) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); } - } - if (res && node) { - hfs_bnode_unhash(node); - hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { diff -puN fs/hfsplus/inode.c~rollup fs/hfsplus/inode.c --- linux-2.6/fs/hfsplus/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/hfsplus/inode.c 2004-08-20 18:15:22.000000000 +1000 @@ -67,19 +67,20 @@ int hfsplus_releasepage(struct page *pag nidx = page->index >> (tree->node_size_shift - PAGE_CACHE_SHIFT); spin_lock(&tree->hash_lock); node = hfs_bnode_findhash(tree, nidx); - if (!node) - ; - else if (atomic_read(&node->refcnt)) - res = 0; - else for (i = 0; i < tree->pages_per_bnode; i++) { - if (PageActive(node->page[i])) { + if (node) { + if (atomic_read(&node->refcnt)) res = 0; - break; + else for (i = 0; i < tree->pages_per_bnode; i++) { + if (PageActiveMapped(node->page[i]) || + PageActiveUnmapped(node->page[i])) { + res = 0; + break; + } + } + if (res) { + hfs_bnode_unhash(node); + hfs_bnode_free(node); } - } - if (res && node) { - hfs_bnode_unhash(node); - hfs_bnode_free(node); } spin_unlock(&tree->hash_lock); } else { diff -puN fs/inode.c~rollup fs/inode.c --- linux-2.6/fs/inode.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/fs/inode.c 2004-08-20 18:15:23.000000000 +1000 @@ -69,9 +69,8 @@ static unsigned int i_hash_shift; * A "dirty" list is maintained for each super block, * allowing for low-overhead inode sync() operations. */ - +static int zone_shrinker; LIST_HEAD(inode_in_use); -LIST_HEAD(inode_unused); static struct hlist_head *inode_hashtable; /* @@ -91,7 +90,7 @@ EXPORT_SYMBOL(inode_lock); * from its final dispose_list, the struct super_block they refer to * (for inode->i_sb->s_op) may already have been freed and reused. */ -DECLARE_MUTEX(iprune_sem); +DECLARE_RWSEM(iprune_rwsem); /* * Statistics gathering.. 
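Note the lock conversion just above: iprune_sem becomes iprune_rwsem, so the pruning path (prune_icache_lru(), further down in this file's diff) only needs the lock shared and several zones can be pruned concurrently, while invalidate_inodes() and drop_dquot_ref() keep full exclusion with down_write(). A small userspace model of that pattern follows; pthread rwlocks stand in for the kernel rwsem and the function names are borrowed purely for illustration (build with -lpthread).

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t iprune_rwsem = PTHREAD_RWLOCK_INITIALIZER;

/* several of these may run at once, one per zone being shrunk */
static void *prune_icache_lru(void *zone)
{
	pthread_rwlock_rdlock(&iprune_rwsem);	/* was: down(&iprune_sem) */
	printf("pruning zone %ld\n", (long)zone);
	pthread_rwlock_unlock(&iprune_rwsem);
	return NULL;
}

/* still fully serialised against all pruners */
static void invalidate_inodes(void)
{
	pthread_rwlock_wrlock(&iprune_rwsem);	/* was: down(&iprune_sem) */
	printf("invalidate_inodes runs alone\n");
	pthread_rwlock_unlock(&iprune_rwsem);
}

int main(void)
{
	pthread_t t[2];
	long i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, prune_icache_lru, (void *)i);
	for (i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	invalidate_inodes();
	return 0;
}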
@@ -220,6 +219,24 @@ static void init_once(void * foo, kmem_c inode_init_once(inode); } +void inode_add_unused(struct inode *inode) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(inode)), zone_shrinker); + list_add(&inode->i_list, &zs->lru); + zs->nr++; +} + +static void inode_del_unused(struct inode *inode) +{ + struct zone_shrinker *zs; + zs = get_zone_shrinker(page_zone(virt_to_page(inode)), zone_shrinker); + list_del_init(&inode->i_list); + BUG_ON(zs->nr == 0); + zs->nr--; +} + + /* * inode_lock must be held */ @@ -230,9 +247,10 @@ void __iget(struct inode * inode) return; } atomic_inc(&inode->i_count); - if (!(inode->i_state & (I_DIRTY|I_LOCK))) - list_move(&inode->i_list, &inode_in_use); - inodes_stat.nr_unused--; + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + inode_del_unused(inode); + list_add(&inode->i_list, &inode_in_use); + } } EXPORT_SYMBOL(__iget); @@ -302,7 +320,7 @@ static void dispose_list(struct list_hea static int invalidate_list(struct list_head *head, struct list_head *dispose) { struct list_head *next; - int busy = 0, count = 0; + int busy = 0; next = head->next; for (;;) { @@ -317,15 +335,13 @@ static int invalidate_list(struct list_h if (!atomic_read(&inode->i_count)) { hlist_del_init(&inode->i_hash); list_del(&inode->i_sb_list); - list_move(&inode->i_list, dispose); + inode_del_unused(inode); + list_add(&inode->i_list, dispose); inode->i_state |= I_FREEING; - count++; continue; } busy = 1; } - /* only unused inodes may be cached with i_count zero */ - inodes_stat.nr_unused -= count; return busy; } @@ -350,13 +366,13 @@ int invalidate_inodes(struct super_block int busy; LIST_HEAD(throw_away); - down(&iprune_sem); + down_write(&iprune_rwsem); spin_lock(&inode_lock); busy = invalidate_list(&sb->s_inodes, &throw_away); spin_unlock(&inode_lock); dispose_list(&throw_away); - up(&iprune_sem); + up_write(&iprune_rwsem); return busy; } @@ -416,25 +432,26 @@ static int can_unuse(struct inode *inode * If the inode has metadata buffers attached to mapping->private_list then * try to remove them. 
*/ -static void prune_icache(int nr_to_scan) +static void prune_icache_lru(struct list_head *list, unsigned long nr_to_scan) { LIST_HEAD(freeable); - int nr_pruned = 0; int nr_scanned; unsigned long reap = 0; - down(&iprune_sem); + down_read(&iprune_rwsem); spin_lock(&inode_lock); for (nr_scanned = 0; nr_scanned < nr_to_scan; nr_scanned++) { struct inode *inode; + struct list_head *tmp; - if (list_empty(&inode_unused)) + tmp = list->prev; + if (tmp == list) break; - - inode = list_entry(inode_unused.prev, struct inode, i_list); + prefetch(tmp->prev); + inode = list_entry(tmp, struct inode, i_list); if (inode->i_state || atomic_read(&inode->i_count)) { - list_move(&inode->i_list, &inode_unused); + list_move(&inode->i_list, list); continue; } if (inode_has_buffers(inode) || inode->i_data.nrpages) { @@ -445,7 +462,7 @@ static void prune_icache(int nr_to_scan) iput(inode); spin_lock(&inode_lock); - if (inode != list_entry(inode_unused.next, + if (inode != list_entry(list->prev, struct inode, i_list)) continue; /* wrong inode or list_empty */ if (!can_unuse(inode)) @@ -453,15 +470,14 @@ static void prune_icache(int nr_to_scan) } hlist_del_init(&inode->i_hash); list_del_init(&inode->i_sb_list); - list_move(&inode->i_list, &freeable); + inode_del_unused(inode); + list_add(&inode->i_list, &freeable); inode->i_state |= I_FREEING; - nr_pruned++; } - inodes_stat.nr_unused -= nr_pruned; spin_unlock(&inode_lock); dispose_list(&freeable); - up(&iprune_sem); + up_read(&iprune_rwsem); if (current_is_kswapd()) mod_page_state(kswapd_inodesteal, reap); @@ -478,7 +494,9 @@ static void prune_icache(int nr_to_scan) * This function is passed the number of inodes to scan, and it returns the * total number of remaining possibly-reclaimable inodes. */ -static int shrink_icache_memory(int nr, unsigned int gfp_mask) +static long shrink_icache_memory(struct zone_shrinker *zs, + unsigned long nr, + unsigned int gfp_mask) { if (nr) { /* @@ -488,9 +506,9 @@ static int shrink_icache_memory(int nr, */ if (!(gfp_mask & __GFP_FS)) return -1; - prune_icache(nr); + prune_icache_lru(&zs->lru, nr); } - return (inodes_stat.nr_unused / 100) * sysctl_vfs_cache_pressure; + return zs->nr / sysctl_vfs_cache_cost; } static void __wait_on_freeing_inode(struct inode *inode); @@ -1033,18 +1051,20 @@ void generic_forget_inode(struct inode * struct super_block *sb = inode->i_sb; if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_LOCK))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; + if (!(inode->i_state & (I_DIRTY|I_LOCK))) { + list_del(&inode->i_list); + inode_add_unused(inode); + } + spin_unlock(&inode_lock); if (!sb || (sb->s_flags & MS_ACTIVE)) return; write_inode_now(inode, 1); spin_lock(&inode_lock); - inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); - } - list_del_init(&inode->i_list); + inode_del_unused(inode); + } else + list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); inode->i_state|=I_FREEING; inodes_stat.nr_inodes--; @@ -1369,7 +1389,9 @@ void __init inode_init(unsigned long mem /* inode slab cache */ inode_cachep = kmem_cache_create("inode_cache", sizeof(struct inode), 0, SLAB_PANIC, init_once, NULL); - set_shrinker(DEFAULT_SEEKS, shrink_icache_memory); + zone_shrinker = set_zone_shrinker(shrink_icache_memory, DEFAULT_SEEKS); + if (zone_shrinker < 0) + BUG(); } void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) diff -puN fs/proc/array.c~rollup fs/proc/array.c --- linux-2.6/fs/proc/array.c~rollup 2004-08-20 18:15:19.000000000 +1000 +++ 
linux-2.6-npiggin/fs/proc/array.c 2004-08-20 18:15:23.000000000 +1000 @@ -159,7 +159,8 @@ static inline char * task_state(struct t read_lock(&tasklist_lock); buffer += sprintf(buffer, "State:\t%s\n" - "SleepAVG:\t%lu%%\n" + "sleep_time:\t%lu\n" + "total_time:\t%lu\n" "Tgid:\t%d\n" "Pid:\t%d\n" "PPid:\t%d\n" @@ -167,7 +168,7 @@ static inline char * task_state(struct t "Uid:\t%d\t%d\t%d\t%d\n" "Gid:\t%d\t%d\t%d\t%d\n", get_task_state(p), - (p->sleep_avg/1024)*100/(1020000000/1024), + p->sleep_time, p->total_time, p->tgid, p->pid, p->pid ? p->real_parent->pid : 0, p->pid && p->ptrace ? p->parent->pid : 0, diff -puN include/asm-arm/system.h~rollup include/asm-arm/system.h --- linux-2.6/include/asm-arm/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-arm/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -137,34 +137,12 @@ extern unsigned int user_debug; #define set_wmb(var, value) do { var = value; wmb(); } while (0) #define nop() __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t"); -#ifdef CONFIG_SMP /* - * Define our own context switch locking. This allows us to enable - * interrupts over the context switch, otherwise we end up with high - * interrupt latency. The real problem area is switch_mm() which may - * do a full cache flush. + * switch_mm() may do a full cache flush over the context switch, + * so enable interrupts over the context switch to avoid high + * latency. */ -#define prepare_arch_switch(rq,next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock_irq(&(rq)->lock); \ -} while (0) - -#define finish_arch_switch(rq,prev) \ - spin_unlock(&(prev)->switch_lock) - -#define task_running(rq,p) \ - ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) -#else -/* - * Our UP-case is more simple, but we assume knowledge of how - * spin_unlock_irq() and friends are implemented. This avoids - * us needlessly decrementing and incrementing the preempt count. - */ -#define prepare_arch_switch(rq,next) local_irq_enable() -#define finish_arch_switch(rq,prev) spin_unlock(&(rq)->lock) -#define task_running(rq,p) ((rq)->curr == (p)) -#endif +#define __ARCH_WANT_INTERRUPTS_ON_CTXSW /* * switch_to(prev, next) should switch from task `prev' to `next' diff -puN include/asm-ia64/system.h~rollup include/asm-ia64/system.h --- linux-2.6/include/asm-ia64/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-ia64/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -183,8 +183,6 @@ do { \ #ifdef __KERNEL__ -#define prepare_to_switch() do { } while(0) - #ifdef CONFIG_IA32_SUPPORT # define IS_IA32_PROCESS(regs) (ia64_psr(regs)->is != 0) #else @@ -274,13 +272,7 @@ extern void ia64_load_extra (struct task * of that CPU which will not be released, because there we wait for the * tasklist_lock to become available. 
*/ -#define prepare_arch_switch(rq, next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ -} while (0) -#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) -#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) +#define __ARCH_WANT_UNLOCKED_CTXSW #define ia64_platform_is(x) (strcmp(x, platform_name) == 0) diff -puN include/asm-mips/system.h~rollup include/asm-mips/system.h --- linux-2.6/include/asm-mips/system.h~rollup 2004-08-20 18:15:19.000000000 +1000 +++ linux-2.6-npiggin/include/asm-mips/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -488,15 +488,9 @@ static __inline__ int con_is_present(voi } /* - * Taken from include/asm-ia64/system.h; prevents deadlock on SMP + * See include/asm-ia64/system.h; prevents deadlock on SMP * systems. */ -#define prepare_arch_switch(rq, next) \ -do { \ - spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ -} while (0) -#define finish_arch_switch(rq, prev) spin_unlock_irq(&(prev)->switch_lock) -#define task_running(rq, p) ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) +#define __ARCH_WANT_UNLOCKED_CTXSW #endif /* _ASM_SYSTEM_H */ diff -puN include/asm-s390/system.h~rollup include/asm-s390/system.h --- linux-2.6/include/asm-s390/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-s390/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -103,11 +103,8 @@ static inline void restore_access_regs(u prev = __switch_to(prev,next); \ } while (0) -#define prepare_arch_switch(rq, next) do { } while(0) -#define task_running(rq, p) ((rq)->curr == (p)) -#define finish_arch_switch(rq, prev) do { \ +#define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ - spin_unlock_irq(&(rq)->lock); \ } while (0) #define nop() __asm__ __volatile__ ("nop") diff -puN include/asm-sparc/system.h~rollup include/asm-sparc/system.h --- linux-2.6/include/asm-sparc/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-sparc/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -101,7 +101,7 @@ extern void fpsave(unsigned long *fpregs * SWITCH_ENTER and SWITH_DO_LAZY_FPU do not work yet (e.g. SMP does not work) * XXX WTF is the above comment? Found in late teen 2.4.x. */ -#define prepare_arch_switch(rq, next) do { \ +#define prepare_arch_switch(next) do { \ __asm__ __volatile__( \ ".globl\tflush_patch_switch\nflush_patch_switch:\n\t" \ "save %sp, -0x40, %sp; save %sp, -0x40, %sp; save %sp, -0x40, %sp\n\t" \ @@ -109,8 +109,6 @@ extern void fpsave(unsigned long *fpregs "save %sp, -0x40, %sp\n\t" \ "restore; restore; restore; restore; restore; restore; restore"); \ } while(0) -#define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -#define task_running(rq, p) ((rq)->curr == (p)) /* Much care has gone into this code, do not touch it. 
* diff -puN include/asm-sparc64/system.h~rollup include/asm-sparc64/system.h --- linux-2.6/include/asm-sparc64/system.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/asm-sparc64/system.h 2004-08-20 18:15:23.000000000 +1000 @@ -139,19 +139,13 @@ extern void __flushw_user(void); #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_arch_switch(rq, next) \ -do { spin_lock(&(next)->switch_lock); \ - spin_unlock(&(rq)->lock); \ +/* Don't hold the runqueue lock over context switch */ +#define __ARCH_WANT_UNLOCKED_CTXSW +#define prepare_arch_switch(next) \ +do { \ flushw_all(); \ } while (0) -#define finish_arch_switch(rq, prev) \ -do { spin_unlock_irq(&(prev)->switch_lock); \ -} while (0) - -#define task_running(rq, p) \ - ((rq)->curr == (p) || spin_is_locked(&(p)->switch_lock)) - /* See what happens when you design the chip correctly? * * We tell gcc we clobber all non-fixed-usage registers except diff -puN include/linux/dcache.h~rollup include/linux/dcache.h --- linux-2.6/include/linux/dcache.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/dcache.h 2004-08-20 18:15:23.000000000 +1000 @@ -95,6 +95,7 @@ struct dentry { struct qstr d_name; struct list_head d_lru; /* LRU list */ + struct list_head d_unused; /* unused list */ struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ @@ -313,7 +314,7 @@ static inline int d_mountpoint(struct de extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *); extern struct dentry *lookup_create(struct nameidata *nd, int is_dir); -extern int sysctl_vfs_cache_pressure; +extern int sysctl_vfs_cache_cost; #endif /* __KERNEL__ */ diff -puN include/linux/fs.h~rollup include/linux/fs.h --- linux-2.6/include/linux/fs.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/fs.h 2004-08-20 18:15:23.000000000 +1000 @@ -56,8 +56,7 @@ extern struct files_stat_struct files_st struct inodes_stat_t { int nr_inodes; - int nr_unused; - int dummy[5]; + int dummy[6]; }; extern struct inodes_stat_t inodes_stat; @@ -1477,7 +1476,7 @@ extern void destroy_inode(struct inode * extern struct inode *new_inode(struct super_block *); extern int remove_suid(struct dentry *); extern void remove_dquot_ref(struct super_block *, int, struct list_head *); -extern struct semaphore iprune_sem; +extern struct rw_semaphore iprune_rwsem; extern void __insert_inode_hash(struct inode *, unsigned long hashval); extern void remove_inode_hash(struct inode *); diff -puN include/linux/gfp.h~rollup include/linux/gfp.h --- linux-2.6/include/linux/gfp.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/gfp.h 2004-08-20 18:15:23.000000000 +1000 @@ -6,6 +6,7 @@ #include #include +extern int vm_free_local_harder; struct vm_area_struct; /* diff -puN include/linux/init_task.h~rollup include/linux/init_task.h --- linux-2.6/include/linux/init_task.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/init_task.h 2004-08-20 18:15:23.000000000 +1000 @@ -73,14 +73,13 @@ extern struct group_info init_groups; .usage = ATOMIC_INIT(2), \ .flags = 0, \ .lock_depth = -1, \ - .prio = MAX_PRIO-20, \ - .static_prio = MAX_PRIO-20, \ + .prio = MAX_PRIO-29, \ + .static_prio = MAX_PRIO-29, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ .mm = NULL, \ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ - .time_slice = 
HZ, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ @@ -112,7 +111,6 @@ extern struct group_info init_groups; .blocked = {{0}}, \ .alloc_lock = SPIN_LOCK_UNLOCKED, \ .proc_lock = SPIN_LOCK_UNLOCKED, \ - .switch_lock = SPIN_LOCK_UNLOCKED, \ .journal_info = NULL, \ .private_pages = LIST_HEAD_INIT(tsk.private_pages), \ .private_pages_count = 0, \ diff -puN include/linux/mm.h~rollup include/linux/mm.h --- linux-2.6/include/linux/mm.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mm.h 2004-08-20 18:15:23.000000000 +1000 @@ -615,6 +615,25 @@ struct shrinker; extern struct shrinker *set_shrinker(int, shrinker_t); extern void remove_shrinker(struct shrinker *shrinker); +struct zone_shrinker; +typedef long (*zone_shrinker_fn)(struct zone_shrinker *zs, + unsigned long nr_to_scan, + unsigned int gfp_mask); +struct zone_shrinker { + struct list_head lru; + unsigned long nr; + zone_shrinker_fn shrinker; + unsigned long nr_scan; + int seeks; + + int idx; + struct list_head list; +}; + +int set_zone_shrinker(zone_shrinker_fn, int); +struct zone_shrinker *get_zone_shrinker(struct zone *, int); +void remove_zone_shrinker(int); + /* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all diff -puN include/linux/mm_inline.h~rollup include/linux/mm_inline.h --- linux-2.6/include/linux/mm_inline.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mm_inline.h 2004-08-20 18:15:22.000000000 +1000 @@ -1,9 +1,16 @@ static inline void -add_page_to_active_list(struct zone *zone, struct page *page) +add_page_to_active_mapped_list(struct zone *zone, struct page *page) { - list_add(&page->lru, &zone->active_list); - zone->nr_active++; + list_add(&page->lru, &zone->active_mapped_list); + zone->nr_active_mapped++; +} + +static inline void +add_page_to_active_unmapped_list(struct zone *zone, struct page *page) +{ + list_add(&page->lru, &zone->active_unmapped_list); + zone->nr_active_unmapped++; } static inline void @@ -14,10 +21,17 @@ add_page_to_inactive_list(struct zone *z } static inline void -del_page_from_active_list(struct zone *zone, struct page *page) +del_page_from_active_mapped_list(struct zone *zone, struct page *page) +{ + list_del(&page->lru); + zone->nr_active_mapped--; +} + +static inline void +del_page_from_active_unmapped_list(struct zone *zone, struct page *page) { list_del(&page->lru); - zone->nr_active--; + zone->nr_active_unmapped--; } static inline void @@ -31,10 +45,14 @@ static inline void del_page_from_lru(struct zone *zone, struct page *page) { list_del(&page->lru); - if (PageActive(page)) { - ClearPageActive(page); - zone->nr_active--; + if (PageActiveMapped(page)) { + ClearPageActiveMapped(page); + zone->nr_active_mapped--; + } else if (PageActiveUnmapped(page)) { + ClearPageActiveUnmapped(page); + zone->nr_active_unmapped--; } else { + ClearPageUsedOnce(page); zone->nr_inactive--; } } diff -puN include/linux/mmzone.h~rollup include/linux/mmzone.h --- linux-2.6/include/linux/mmzone.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/mmzone.h 2004-08-20 18:15:23.000000000 +1000 @@ -130,36 +130,23 @@ struct zone { ZONE_PADDING(_pad1_) - spinlock_t lru_lock; - struct list_head active_list; + spinlock_t lru_lock; + struct list_head active_mapped_list; + struct list_head active_unmapped_list; struct list_head inactive_list; - unsigned 
long nr_scan_active; + unsigned long nr_scan_active_mapped; + unsigned long nr_scan_active_unmapped; unsigned long nr_scan_inactive; - unsigned long nr_active; + unsigned long nr_dirty_inactive; + unsigned long nr_active_mapped; + unsigned long nr_active_unmapped; unsigned long nr_inactive; int all_unreclaimable; /* All pages pinned */ unsigned long pages_scanned; /* since last reclaim */ - ZONE_PADDING(_pad2_) + struct list_head zone_shrinker_list; - /* - * prev_priority holds the scanning priority for this zone. It is - * defined as the scanning priority at which we achieved our reclaim - * target at the previous try_to_free_pages() or balance_pgdat() - * invokation. - * - * We use prev_priority as a measure of how much stress page reclaim is - * under - it drives the swappiness decision: whether to unmap mapped - * pages. - * - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. - * - * Access to both these fields is quite racy even on uniprocessor. But - * it is expected to average out OK. - */ - int temp_priority; - int prev_priority; + ZONE_PADDING(_pad2_) /* * free areas of different sizes diff -puN include/linux/page-flags.h~rollup include/linux/page-flags.h --- linux-2.6/include/linux/page-flags.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/page-flags.h 2004-08-20 18:15:22.000000000 +1000 @@ -58,22 +58,25 @@ #define PG_dirty 4 #define PG_lru 5 -#define PG_active 6 -#define PG_slab 7 /* slab debug (Suparna wants this) */ +#define PG_active_mapped 6 +#define PG_active_unmapped 7 -#define PG_highmem 8 -#define PG_checked 9 /* kill me in 2.5.. */ -#define PG_arch_1 10 -#define PG_reserved 11 - -#define PG_private 12 /* Has something at ->private */ -#define PG_writeback 13 /* Page is under writeback */ -#define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_compound 15 /* Part of a compound page */ - -#define PG_swapcache 16 /* Swap page: swp_entry_t in private */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ +#define PG_slab 8 /* slab debug (Suparna wants this) */ +#define PG_highmem 9 +#define PG_checked 10 /* kill me in 2.5.. 
*/ +#define PG_arch_1 11 + +#define PG_reserved 12 +#define PG_private 13 /* Has something at ->private */ +#define PG_writeback 14 /* Page is under writeback */ +#define PG_nosave 15 /* Used for system suspend/resume */ + +#define PG_compound 16 /* Part of a compound page */ +#define PG_swapcache 17 /* Swap page: swp_entry_t in private */ +#define PG_mappedtodisk 18 /* Has blocks allocated on-disk */ +#define PG_reclaim 19 /* To be reclaimed asap */ + +#define PG_usedonce 20 /* LRU page has been touched once */ /* @@ -97,10 +100,11 @@ struct page_state { unsigned long pgpgout; /* Disk writes */ unsigned long pswpin; /* swap reads */ unsigned long pswpout; /* swap writes */ - unsigned long pgalloc_high; /* page allocations */ + unsigned long pgalloc_high; /* page allocations */ unsigned long pgalloc_normal; unsigned long pgalloc_dma; + unsigned long pgalloc_remote; unsigned long pgfree; /* page freeings */ unsigned long pgactivate; /* pages moved inactive->active */ unsigned long pgdeactivate; /* pages moved active->inactive */ @@ -208,11 +212,17 @@ extern unsigned long __read_page_state(u #define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) #define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) -#define PageActive(page) test_bit(PG_active, &(page)->flags) -#define SetPageActive(page) set_bit(PG_active, &(page)->flags) -#define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) -#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) -#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags) +#define PageActiveMapped(page) test_bit(PG_active_mapped, &(page)->flags) +#define SetPageActiveMapped(page) set_bit(PG_active_mapped, &(page)->flags) +#define ClearPageActiveMapped(page) clear_bit(PG_active_mapped, &(page)->flags) +#define TestClearPageActiveMapped(page) test_and_clear_bit(PG_active_mapped, &(page)->flags) +#define TestSetPageActiveMapped(page) test_and_set_bit(PG_active_mapped, &(page)->flags) + +#define PageActiveUnmapped(page) test_bit(PG_active_unmapped, &(page)->flags) +#define SetPageActiveUnmapped(page) set_bit(PG_active_unmapped, &(page)->flags) +#define ClearPageActiveUnmapped(page) clear_bit(PG_active_unmapped, &(page)->flags) +#define TestClearPageActiveUnmapped(page) test_and_clear_bit(PG_active_unmapped, &(page)->flags) +#define TestSetPageActiveUnmapped(page) test_and_set_bit(PG_active_unmapped, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) @@ -290,6 +300,12 @@ extern unsigned long __read_page_state(u #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) +#define PageUsedOnce(page) test_bit(PG_usedonce, &(page)->flags) +#define SetPageUsedOnce(page) set_bit(PG_usedonce, &(page)->flags) +#define TestSetPageUsedOnce(page) test_and_set_bit(PG_usedonce, &(page)->flags) +#define ClearPageUsedOnce(page) clear_bit(PG_usedonce, &(page)->flags) +#define TestClearPageUsedOnce(page) test_and_clear_bit(PG_usedonce, &(page)->flags) + #ifdef CONFIG_SWAP #define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) #define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) diff -puN include/linux/rmap.h~rollup include/linux/rmap.h --- linux-2.6/include/linux/rmap.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/rmap.h 2004-08-20 18:15:21.000000000 +1000 @@ -88,7 +88,7 @@ static inline 
void page_dup_rmap(struct /* * Called from mm/vmscan.c to handle paging out */ -int page_referenced(struct page *, int is_locked); +void page_gather(struct page *, int is_locked, int *referenced, int *dirty); int try_to_unmap(struct page *); /* @@ -102,7 +102,7 @@ unsigned long page_address_in_vma(struct #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) -#define page_referenced(page,l) TestClearPageReferenced(page) +#define page_gather(page,l,r,d) TestClearPageReferenced(page) #define try_to_unmap(page) SWAP_FAIL #endif /* CONFIG_MMU */ diff -puN include/linux/sched.h~rollup include/linux/sched.h --- linux-2.6/include/linux/sched.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sched.h 2004-08-20 18:15:23.000000000 +1000 @@ -313,6 +313,11 @@ struct signal_struct { unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; }; +/* Context switch must be unlocked if interrupts are to be enabled */ +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW +# define __ARCH_WANT_UNLOCKED_CTXSW +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are @@ -329,7 +334,7 @@ struct signal_struct { #define MAX_USER_RT_PRIO 100 #define MAX_RT_PRIO MAX_USER_RT_PRIO -#define MAX_PRIO (MAX_RT_PRIO + 40) +#define MAX_PRIO (MAX_RT_PRIO + 59) #define rt_task(p) (unlikely((p)->prio < MAX_RT_PRIO)) @@ -447,18 +452,22 @@ struct task_struct { int lock_depth; /* Lock depth */ +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + int oncpu; +#endif int prio, static_prio; struct list_head run_list; prio_array_t *array; - unsigned long sleep_avg; - long interactive_credit; + /* Scheduler variables follow. kernel/sched.c */ + unsigned long array_sequence; unsigned long long timestamp; - int activated; + int used_slice; + + unsigned long total_time, sleep_time; unsigned long policy; cpumask_t cpus_allowed; - unsigned int time_slice, first_time_slice; #ifdef CONFIG_SCHEDSTATS struct sched_info sched_info; @@ -566,8 +575,6 @@ struct task_struct { spinlock_t alloc_lock; /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */ spinlock_t proc_lock; -/* context-switch lock */ - spinlock_t switch_lock; /* journalling filesystem info */ void *journal_info; diff -puN include/linux/swap.h~rollup include/linux/swap.h --- linux-2.6/include/linux/swap.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/swap.h 2004-08-20 18:15:22.000000000 +1000 @@ -164,17 +164,20 @@ extern unsigned int nr_free_pagecache_pa /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); -extern void FASTCALL(lru_cache_add_active(struct page *)); -extern void FASTCALL(activate_page(struct page *)); -extern void FASTCALL(mark_page_accessed(struct page *)); extern void lru_add_drain(void); extern int rotate_reclaimable_page(struct page *page); extern void swap_setup(void); +/* Mark a page as having seen activity. 
*/ +#define mark_page_accessed(page) \ +do { \ + SetPageReferenced(page); \ +} while (0) + /* linux/mm/vmscan.c */ -extern int try_to_free_pages(struct zone **, unsigned int, unsigned int); +extern int try_to_free_pages(struct zone **, unsigned int, unsigned int, int); extern int shrink_all_memory(int); -extern int vm_swappiness; +extern int vm_mapped_page_cost; #ifdef CONFIG_MMU /* linux/mm/shmem.c */ diff -puN include/linux/sysctl.h~rollup include/linux/sysctl.h --- linux-2.6/include/linux/sysctl.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sysctl.h 2004-08-20 18:15:23.000000000 +1000 @@ -134,6 +134,7 @@ enum KERN_SPARC_SCONS_PWROFF=64, /* int: serial console power-off halt */ KERN_HZ_TIMER=65, /* int: hz timer on or off */ KERN_UNKNOWN_NMI_PANIC=66, /* int: unknown nmi panic flag */ + KERN_SCHED_TIMESLICE=67, /* int: base timeslice for scheduler */ }; @@ -167,6 +168,7 @@ enum VM_HUGETLB_GROUP=25, /* permitted hugetlb group */ VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */ VM_LEGACY_VA_LAYOUT=27, /* legacy/compatibility virtual address space layout */ + VM_FREE_LOCAL_HARDER=28, }; diff -puN include/linux/writeback.h~rollup include/linux/writeback.h --- linux-2.6/include/linux/writeback.h~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/include/linux/writeback.h 2004-08-20 18:15:23.000000000 +1000 @@ -8,7 +8,7 @@ struct backing_dev_info; extern spinlock_t inode_lock; extern struct list_head inode_in_use; -extern struct list_head inode_unused; +extern void inode_add_unused(struct inode *inode); /* * Yes, writeback.h requires sched.h diff -puN kernel/sched.c~rollup kernel/sched.c --- linux-2.6/kernel/sched.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/kernel/sched.c 2004-08-20 18:15:23.000000000 +1000 @@ -49,139 +49,74 @@ #include -#ifdef CONFIG_NUMA -#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) -#else -#define cpu_to_node_mask(cpu) (cpu_online_map) -#endif - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ -#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) -#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 30) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 30) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. + * it's a [ 0 ... 58 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) -/* - * Some helpers for converting nanosecond timing to jiffy resolution - */ -#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define US_TO_JIFFIES(x) ((x) * HZ / 1000000) +#define JIFFIES_TO_US(x) ((x) * 1000000 / HZ) /* - * These are the 'tuning knobs' of the scheduler: - * - * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger), - * default timeslice is 100 msecs, maximum timeslice is 800 msecs. - * Timeslices get refilled after they expire. 
- */ -#define MIN_TIMESLICE max(5 * HZ / 1000, 1) -#define DEF_TIMESLICE (100 * HZ / 1000) -#define ON_RUNQUEUE_WEIGHT 30 -#define CHILD_PENALTY 95 -#define PARENT_PENALTY 100 -#define EXIT_WEIGHT 3 -#define PRIO_BONUS_RATIO 25 -#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) -#define INTERACTIVE_DELTA 2 -#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS) -#define STARVATION_LIMIT (MAX_SLEEP_AVG) -#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG)) -#define CREDIT_LIMIT 100 - -/* - * If a task is 'interactive' then we reinsert it in the active - * array after it has expired its current timeslice. (it will not - * continue to run immediately, it will still roundrobin with - * other interactive tasks.) - * - * This part scales the interactivity limit depending on niceness. - * - * We scale it linearly, offset by the INTERACTIVE_DELTA delta. - * Here are a few examples of different nice levels: - * - * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] - * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] - * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] - * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] - * - * (the X axis represents the possible -5 ... 0 ... +5 dynamic - * priority range a task can explore, a value of '1' means the - * task is rated interactive.) - * - * Ie. nice +19 tasks can never get 'interactive' enough to be - * reinserted into the active array. And only heavily CPU-hog nice -20 - * tasks will be expired. Default nice 0 tasks are somewhere between, - * it takes some effort for them to get interactive, but it's not - * too hard. - */ - -#define CURRENT_BONUS(p) \ - (NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \ - MAX_SLEEP_AVG) - -#ifdef CONFIG_SMP -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \ - num_online_cpus()) -#else -#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ - (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -#endif - -#define SCALE(v1,v1_max,v2_max) \ - (v1) * (v2_max) / (v1_max) + * MIN_TIMESLICE is the timeslice that a minimum priority process gets if there + * is a maximum priority process runnable. MAX_TIMESLICE is derived from the + * formula in task_timeslice. It cannot be changed here. It is the timesilce + * that the maximum priority process will get. Larger timeslices are attainable + * by low priority processes however. + */ +int sched_base_timeslice = 64; +int sched_min_base = 1; +int sched_max_base = 10000; -#define DELTA(p) \ - (SCALE(TASK_NICE(p), 40, MAX_BONUS) + INTERACTIVE_DELTA) +#define RT_TIMESLICE (50 * 1000 / HZ) /* 50ms */ +#define BASE_TIMESLICE (sched_base_timeslice) +#define MIN_TIMESLICE 1 -#define TASK_INTERACTIVE(p) \ - ((p)->prio <= (p)->static_prio - DELTA(p)) +/* Maximum amount of history that will be used to calculate priority */ +#define MAX_SLEEP_SHIFT 19 +#define MAX_SLEEP (1UL << MAX_SLEEP_SHIFT) /* roughly 0.52s */ -#define INTERACTIVE_SLEEP(p) \ - (JIFFIES_TO_NS(MAX_SLEEP_AVG * \ - (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1)) +/* + * Maximum effect that 1 block of activity (run/sleep/etc) can have. This is + * will moderate dicard freak events (eg. SIGSTOP) + */ +#define MAX_SLEEP_AFFECT (MAX_SLEEP/4) -#define HIGH_CREDIT(p) \ - ((p)->interactive_credit > CREDIT_LIMIT) +/* + * The amount of history can be decreased (on fork for example). This puts a + * lower bound on it. 
+ */ +#define MIN_HISTORY (MAX_SLEEP/8) -#define LOW_CREDIT(p) \ - ((p)->interactive_credit < -CREDIT_LIMIT) +#define FORKED_TS_MAX (US_TO_JIFFIES(MIN_HISTORY) ?: 1) -#define TASK_PREEMPTS_CURR(p, rq) \ - ((p)->prio < (rq)->curr->prio) +/* + * SLEEP_FACTOR is a fixed point factor used to scale history tracking things. + * In particular: total_time, sleep_time, sleep_avg. + */ +#define SLEEP_FACTOR 1024 /* - * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] - * to time slice values: [800ms ... 100ms ... 5ms] - * - * The higher a thread's priority, the bigger timeslices - * it gets during one round of execution. But even the lowest - * priority thread gets MIN_TIMESLICE worth of execution time. + * The scheduler classifies a process as performing one of the following + * activities */ +#define STIME_SLEEP 1 /* Sleeping */ +#define STIME_RUN 2 /* Using CPU */ -#define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) +#define TASK_PREEMPTS_CURR(p, rq) ( (p)->prio < (rq)->curr->prio ) -static unsigned int task_timeslice(task_t *p) -{ - if (p->static_prio < NICE_TO_PRIO(0)) - return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); - else - return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); -} #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) enum idle_type @@ -203,6 +138,7 @@ struct sched_domain; typedef struct runqueue runqueue_t; struct prio_array { + int min_prio; unsigned int nr_active; unsigned long bitmap[BITMAP_SIZE]; struct list_head queue[MAX_PRIO]; @@ -226,16 +162,17 @@ struct runqueue { #ifdef CONFIG_SMP unsigned long cpu_load; #endif + unsigned long array_sequence; + unsigned long nr_uninterruptible; unsigned long long nr_switches; - unsigned long expired_timestamp, nr_uninterruptible; - unsigned long long timestamp_last_tick; task_t *curr, *idle; struct mm_struct *prev_mm; - prio_array_t *active, *expired, arrays[2]; - int best_expired_prio; atomic_t nr_iowait; + prio_array_t *active, *expired, arrays[2]; #ifdef CONFIG_SMP + unsigned long long timestamp_last_tick; + struct sched_domain *sd; /* For active balancing */ @@ -302,7 +239,6 @@ static DEFINE_PER_CPU(struct runqueue, r #define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */ #define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */ #define SD_WAKE_BALANCE 16 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 32 /* Domain members share cpu power */ struct sched_group { struct sched_group *next; /* Must be a circular list */ @@ -328,7 +264,6 @@ struct sched_domain { unsigned int imbalance_pct; /* No balance until over watermark */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ - unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ int flags; /* See SD_* */ /* Runtime fields. 
*/ @@ -368,12 +303,10 @@ struct sched_domain { .imbalance_pct = 110, \ .cache_hot_time = 0, \ .cache_nice_tries = 0, \ - .per_cpu_gain = 25, \ .flags = SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ - | SD_WAKE_IDLE \ - | SD_SHARE_CPUPOWER, \ + | SD_WAKE_IDLE, \ .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ @@ -389,9 +322,8 @@ struct sched_domain { .max_interval = 4, \ .busy_factor = 64, \ .imbalance_pct = 125, \ - .cache_hot_time = (5*1000000/2), \ + .cache_hot_time = (5*1000/2), \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .flags = SD_BALANCE_NEWIDLE \ | SD_BALANCE_EXEC \ | SD_WAKE_AFFINE \ @@ -411,9 +343,8 @@ struct sched_domain { .max_interval = 32, \ .busy_factor = 32, \ .imbalance_pct = 125, \ - .cache_hot_time = (10*1000000), \ + .cache_hot_time = (10*1000), \ .cache_nice_tries = 1, \ - .per_cpu_gain = 100, \ .flags = SD_BALANCE_EXEC \ | SD_WAKE_BALANCE, \ .last_balance = jiffies, \ @@ -433,14 +364,71 @@ struct sched_domain { #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -/* - * Default context-switch locking: - */ #ifndef prepare_arch_switch -# define prepare_arch_switch(rq, next) do { } while (0) -# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) -# define task_running(rq, p) ((rq)->curr == (p)) +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline int task_running(runqueue_t *rq, task_t *p) +{ + return rq->curr == p; +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ + spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline int task_running(runqueue_t *rq, task_t *p) +{ +#ifdef CONFIG_SMP + return p->oncpu; +#else + return rq->curr == p; +#endif +} + +static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->oncpu = 1; #endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + spin_unlock_irq(&rq->lock); +#else + spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->oncpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->oncpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ /* * task_rq_lock - lock the runqueue a given task resides on and disable @@ -565,20 +553,6 @@ struct file_operations proc_schedstat_op # define schedstat_add(rq, field, amt) do { } while (0); #endif -/* - * rq_lock - lock a given runqueue and disable interrupts. 
- */ -static runqueue_t *this_rq_lock(void) -{ - runqueue_t *rq; - - local_irq_disable(); - rq = this_rq(); - spin_lock(&rq->lock); - - return rq; -} - static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); @@ -703,8 +677,18 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, prio_array_t *array) { + struct list_head *entry = array->queue + p->prio; sched_info_queued(p); - list_add_tail(&p->run_list, array->queue + p->prio); + + if (!rt_task(p)) { + /* + * Cycle tasks on the same priority level. This reduces their + * timeslice fluctuations due to higher priority tasks expiring. + */ + if (!list_empty(entry)) + entry = entry->next; + } + list_add_tail(&p->run_list, entry); __set_bit(p->prio, array->bitmap); array->nr_active++; p->array = array; @@ -723,44 +707,123 @@ static inline void enqueue_task_head(str p->array = array; } +static inline unsigned long long clock_us(void) +{ + return sched_clock() >> 10; +} + /* - * effective_prio - return the priority that is based on the static - * priority but is modified by bonuses/penalties. - * - * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] - * into the -5 ... 0 ... +5 bonus/penalty range. - * - * We use 25% of the full 0...39 priority range so that: - * - * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. - * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * add_task_time updates a task @p after @time of doing the specified @type + * of activity. See STIME_*. This is used for priority calculation. + */ +static inline void add_task_time(task_t *p, unsigned long long time, unsigned long type) +{ + unsigned long ratio; + unsigned long long tmp; + unsigned long t; + + if (type == STIME_SLEEP) { + if (time > MAX_SLEEP_AFFECT*4) + time = MAX_SLEEP_AFFECT*4; + t = ((unsigned long)time + 3) / 4; + } else { + unsigned long div = 60 - USER_PRIO(p->static_prio); + t = (unsigned long)time * 30; + t = t / div; + t = t * 30; + t = t / div; + } + + ratio = MAX_SLEEP - t; + tmp = (unsigned long long)ratio*p->total_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->total_time = (unsigned long)tmp; + + tmp = (unsigned long long)ratio*p->sleep_time + MAX_SLEEP/2; + tmp >>= MAX_SLEEP_SHIFT; + p->sleep_time = (unsigned long)tmp; + + p->total_time += t; + if (type == STIME_SLEEP) + p->sleep_time += t; +} + +static unsigned long task_sleep_avg(task_t *p) +{ + return (SLEEP_FACTOR * p->sleep_time) / (p->total_time + 1); +} + +/* + * The higher a thread's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority thread gets MIN_TIMESLICE worth of execution time. * - * Both properties are important to certain workloads. + * Timeslices are scaled, so if only low priority processes are running, + * they will all get long timeslices. + */ +static int task_timeslice(task_t *p, runqueue_t *rq) +{ + int idx, base, delta; + int timeslice; + + if (rt_task(p)) + return RT_TIMESLICE; + + idx = min(p->prio, rq->expired->min_prio); + delta = p->prio - idx; + base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2); + base = base * (MAX_USER_PRIO + 1) / (delta + 2); + + base = base * 40 / (70 - USER_PRIO(idx)); + base = base * 40 / (70 - USER_PRIO(idx)); + + timeslice = base >> 10; + timeslice = timeslice * HZ / 1000; + if (timeslice < MIN_TIMESLICE) + timeslice = MIN_TIMESLICE; + + return timeslice; +} + +/* + * task_priority: calculates a task's priority based on previous running + * history (see add_task_time). 
The priority is just a simple linear function + * based on sleep_avg and static_prio. */ -static int effective_prio(task_t *p) +static int task_priority(task_t *p) { + unsigned long sleep_avg; int bonus, prio; if (rt_task(p)) return p->prio; - bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + sleep_avg = task_sleep_avg(p); + + prio = USER_PRIO(p->static_prio) + 10; + bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2)) + / SLEEP_FACTOR; + prio = MAX_RT_PRIO + prio - bonus; - prio = p->static_prio - bonus; if (prio < MAX_RT_PRIO) - prio = MAX_RT_PRIO; + return MAX_RT_PRIO; if (prio > MAX_PRIO-1) - prio = MAX_PRIO-1; + return MAX_PRIO-1; + return prio; } /* * __activate_task - move a task to the runqueue. */ -static inline void __activate_task(task_t *p, runqueue_t *rq) +static inline void __activate_task(task_t *p, runqueue_t *rq, prio_array_t *array) { - enqueue_task(p, rq->active); + enqueue_task(p, array); rq->nr_running++; + if (!rt_task(p)) { + if (p->prio < array->min_prio) + array->min_prio = p->prio; + } } /* @@ -772,80 +835,6 @@ static inline void __activate_idle_task( rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) -{ - unsigned long long __sleep_time = now - p->timestamp; - unsigned long sleep_time; - - if (__sleep_time > NS_MAX_SLEEP_AVG) - sleep_time = NS_MAX_SLEEP_AVG; - else - sleep_time = (unsigned long)__sleep_time; - - if (likely(sleep_time > 0)) { - /* - * User tasks that sleep a long time are categorised as - * idle and will get just interactive status to stay active & - * prevent them suddenly becoming cpu hogs and starving - * other processes. - */ - if (p->mm && p->activated != -1 && - sleep_time > INTERACTIVE_SLEEP(p)) { - p->sleep_avg = JIFFIES_TO_NS(MAX_SLEEP_AVG - - DEF_TIMESLICE); - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } else { - /* - * The lower the sleep avg a task has the more - * rapidly it will rise with sleep time. - */ - sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; - - /* - * Tasks with low interactive_credit are limited to - * one timeslice worth of sleep avg bonus. - */ - if (LOW_CREDIT(p) && - sleep_time > JIFFIES_TO_NS(task_timeslice(p))) - sleep_time = JIFFIES_TO_NS(task_timeslice(p)); - - /* - * Non high_credit tasks waking from uninterruptible - * sleep are limited in their sleep_avg rise as they - * are likely to be cpu hogs waiting on I/O - */ - if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm) { - if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) - sleep_time = 0; - else if (p->sleep_avg + sleep_time >= - INTERACTIVE_SLEEP(p)) { - p->sleep_avg = INTERACTIVE_SLEEP(p); - sleep_time = 0; - } - } - - /* - * This code gives a bonus to interactive tasks. - * - * The boost works by updating the 'average sleep time' - * value here, based on ->timestamp. The more time a - * task spends sleeping, the higher the average gets - - * and the higher the priority boost gets as well. 
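
task_priority() then maps that sleep average linearly onto the timesharing range, with at most (MAX_USER_PRIO + 1)/3, i.e. about 13, levels of bonus. A quick table, assuming the usual MAX_RT_PRIO=100/MAX_PRIO=140 layout and the same illustrative SLEEP_FACTOR as the sketch above; with those numbers a nice-0 CPU hog ends up around prio 130 while a nice-0 heavy sleeper gets about 117.

#include <stdio.h>

#define MAX_RT_PRIO    100
#define MAX_PRIO       140
#define MAX_USER_PRIO  (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p)   ((p) - MAX_RT_PRIO)
#define SLEEP_FACTOR   1024

/* RT tasks keep their priority in the patch; only the timesharing path
 * is reproduced here. */
static int task_priority(int static_prio, unsigned long sleep_avg)
{
    int bonus, prio;

    prio = USER_PRIO(static_prio) + 10;
    bonus = (((MAX_USER_PRIO + 1) / 3) * sleep_avg + (SLEEP_FACTOR / 2))
            / SLEEP_FACTOR;
    prio = MAX_RT_PRIO + prio - bonus;

    if (prio < MAX_RT_PRIO)
        return MAX_RT_PRIO;
    if (prio > MAX_PRIO - 1)
        return MAX_PRIO - 1;
    return prio;
}

int main(void)
{
    int nice[] = { -10, 0, 10 };
    unsigned long avg[] = { 0, SLEEP_FACTOR / 2, SLEEP_FACTOR };

    for (int i = 0; i < 3; i++)
        for (int j = 0; j < 3; j++)
            printf("nice %3d, sleep_avg %4lu -> prio %d\n",
                   nice[i], avg[j], task_priority(120 + nice[i], avg[j]));
    return 0;
}
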
- */ - p->sleep_avg += sleep_time; - - if (p->sleep_avg > NS_MAX_SLEEP_AVG) { - p->sleep_avg = NS_MAX_SLEEP_AVG; - if (!HIGH_CREDIT(p)) - p->interactive_credit++; - } - } - } - - p->prio = effective_prio(p); -} - /* * activate_task - move a task to the runqueue and do priority recalculation * @@ -854,9 +843,10 @@ static void recalc_task_prio(task_t *p, */ static void activate_task(task_t *p, runqueue_t *rq, int local) { - unsigned long long now; + unsigned long long now, sleep; + prio_array_t *array; - now = sched_clock(); + now = clock_us(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ @@ -865,44 +855,34 @@ static void activate_task(task_t *p, run + rq->timestamp_last_tick; } #endif - - recalc_task_prio(p, now); - /* - * This checks to make sure it's not an uninterruptible task - * that is now waking up. + * If we have slept through an active/expired array switch, restart + * our timeslice too. */ - if (!p->activated) { - /* - * Tasks which were woken up by interrupts (ie. hw events) - * are most likely of interactive nature. So we give them - * the credit of extending their sleep time to the period - * of time they spend on the runqueue, waiting for execution - * on a CPU, first time around: - */ - if (in_interrupt()) - p->activated = 2; - else { - /* - * Normal first-time wakeups get a credit too for - * on-runqueue time, but it will be weighted down: - */ - p->activated = 1; - } - } + + sleep = now - p->timestamp; p->timestamp = now; + add_task_time(p, sleep, STIME_SLEEP); + p->prio = task_priority(p); - __activate_task(p, rq); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + /* This only applys to newly woken children */ + array = rq->expired; + p->used_slice = 0; + } else if (rq->array_sequence != p->array_sequence) + p->used_slice = 0; + + __activate_task(p, rq, array); } /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, runqueue_t *rq) +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) { + p->array_sequence = rq->array_sequence; rq->nr_running--; - if (p->state == TASK_UNINTERRUPTIBLE) - rq->nr_uninterruptible++; dequeue_task(p, p->array); p->array = NULL; } @@ -1226,28 +1206,14 @@ out_set_cpu: out_activate: #endif /* CONFIG_SMP */ - if (old_state == TASK_UNINTERRUPTIBLE) { + if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - /* - * Tasks on involuntary sleep don't earn - * sleep_avg beyond just interactive state. - */ - p->activated = -1; - } - - /* - * Sync wakeups (i.e. those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption, if the woken up task will run on - * this cpu. (in this case the 'I will reschedule' promise of - * the waker guarantees that the freshly woken up task is going - * to be considered on this CPU.) - */ activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); } + success = 1; out_running: @@ -1261,7 +1227,7 @@ out: int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | - TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); + TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); @@ -1282,6 +1248,9 @@ static int find_idlest_cpu(struct task_s */ void fastcall sched_fork(task_t *p) { + unsigned long sleep_avg; + runqueue_t *rq; + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. 
This guarantees that @@ -1291,46 +1260,52 @@ void fastcall sched_fork(task_t *p) p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); p->array = NULL; - spin_lock_init(&p->switch_lock); #ifdef CONFIG_SCHEDSTATS memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + p->oncpu = 0; +#endif #ifdef CONFIG_PREEMPT - /* - * During context-switch we hold precisely one spinlock, which - * schedule_tail drops. (in the common case it's this_rq()->lock, - * but it also can be p->switch_lock.) So we compensate with a count - * of 1. Also, we want to start with kernel preemption disabled. - */ + /* Want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif - /* - * Share the timeslice between parent and child, thus the - * total amount of pending timeslices in the system doesn't change, - * resulting in more scheduling fairness. - */ + + preempt_disable(); + rq = this_rq(); + + /* XXX */ + if (unlikely(p->comm[0] == 'X' && p->comm[1] == 'F')) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "Renicing %s for you\n", p->comm); + warned = 1; + } + p->static_prio = NICE_TO_PRIO(-10); + } + + /* Get MIN_HISTORY of history with the same sleep_avg as parent. */ + sleep_avg = task_sleep_avg(current); + p->total_time = MIN_HISTORY; + p->sleep_time = p->total_time * sleep_avg / SLEEP_FACTOR; + + /* Parent loses 1/4 of sleep time for forking */ + current->sleep_time = 3*current->sleep_time/4; + + p->used_slice = 0; local_irq_disable(); - p->time_slice = (current->time_slice + 1) >> 1; - /* - * The remainder of the first timeslice might be recovered by - * the parent if the child exits early enough. - */ - p->first_time_slice = 1; - current->time_slice >>= 1; - p->timestamp = sched_clock(); - if (unlikely(!current->time_slice)) { - /* - * This case is rare, it happens when the parent has only - * a single jiffy left from its timeslice. Taking the - * runqueue lock is not a problem. - */ - current->time_slice = 1; - preempt_disable(); - scheduler_tick(0, 0); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + if (unlikely(current->used_slice == -1 || current == rq->idle)) + p->used_slice = -1; + else { + int ts = task_timeslice(current, rq); + current->used_slice += (ts + 3) / 4; + if (current->used_slice >= ts) { + current->used_slice = -1; + set_need_resched(); + } + } + local_irq_enable(); + preempt_enable(); } /* @@ -1344,57 +1319,55 @@ void fastcall wake_up_new_task(task_t * { unsigned long flags; int this_cpu, cpu; - runqueue_t *rq, *this_rq; + runqueue_t *rq; + prio_array_t *array; + + BUG_ON(p->state != TASK_RUNNING); + + p->prio = task_priority(p); + p->timestamp = clock_us(); rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); this_cpu = smp_processor_id(); - - BUG_ON(p->state != TASK_RUNNING); + cpu = task_cpu(p); schedstat_inc(rq, wunt_cnt); - /* - * We decrease the sleep average of forking parents - * and children as well, to keep max-interactive tasks - * from forking tasks that are max-interactive. The parent - * (current) is done further down, under its lock. 
- */ - p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) * - CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - p->interactive_credit = 0; - - p->prio = effective_prio(p); + array = rq->active; + if (unlikely(p->used_slice == -1)) { + p->used_slice = 0; + array = rq->expired; + } else { + int total = task_timeslice(p, rq); + int ts = max((total + 3) / 4, MIN_TIMESLICE); + ts = min(ts, (int)FORKED_TS_MAX); + p->used_slice = total - ts; + } if (likely(cpu == this_cpu)) { - if (!(clone_flags & CLONE_VM)) { + if (!(clone_flags & CLONE_VM) && likely(array == rq->active)) { /* * The VM isn't cloned, so we're in a good position to * do child-runs-first in anticipation of an exec. This * usually avoids a lot of COW overhead. */ - if (unlikely(!current->array)) - __activate_task(p, rq); - else { + if (p->prio >= current->prio) { p->prio = current->prio; list_add_tail(&p->run_list, ¤t->run_list); p->array = current->array; p->array->nr_active++; rq->nr_running++; - } + } else + __activate_task(p, rq, array); + set_need_resched(); - } else + } else { /* Run child last */ - __activate_task(p, rq); - /* - * We skip the following code due to cpu == this_cpu - * - * task_rq_unlock(rq, &flags); - * this_rq = task_rq_lock(current, &flags); - */ - this_rq = rq; + __activate_task(p, rq, array); + } +#ifdef CONFIG_SMP } else { - this_rq = cpu_rq(this_cpu); + runqueue_t *this_rq = this_rq(); /* * Not the local CPU - must adjust timestamp. This should @@ -1402,52 +1375,18 @@ void fastcall wake_up_new_task(task_t * */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; - __activate_task(p, rq); + __activate_task(p, rq, array); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); schedstat_inc(rq, wunt_moved); - /* - * Parent and child are on different CPUs, now get the - * parent runqueue to update the parent's ->sleep_avg: - */ - task_rq_unlock(rq, &flags); - this_rq = task_rq_lock(current, &flags); +#endif } - current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) * - PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS); - task_rq_unlock(this_rq, &flags); + task_rq_unlock(rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ void fastcall sched_exit(task_t * p) { - unsigned long flags; - runqueue_t *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. 
- */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); } /** @@ -1483,7 +1422,8 @@ static void finish_task_switch(task_t *p * Manfred Spraul */ prev_task_flags = prev->flags; - finish_arch_switch(rq, prev); + finish_arch_switch(prev); + finish_lock_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) { @@ -1500,7 +1440,10 @@ static void finish_task_switch(task_t *p asmlinkage void schedule_tail(task_t *prev) { finish_task_switch(prev); - +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + /* In this case, finish_task_switch does not reenable preemption */ + preempt_enable(); +#endif if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } @@ -1759,6 +1702,10 @@ void pull_task(runqueue_t *src_rq, prio_ set_task_cpu(p, this_cpu); this_rq->nr_running++; enqueue_task(p, this_array); + if (!rt_task(p)) { + if (p->prio < this_array->min_prio) + this_array->min_prio = p->prio; + } p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* @@ -2062,7 +2009,6 @@ static int load_balance(int this_cpu, ru unsigned long imbalance; int nr_moved; - spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); group = find_busiest_group(sd, this_cpu, &imbalance, idle); @@ -2097,12 +2043,11 @@ static int load_balance(int this_cpu, ru * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ - double_lock_balance(this_rq, busiest); + double_rq_lock(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); - spin_unlock(&busiest->lock); + double_rq_unlock(this_rq, busiest); } - spin_unlock(&this_rq->lock); if (!nr_moved) { schedstat_inc(sd, lb_failed[idle]); @@ -2136,8 +2081,6 @@ static int load_balance(int this_cpu, ru return nr_moved; out_balanced: - spin_unlock(&this_rq->lock); - /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2342,42 +2285,11 @@ static inline void idle_balance(int cpu, } #endif -static inline int wake_priority_sleeper(runqueue_t *rq) -{ -#ifdef CONFIG_SCHED_SMT - /* - * If an SMT sibling task has been put to sleep for priority - * reasons reschedule the idle task to see if it can now run. - */ - if (rq->nr_running) { - resched_task(rq->idle); - return 1; - } -#endif - return 0; -} - DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * We place interactive tasks back into the active array, if possible. - * - * To guarantee that this does not starve expired tasks we ignore the - * interactivity of a task if the first expired task had to wait more - * than a 'reasonable' amount of time. This deadline timeout is - * load-dependent, as the frequency of array switched decreases with - * increasing number of running tasks. We also ignore the interactivity - * if a better static_prio task has expired: - */ -#define EXPIRED_STARVING(rq) \ - ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ - (jiffies - (rq)->expired_timestamp >= \ - STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ - ((rq)->curr->static_prio > (rq)->best_expired_prio)) - -/* * This function gets called by the timer code, with HZ frequency. 
* We call it with interrupts disabled. * @@ -2386,12 +2298,16 @@ EXPORT_PER_CPU_SYMBOL(kstat); */ void scheduler_tick(int user_ticks, int sys_ticks) { + enum idle_type cpu_status; int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; + int ts; - rq->timestamp_last_tick = sched_clock(); +#ifdef CONFIG_SMP + rq->timestamp_last_tick = clock_us(); +#endif if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); @@ -2410,11 +2326,11 @@ void scheduler_tick(int user_ticks, int cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, IDLE); - return; + cpu_status = IDLE; + goto out; } + cpu_status = NOT_IDLE; + if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; else @@ -2422,168 +2338,24 @@ void scheduler_tick(int user_ticks, int cpustat->system += sys_ticks; /* Task might have expired already, but not scheduled off yet */ - if (p->array != rq->active) { - set_tsk_need_resched(p); + if (unlikely(p->used_slice == -1)) goto out; - } - spin_lock(&rq->lock); - /* - * The task was running during this tick - update the - * time slice counter. Note: we do not update a thread's - * priority until it either goes to sleep or uses up its - * timeslice. This makes it possible for interactive tasks - * to use up their timeslices at their highest priority levels. - */ - if (rt_task(p)) { - /* - * RR tasks need a special form of timeslice management. - * FIFO tasks have no timeslices. - */ - if ((p->policy == SCHED_RR) && !--p->time_slice) { - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - set_tsk_need_resched(p); - - /* put it at the end of the queue: */ - dequeue_task(p, rq->active); - enqueue_task(p, rq->active); - } - goto out_unlock; - } - if (!--p->time_slice) { - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - p->time_slice = task_timeslice(p); - p->first_time_slice = 0; - - if (!rq->expired_timestamp) - rq->expired_timestamp = jiffies; - if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { - enqueue_task(p, rq->expired); - if (p->static_prio < rq->best_expired_prio) - rq->best_expired_prio = p->static_prio; - } else - enqueue_task(p, rq->active); - } else { - /* - * Prevent a too long timeslice allowing a task to monopolize - * the CPU. We do this by splitting up the timeslice into - * smaller pieces. - * - * Note: this does not mean the task's timeslices expire or - * get lost in any way, they just might be preempted by - * another task of equal priority. (one with higher - * priority would have preempted this task already.) We - * requeue this task to the end of the list on this priority - * level, which is in essence a round-robin of tasks with - * equal priority. - * - * This only applies to tasks in the interactive - * delta range with at least TIMESLICE_GRANULARITY to requeue. 
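
scheduler_tick() below now only charges ticks against task_timeslice(), and task_timeslice() (earlier in this diff) scales the slice by how far the task's priority sits from the best priority already waiting in the expired array. The sketch tabulates that scaling for a nice-0 task; HZ, MIN_TIMESLICE and the base value are stand-ins (the patch adds a base_timeslice sysctl, whose default is not visible in these hunks), only the formula is taken from the patch. The effect is that a task gets its full slice only while nothing better is queued behind it.

#include <stdio.h>

#define HZ              1000
#define MAX_RT_PRIO     100
#define MAX_PRIO        140
#define MAX_USER_PRIO   (MAX_PRIO - MAX_RT_PRIO)
#define USER_PRIO(p)    ((p) - MAX_RT_PRIO)
#define BASE_TIMESLICE  400             /* assumed base value               */
#define MIN_TIMESLICE   1               /* assumed, in jiffies              */

static int task_timeslice(int prio, int expired_min_prio)
{
    int idx, delta, base, timeslice;

    idx = prio < expired_min_prio ? prio : expired_min_prio;
    delta = prio - idx;
    base = BASE_TIMESLICE * (MAX_USER_PRIO + 1) / (delta + 2);
    base = base * (MAX_USER_PRIO + 1) / (delta + 2);

    base = base * 40 / (70 - USER_PRIO(idx));
    base = base * 40 / (70 - USER_PRIO(idx));

    timeslice = base >> 10;             /* -> milliseconds                  */
    timeslice = timeslice * HZ / 1000;  /* -> jiffies                       */
    return timeslice < MIN_TIMESLICE ? MIN_TIMESLICE : timeslice;
}

int main(void)
{
    /* A nice-0 task (prio 120) against better and better expired tasks.    */
    for (int min_prio = 120; min_prio >= 110; min_prio -= 2)
        printf("expired min_prio %d -> timeslice %d jiffies\n",
               min_prio, task_timeslice(120, min_prio));
    return 0;
}
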
- */ - if (TASK_INTERACTIVE(p) && !((task_timeslice(p) - - p->time_slice) % TIMESLICE_GRANULARITY(p)) && - (p->time_slice >= TIMESLICE_GRANULARITY(p)) && - (p->array == rq->active)) { - - dequeue_task(p, rq->active); - set_tsk_need_resched(p); - p->prio = effective_prio(p); - enqueue_task(p, rq->active); - } - } -out_unlock: - spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); -} - -#ifdef CONFIG_SCHED_SMT -static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) -{ - int i; - struct sched_domain *sd = rq->sd; - cpumask_t sibling_map; - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return; - - cpus_and(sibling_map, sd->span, cpu_online_map); - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq; - - if (i == cpu) - continue; - - smt_rq = cpu_rq(i); + if (unlikely(p->policy == SCHED_FIFO)) + goto out; - /* - * If an SMT sibling task is sleeping due to priority - * reasons wake it up now. - */ - if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) - resched_task(smt_rq->idle); + /* p was running during this tick. Update its time slice counter. */ + p->used_slice++; + ts = task_timeslice(p, rq); + if (unlikely(p->used_slice >= ts)) { + p->used_slice = -1; + set_tsk_need_resched(p); } -} - -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) -{ - struct sched_domain *sd = rq->sd; - cpumask_t sibling_map; - int ret = 0, i; - - if (!(sd->flags & SD_SHARE_CPUPOWER)) - return 0; - - cpus_and(sibling_map, sd->span, cpu_online_map); - for_each_cpu_mask(i, sibling_map) { - runqueue_t *smt_rq; - task_t *smt_curr; - - if (i == cpu) - continue; - smt_rq = cpu_rq(i); - smt_curr = smt_rq->curr; - - /* - * If a user task with lower static priority than the - * running task on the SMT sibling is trying to schedule, - * delay it till there is proportionately less timeslice - * left of the sibling task to prevent a lower priority - * task from using an unfair proportion of the - * physical cpu's resources. -ck - */ - if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(p) || rt_task(smt_curr)) && - p->mm && smt_curr->mm && !rt_task(p)) - ret = 1; - - /* - * Reschedule a lower priority task on the SMT sibling, - * or wake it up if it has been put to sleep for priority - * reasons. - */ - if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > - task_timeslice(smt_curr) || rt_task(p)) && - smt_curr->mm && p->mm && !rt_task(smt_curr)) || - (smt_curr == smt_rq->idle && smt_rq->nr_running)) - resched_task(smt_curr); - } - return ret; -} -#else -static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) -{ +out: + rebalance_tick(cpu, rq, NOT_IDLE); } -static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) -{ - return 0; -} -#endif - /* * schedule() is the main scheduler function. */ @@ -2603,11 +2375,10 @@ asmlinkage void __sched schedule(void) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. 
*/ - if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { - if (unlikely(in_atomic())) { - printk(KERN_ERR "bad: scheduling while atomic!\n"); - dump_stack(); - } + if (unlikely(in_atomic()) && + likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { + printk(KERN_ERR "bad: scheduling while atomic!\n"); + dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -2627,19 +2398,10 @@ need_resched: release_kernel_lock(prev); schedstat_inc(rq, sched_cnt); - now = sched_clock(); - if (likely(now - prev->timestamp < NS_MAX_SLEEP_AVG)) - run_time = now - prev->timestamp; - else - run_time = NS_MAX_SLEEP_AVG; - - /* - * Tasks with interactive credits get charged less run_time - * at high sleep_avg to delay them losing their interactive - * status - */ - if (HIGH_CREDIT(prev)) - run_time /= (CURRENT_BONUS(prev) ? : 1); + now = clock_us(); + run_time = now - prev->timestamp; + prev->timestamp = now; + add_task_time(prev, run_time, STIME_RUN); spin_lock_irq(&rq->lock); @@ -2653,17 +2415,39 @@ need_resched: if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; - else + else { deactivate_task(prev, rq); + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + goto no_check_expired; + } } + if (unlikely(prev->used_slice == -1)) { + if (rt_task(prev)) { + if (prev->policy == SCHED_RR) { + dequeue_task(prev, prev->array); + enqueue_task(prev, rq->active); + } + } else { + dequeue_task(prev, prev->array); + prev->prio = task_priority(prev); + enqueue_task(prev, rq->expired); + if (prev->prio < rq->expired->min_prio) + rq->expired->min_prio = prev->prio; + } + prev->used_slice = 0; + } +no_check_expired: + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { + rq->array_sequence++; idle_balance(cpu, rq); if (!rq->nr_running) { + rq->arrays[0].min_prio = MAX_PRIO; + rq->arrays[1].min_prio = MAX_PRIO; next = rq->idle; - rq->expired_timestamp = 0; - wake_sleeping_dependent(cpu, rq); goto switch_tasks; } } @@ -2674,11 +2458,11 @@ need_resched: * Switch the active and expired arrays. 
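
The array_sequence counter bumped in this path is what lets activate_task() decide whether a waking task may keep the unused part of its slice: deactivate_task() stamps the sleeper, and any active/expired switch (or an emptied runqueue) advances the runqueue's counter in between. Reduced to plain integers, purely as an illustration of that handshake:

#include <stdio.h>

struct rq   { unsigned long array_sequence; };
struct task { unsigned long array_sequence; int used_slice; };

static void sleep_task(struct task *p, struct rq *rq)
{
    p->array_sequence = rq->array_sequence;     /* deactivate_task()        */
}

static void wake_task(struct task *p, struct rq *rq)
{
    if (rq->array_sequence != p->array_sequence)
        p->used_slice = 0;                      /* missed an array switch   */
}

int main(void)
{
    struct rq rq = { 0 };
    struct task p = { 0, 37 };                  /* 37 ticks already used    */

    sleep_task(&p, &rq);
    wake_task(&p, &rq);
    printf("no switch while asleep: used_slice = %d\n", p.used_slice);

    sleep_task(&p, &rq);
    rq.array_sequence++;                        /* schedule() swapped arrays */
    wake_task(&p, &rq);
    printf("switch happened:        used_slice = %d\n", p.used_slice);
    return 0;
}
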
*/ schedstat_inc(rq, sched_switch); + rq->array_sequence++; rq->active = rq->expired; rq->expired = array; + rq->expired->min_prio = MAX_PRIO; array = rq->active; - rq->expired_timestamp = 0; - rq->best_expired_prio = MAX_PRIO; } else schedstat_inc(rq, sched_noswitch); @@ -2686,37 +2470,11 @@ need_resched: queue = array->queue + idx; next = list_entry(queue->next, task_t, run_list); - if (dependent_sleeper(cpu, rq, next)) { - schedstat_inc(rq, sched_goidle); - next = rq->idle; - goto switch_tasks; - } - - if (!rt_task(next) && next->activated > 0) { - unsigned long long delta = now - next->timestamp; - - if (next->activated == 1) - delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; - - array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); - } - next->activated = 0; switch_tasks: prefetch(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - prev->sleep_avg -= run_time; - if ((long)prev->sleep_avg <= 0) { - prev->sleep_avg = 0; - if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) - prev->interactive_credit--; - } - prev->timestamp = now; - sched_info_switch(prev, next); if (likely(prev != next)) { next->timestamp = now; @@ -2724,10 +2482,10 @@ switch_tasks: rq->curr = next; ++*switch_count; - prepare_arch_switch(rq, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); prev = context_switch(rq, prev, next); barrier(); - finish_task_switch(prev); } else spin_unlock_irq(&rq->lock); @@ -3204,12 +2962,12 @@ static int setscheduler(pid_t pid, int p array = p->array; if (array) - deactivate_task(p, task_rq(p)); + deactivate_task(p, rq); retval = 0; oldprio = p->prio; __setscheduler(p, policy, lp.sched_priority); if (array) { - __activate_task(p, task_rq(p)); + __activate_task(p, rq, array); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3432,37 +3190,31 @@ out_unlock: */ asmlinkage long sys_sched_yield(void) { - runqueue_t *rq = this_rq_lock(); - prio_array_t *array = current->array; - prio_array_t *target = rq->expired; +#ifdef CONFIG_SCHEDSTATS + runqueue_t *rq; +#endif - schedstat_inc(rq, yld_cnt); - /* - * We implement yielding by moving the task into the expired - * queue. - * - * (special rule: RT tasks will just roundrobin in the active - * array.) - */ - if (rt_task(current)) - target = rq->active; + local_irq_disable(); +#ifdef CONFIG_SCHEDSTATS + rq = this_rq(); + schedstat_inc(rq, yld_cnt); + spin_lock(&rq->lock); if (current->array->nr_active == 1) { schedstat_inc(rq, yld_act_empty); if (!rq->expired->nr_active) schedstat_inc(rq, yld_both_empty); } else if (!rq->expired->nr_active) schedstat_inc(rq, yld_exp_empty); - - dequeue_task(current, array); - enqueue_task(current, target); - /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); +#endif + current->used_slice = -1; + local_irq_enable(); schedule(); @@ -3579,6 +3331,8 @@ long sys_sched_rr_get_interval(pid_t pid int retval = -EINVAL; struct timespec t; task_t *p; + unsigned long flags; + runqueue_t *rq; if (pid < 0) goto out_nounlock; @@ -3593,8 +3347,9 @@ long sys_sched_rr_get_interval(pid_t pid if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? - 0 : task_timeslice(p), &t); + rq = task_rq_lock(p, &flags); + jiffies_to_timespec(p->policy & SCHED_FIFO ? 
0 : task_timeslice(p, rq), &t); + task_rq_unlock(rq, &flags); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: @@ -3707,15 +3462,17 @@ void __devinit init_idle(task_t *idle, i runqueue_t *rq = cpu_rq(cpu); unsigned long flags; - idle->sleep_avg = 0; - idle->interactive_credit = 0; idle->array = NULL; idle->prio = MAX_PRIO; idle->state = TASK_RUNNING; + idle->used_slice = 0; set_task_cpu(idle, cpu); spin_lock_irqsave(&rq->lock, flags); rq->curr = rq->idle = idle; +#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) + idle->oncpu = 1; +#endif set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); @@ -4521,7 +4278,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); rq->active = rq->arrays; rq->expired = rq->arrays + 1; - rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; @@ -4535,11 +4291,12 @@ void __init sched_init(void) for (j = 0; j < 2; j++) { array = rq->arrays + j; + array->min_prio = MAX_PRIO; for (k = 0; k < MAX_PRIO; k++) { INIT_LIST_HEAD(array->queue + k); __clear_bit(k, array->bitmap); } - // delimiter for bitsearch + /* delimiter for bitsearch */ __set_bit(MAX_PRIO, array->bitmap); } } diff -puN kernel/sysctl.c~rollup kernel/sysctl.c --- linux-2.6/kernel/sysctl.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/kernel/sysctl.c 2004-08-20 18:15:23.000000000 +1000 @@ -65,6 +65,9 @@ extern int sysctl_lower_zone_protection; extern int min_free_kbytes; extern int printk_ratelimit_jiffies; extern int printk_ratelimit_burst; +extern int sched_base_timeslice; +extern int sched_min_base; +extern int sched_max_base; #if defined(CONFIG_X86_LOCAL_APIC) && defined(__i386__) int unknown_nmi_panic; @@ -641,12 +644,25 @@ static ctl_table kern_table[] = { .proc_handler = &proc_unknown_nmi_panic, }, #endif + { + .ctl_name = KERN_SCHED_TIMESLICE, + .procname = "base_timeslice", + .data = &sched_base_timeslice, + .maxlen = sizeof (sched_base_timeslice), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &sched_min_base, + .extra2 = &sched_max_base, + }, + { .ctl_name = 0 } }; /* Constants for minimum and maximum testing in vm_table. We use these as one-element integer vectors. 
*/ static int zero; +static int one = 1; static int one_hundred = 100; @@ -723,15 +739,27 @@ static ctl_table vm_table[] = { }, { .ctl_name = VM_SWAPPINESS, - .procname = "swappiness", - .data = &vm_swappiness, - .maxlen = sizeof(vm_swappiness), + .procname = "mapped_page_cost", + .data = &vm_mapped_page_cost, + .maxlen = sizeof(vm_mapped_page_cost), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .strategy = &sysctl_intvec, - .extra1 = &zero, + .extra1 = &one, .extra2 = &one_hundred, }, + { + .ctl_name = VM_FREE_LOCAL_HARDER, + .procname = "free_local_harder", + .data = &vm_free_local_harder, + .maxlen = sizeof(vm_free_local_harder), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, + .extra2 = &one, + }, + #ifdef CONFIG_HUGETLB_PAGE { .ctl_name = VM_HUGETLB_PAGES, @@ -802,9 +830,9 @@ static ctl_table vm_table[] = { }, { .ctl_name = VM_VFS_CACHE_PRESSURE, - .procname = "vfs_cache_pressure", - .data = &sysctl_vfs_cache_pressure, - .maxlen = sizeof(sysctl_vfs_cache_pressure), + .procname = "vfs_cache_cost", + .data = &sysctl_vfs_cache_cost, + .maxlen = sizeof(sysctl_vfs_cache_cost), .mode = 0644, .proc_handler = &proc_dointvec, .strategy = &sysctl_intvec, @@ -932,6 +960,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { .ctl_name = 0 } }; diff -puN mm/filemap.c~rollup mm/filemap.c --- linux-2.6/mm/filemap.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/mm/filemap.c 2004-08-20 18:15:21.000000000 +1000 @@ -764,11 +764,7 @@ page_ok: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - /* - * Mark the page accessed if we read the beginning. - */ - if (!offset) - mark_page_accessed(page); + mark_page_accessed(page); /* * Ok, we have the page, and it's up-to-date, so diff -puN mm/hugetlb.c~rollup mm/hugetlb.c --- linux-2.6/mm/hugetlb.c~rollup 2004-08-20 18:15:20.000000000 +1000 +++ linux-2.6-npiggin/mm/hugetlb.c 2004-08-20 18:15:22.000000000 +1000 @@ -130,9 +130,12 @@ static void update_and_free_page(struct nr_huge_pages--; nr_huge_pages_node[page_zone(page)->zone_pgdat->node_id]--; for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~( + 1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active_mapped | 1 << PG_active_unmapped | + 1 << PG_reserved | 1 << PG_private | + 1 << PG_writeback); set_page_count(&page[i], 0); } set_page_count(page, 1); diff -puN mm/memory.c~rollup mm/memory.c --- linux-2.6/mm/memory.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/memory.c 2004-08-20 18:15:22.000000000 +1000 @@ -1121,7 +1121,8 @@ static int do_wp_page(struct mm_struct * else page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - lru_cache_add_active(new_page); + lru_cache_add(new_page); + mark_page_accessed(new_page); page_add_anon_rmap(new_page, vma, address); /* Free the old page.. 
*/ @@ -1468,7 +1469,7 @@ do_anonymous_page(struct mm_struct *mm, entry = maybe_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)), vma); - lru_cache_add_active(page); + lru_cache_add(page); mark_page_accessed(page); page_add_anon_rmap(page, vma, addr); } @@ -1580,7 +1581,7 @@ retry: entry = maybe_mkwrite(pte_mkdirty(entry), vma); set_pte(page_table, entry); if (anon) { - lru_cache_add_active(new_page); + lru_cache_add(new_page); page_add_anon_rmap(new_page, vma, address); } else page_add_file_rmap(new_page); diff -puN mm/oom_kill.c~rollup mm/oom_kill.c --- linux-2.6/mm/oom_kill.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/oom_kill.c 2004-08-20 18:15:23.000000000 +1000 @@ -144,11 +144,10 @@ static void __oom_kill_task(task_t *p) printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm); /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... + * We give our sacrificial lamb access to all the memory it needs. + * That way it should be able to exit() and clear out its resources + * quickly... */ - p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ diff -puN mm/page-writeback.c~rollup mm/page-writeback.c --- linux-2.6/mm/page-writeback.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/page-writeback.c 2004-08-20 18:15:23.000000000 +1000 @@ -377,8 +377,7 @@ static void wb_kupdate(unsigned long arg oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; start_jif = jiffies; next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; - nr_to_write = wbs.nr_dirty + wbs.nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); + nr_to_write = wbs.nr_dirty + wbs.nr_unstable + inodes_stat.nr_inodes; while (nr_to_write > 0) { wbc.encountered_congestion = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; diff -puN mm/page_alloc.c~rollup mm/page_alloc.c --- linux-2.6/mm/page_alloc.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/page_alloc.c 2004-08-20 18:15:23.000000000 +1000 @@ -87,7 +87,7 @@ static void bad_page(const char *functio page->flags &= ~(1 << PG_private | 1 << PG_locked | 1 << PG_lru | - 1 << PG_active | + 1 << PG_active_mapped | 1 << PG_dirty | 1 << PG_swapcache | 1 << PG_writeback); @@ -226,7 +226,8 @@ static inline void free_pages_check(cons 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_active | + 1 << PG_active_mapped | + 1 << PG_active_unmapped | 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | @@ -259,8 +260,6 @@ free_pages_bulk(struct zone *zone, int c base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); - zone->all_unreclaimable = 0; - zone->pages_scanned = 0; while (!list_empty(list) && count--) { page = list_entry(list->prev, struct page, lru); /* have to delete it as __free_pages_bulk list manipulates */ @@ -347,7 +346,8 @@ static void prep_new_page(struct page *p 1 << PG_private | 1 << PG_locked | 1 << PG_lru | - 1 << PG_active | + 1 << PG_active_mapped | + 1 << PG_active_unmapped | 1 << PG_dirty | 1 << PG_reclaim | 1 << PG_swapcache | @@ -664,6 +664,8 @@ buffered_rmqueue(struct zone *zone, int if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); + if (numa_node_id() != zone->zone_pgdat->node_id) + mod_page_state(pgalloc_remote, 1 << order); prep_new_page(page, order); if (order && (gfp_flags & 
__GFP_COMP)) prep_compound_page(page, order); @@ -671,6 +673,8 @@ buffered_rmqueue(struct zone *zone, int return page; } +int vm_free_local_harder = 1; + /* * This is the 'heart' of the zoned buddy allocator. * @@ -699,7 +703,6 @@ __alloc_pages(unsigned int gfp_mask, uns struct task_struct *p = current; int i; int alloc_type; - int do_retry; int can_try_harder; might_sleep_if(wait); @@ -726,6 +729,38 @@ __alloc_pages(unsigned int gfp_mask, uns alloc_type = zone_idx(zones[0]); + if (!vm_free_local_harder || + (p->flags & (PF_MEMALLOC | PF_MEMDIE)) || !wait) + goto no_local_harder; + + /* Go through the zonelist, looking for a local zone with enough free */ + if (zones[0]->zone_pgdat->node_id == numa_node_id()) { + for (i = 0; (z = zones[i]) != NULL; i++) { + if (z->zone_pgdat->node_id != numa_node_id()) + break; + + min = z->pages_high + (1<protection[alloc_type]; + + if (z->free_pages < min) + continue; + + page = buffered_rmqueue(z, order, gfp_mask); + if (page) + goto got_pg; + } + + p->flags |= PF_MEMALLOC; + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + + try_to_free_pages(zones, gfp_mask, order, 1); + + p->reclaim_state = NULL; + p->flags &= ~PF_MEMALLOC; + + } + +no_local_harder: /* Go through the zonelist once, looking for a zone with enough free */ for (i = 0; (z = zones[i]) != NULL; i++) { min = z->pages_low + (1<protection[alloc_type]; @@ -782,7 +817,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - try_to_free_pages(zones, gfp_mask, order); + try_to_free_pages(zones, gfp_mask, order, 0); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -811,16 +846,11 @@ rebalance: * In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order * <= 3, but that may not be true in other implementations. */ - do_retry = 0; if (!(gfp_mask & __GFP_NORETRY)) { if ((order <= 3) || (gfp_mask & __GFP_REPEAT)) - do_retry = 1; + goto rebalance; if (gfp_mask & __GFP_NOFAIL) - do_retry = 1; - } - if (do_retry) { - blk_congestion_wait(WRITE, HZ/50); - goto rebalance; + goto rebalance; } nopage: @@ -1070,7 +1100,7 @@ void get_zone_counts(unsigned long *acti *inactive = 0; *free = 0; for_each_zone(zone) { - *active += zone->nr_active; + *active += zone->nr_active_mapped + zone->nr_active_unmapped; *inactive += zone->nr_inactive; *free += zone->free_pages; } @@ -1188,7 +1218,7 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone->nr_active), + K(zone->nr_active_mapped + zone->nr_active_unmapped), K(zone->nr_inactive), K(zone->present_pages) ); @@ -1586,8 +1616,6 @@ static void __init free_area_init_core(s zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* * The per-cpu-pages pools are set to around 1000th of the * size of the zone. 
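
With free_local_harder set, the __alloc_pages() change above adds one extra pass: zones on the allocating node are tried against the higher pages_high watermark and, failing that, a single round of node-local direct reclaim runs before remote zones are considered at the usual watermarks. A toy model of just that decision follows; the zone sizes, watermarks and reclaim amount are invented, and the real code also folds in the (1 << order) and per-zone protection[] terms. The knob itself is the free_local_harder entry added to vm_table earlier in the diff.

#include <stdio.h>

struct zone { const char *name; int node; long free, pages_low, pages_high; };

static struct zone zonelist[] = {       /* local zones first, as in a zonelist */
    { "node0/Normal", 0,  80, 100, 300 },
    { "node1/Normal", 1, 900, 100, 300 },
};
#define NZONES (sizeof(zonelist) / sizeof(zonelist[0]))

/* Stands in for the try_to_free_pages(zones, ..., 1) call restricted to
 * the local node; the amount reclaimed is obviously made up.             */
static void reclaim_local(int node)
{
    for (unsigned int i = 0; i < NZONES; i++)
        if (zonelist[i].node == node)
            zonelist[i].free += 250;
}

static struct zone *alloc_page_on(int node, int free_local_harder)
{
    if (free_local_harder) {
        for (int pass = 0; pass < 2; pass++) {
            for (unsigned int i = 0; i < NZONES; i++) {
                struct zone *z = &zonelist[i];
                if (z->node != node)
                    break;              /* stop at the first remote zone    */
                if (z->free >= z->pages_high)
                    return z;           /* local zone is comfortably free   */
            }
            if (pass == 0)
                reclaim_local(node);    /* one local direct-reclaim pass    */
        }
    }
    for (unsigned int i = 0; i < NZONES; i++)   /* normal fallback path     */
        if (zonelist[i].free >= zonelist[i].pages_low)
            return &zonelist[i];
    return NULL;
}

int main(void)
{
    printf("free_local_harder=0: from %s\n", alloc_page_on(0, 0)->name);
    printf("free_local_harder=1: from %s\n", alloc_page_on(0, 1)->name);
    return 0;
}
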
But no more than 1/4 of a meg - there's @@ -1621,12 +1649,17 @@ static void __init free_area_init_core(s } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); - INIT_LIST_HEAD(&zone->active_list); + INIT_LIST_HEAD(&zone->active_mapped_list); + INIT_LIST_HEAD(&zone->active_unmapped_list); INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; + zone->nr_scan_active_mapped = 0; + zone->nr_scan_active_unmapped = 0; zone->nr_scan_inactive = 0; - zone->nr_active = 0; + zone->nr_dirty_inactive = 0; + zone->nr_active_mapped = 0; + zone->nr_active_unmapped = 0; zone->nr_inactive = 0; + INIT_LIST_HEAD(&zone->zone_shrinker_list); if (!size) continue; @@ -1776,10 +1809,11 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", "pgalloc_dma", + "pgalloc_remote", "pgfree", "pgactivate", "pgdeactivate", @@ -1990,13 +2024,18 @@ static void setup_per_zone_pages_min(voi } for_each_zone(zone) { + unsigned long tmp; spin_lock_irqsave(&zone->lru_lock, flags); + tmp = (pages_min * zone->present_pages) / lowmem_pages; if (is_highmem(zone)) { /* - * Often, highmem doesn't need to reserve any pages. - * But the pages_min/low/high values are also used for - * batching up page reclaim activity so we need a - * decent value here. + * __GFP_HIGH and PF_MEMALLOC allocations usually don't + * need highmem pages, so cap pages_min to a small + * value here. + * + * The (pages_high-pages_low) and (pages_low-pages_min) + * deltas controls asynch page reclaim, and so should + * not be capped for highmem. */ int min_pages; @@ -2007,15 +2046,15 @@ static void setup_per_zone_pages_min(voi min_pages = 128; zone->pages_min = min_pages; } else { - /* if it's a lowmem zone, reserve a number of pages + /* + * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->pages_min = (pages_min * zone->present_pages) / - lowmem_pages; + zone->pages_min = tmp; } - zone->pages_low = zone->pages_min * 2; - zone->pages_high = zone->pages_min * 3; + zone->pages_low = zone->pages_min + tmp; + zone->pages_high = zone->pages_low + tmp; spin_unlock_irqrestore(&zone->lru_lock, flags); } } diff -puN mm/rmap.c~rollup mm/rmap.c --- linux-2.6/mm/rmap.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/rmap.c 2004-08-20 18:15:21.000000000 +1000 @@ -252,15 +252,15 @@ unsigned long page_address_in_vma(struct * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. 
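
The setup_per_zone_pages_min() change above spaces the watermarks by the zone's proportional share (tmp) instead of by multiples of pages_min, so capping a highmem zone's pages_min no longer squeezes the low/high band that drives asynchronous reclaim. Worked through with invented zone sizes; the highmem cap below assumes the usual present/1024 clamped to 32..128, which is only partly visible in this hunk.

#include <stdio.h>

int main(void)
{
    long pages_min = 1024;              /* global target, e.g. min_free_kbytes */
    struct { const char *name; long present; int highmem; } zones[] = {
        { "DMA",      4096,  0 },
        { "Normal", 225280,  0 },
        { "HighMem",262144,  1 },
    };
    long lowmem_pages = 4096 + 225280;  /* highmem excluded from the split  */

    for (int i = 0; i < 3; i++) {
        long tmp = pages_min * zones[i].present / lowmem_pages;
        long min = tmp;

        if (zones[i].highmem) {         /* cap pages_min, as in the patch   */
            min = zones[i].present / 1024;
            if (min < 32)
                min = 32;
            if (min > 128)
                min = 128;
        }
        printf("%-8s min=%5ld low=%5ld high=%5ld\n",
               zones[i].name, min, min + tmp, min + 2 * tmp);
    }
    return 0;
}
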
*/ -static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount) +static void page_gather_one(struct page *page, + struct vm_area_struct *vma, unsigned int *mapcount, + int *referenced, int *dirty) { struct mm_struct *mm = vma->vm_mm; unsigned long address; pgd_t *pgd; pmd_t *pmd; pte_t *pte; - int referenced = 0; if (!mm->rss) goto out; @@ -286,7 +286,10 @@ static int page_referenced_one(struct pa goto out_unmap; if (ptep_clear_flush_young(vma, address, pte)) - referenced++; + (*referenced)++; + + if (pte_dirty(*pte)) + (*dirty)++; if (mm != current->mm && has_swap_token(mm)) referenced++; @@ -301,28 +304,27 @@ out_unmap: out_unlock: spin_unlock(&mm->page_table_lock); out: - return referenced; + ; } -static int page_referenced_anon(struct page *page) +static inline void +page_gather_anon(struct page *page, int *referenced, int *dirty) { unsigned int mapcount; struct anon_vma *anon_vma; struct vm_area_struct *vma; - int referenced = 0; anon_vma = page_lock_anon_vma(page); if (!anon_vma) - return referenced; + return; mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - referenced += page_referenced_one(page, vma, &mapcount); + page_gather_one(page, vma, &mapcount, referenced, dirty); if (!mapcount) break; } spin_unlock(&anon_vma->lock); - return referenced; } /** @@ -336,14 +338,14 @@ static int page_referenced_anon(struct p * * This function is only called from page_referenced for object-based pages. */ -static int page_referenced_file(struct page *page) +static inline void +page_gather_file(struct page *page, int *referenced, int *dirty) { unsigned int mapcount; struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); struct vm_area_struct *vma; struct prio_tree_iter iter; - int referenced = 0; /* * The caller's checks on page->mapping and !PageAnon have made @@ -371,16 +373,15 @@ static int page_referenced_file(struct p vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; + (*referenced)++; break; } - referenced += page_referenced_one(page, vma, &mapcount); + page_gather_one(page, vma, &mapcount, referenced, dirty); if (!mapcount) break; } spin_unlock(&mapping->i_mmap_lock); - return referenced; } /** @@ -391,29 +392,29 @@ static int page_referenced_file(struct p * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. 
*/ -int page_referenced(struct page *page, int is_locked) +void page_gather(struct page *page, int is_locked, int *referenced, int *dirty) { - int referenced = 0; + *referenced = 0; + *dirty = 0; if (page_test_and_clear_young(page)) - referenced++; + (*referenced)++; if (TestClearPageReferenced(page)) - referenced++; + (*referenced)++; if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page); + page_gather_anon(page, referenced, dirty); else if (is_locked) - referenced += page_referenced_file(page); + page_gather_file(page, referenced, dirty); else if (TestSetPageLocked(page)) - referenced++; + (*referenced)++; else if (page->mapping) { - referenced += page_referenced_file(page); + page_gather_file(page, referenced, dirty); unlock_page(page); } } - return referenced; } /** diff -puN mm/shmem.c~rollup mm/shmem.c --- linux-2.6/mm/shmem.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/shmem.c 2004-08-20 18:15:21.000000000 +1000 @@ -1431,11 +1431,7 @@ static void do_shmem_file_read(struct fi */ if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - /* - * Mark the page accessed if we read the beginning. - */ - if (!offset) - mark_page_accessed(page); + mark_page_accessed(page); } /* diff -puN mm/swap.c~rollup mm/swap.c --- linux-2.6/mm/swap.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swap.c 2004-08-20 18:15:22.000000000 +1000 @@ -78,14 +78,18 @@ int rotate_reclaimable_page(struct page return 1; if (PageDirty(page)) return 1; - if (PageActive(page)) + if (PageActiveMapped(page)) + return 1; + if (PageActiveUnmapped(page)) return 1; if (!PageLRU(page)) return 1; zone = page_zone(page); spin_lock_irqsave(&zone->lru_lock, flags); - if (PageLRU(page) && !PageActive(page)) { + if (PageLRU(page) + && !PageActiveMapped(page) && !PageActiveUnmapped(page)) { + list_del(&page->lru); list_add_tail(&page->lru, &zone->inactive_list); inc_page_state(pgrotated); @@ -96,48 +100,11 @@ int rotate_reclaimable_page(struct page return 0; } -/* - * FIXME: speed this up? - */ -void fastcall activate_page(struct page *page) -{ - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); - SetPageActive(page); - add_page_to_active_list(zone, page); - inc_page_state(pgactivate); - } - spin_unlock_irq(&zone->lru_lock); -} - -/* - * Mark a page as having seen activity. 
- * - * inactive,unreferenced -> inactive,referenced - * inactive,referenced -> active,unreferenced - * active,unreferenced -> active,referenced - */ -void fastcall mark_page_accessed(struct page *page) -{ - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { - activate_page(page); - ClearPageReferenced(page); - } else if (!PageReferenced(page)) { - SetPageReferenced(page); - } -} - -EXPORT_SYMBOL(mark_page_accessed); - /** * lru_cache_add: add a page to the page lists * @page: the page to add */ static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, }; -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, }; void fastcall lru_cache_add(struct page *page) { @@ -149,25 +116,12 @@ void fastcall lru_cache_add(struct page put_cpu_var(lru_add_pvecs); } -void fastcall lru_cache_add_active(struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); - - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); -} - void lru_add_drain(void) { struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &__get_cpu_var(lru_add_active_pvecs); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); put_cpu_var(lru_add_pvecs); } @@ -304,6 +258,7 @@ void __pagevec_lru_add(struct pagevec *p } if (TestSetPageLRU(page)) BUG(); + ClearPageUsedOnce(page); add_page_to_inactive_list(zone, page); } if (zone) @@ -314,33 +269,6 @@ void __pagevec_lru_add(struct pagevec *p EXPORT_SYMBOL(__pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) -{ - int i; - struct zone *zone = NULL; - - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - if (TestSetPageLRU(page)) - BUG(); - if (TestSetPageActive(page)) - BUG(); - add_page_to_active_list(zone, page); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); -} - /* * Try to drop buffers from the pages in a pagevec */ @@ -422,9 +350,6 @@ static void lru_drain_cache(unsigned int /* CPU is dead, so no locking needed. */ if (pagevec_count(pvec)) __pagevec_lru_add(pvec); - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); } /* Drop the CPU's cached committed space back into the central pool. */ diff -puN mm/swap_state.c~rollup mm/swap_state.c --- linux-2.6/mm/swap_state.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swap_state.c 2004-08-20 18:15:22.000000000 +1000 @@ -375,7 +375,8 @@ struct page *read_swap_cache_async(swp_e /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add(new_page); + mark_page_accessed(new_page); swap_readpage(NULL, new_page); return new_page; } diff -puN mm/swapfile.c~rollup mm/swapfile.c --- linux-2.6/mm/swapfile.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/swapfile.c 2004-08-20 18:15:21.000000000 +1000 @@ -469,10 +469,10 @@ static unsigned long unuse_pmd(struct vm pte_unmap(pte); /* - * Move the page to the active list so it is not - * immediately swapped out again after swapon. + * Touch the page so it is not immediately swapped + * out again after swapon. 
*/ - activate_page(page); + mark_page_accessed(page); /* add 1 since address may be 0 */ return 1 + offset + address; diff -puN mm/vmscan.c~rollup mm/vmscan.c --- linux-2.6/mm/vmscan.c~rollup 2004-08-20 18:15:21.000000000 +1000 +++ linux-2.6-npiggin/mm/vmscan.c 2004-08-20 18:15:23.000000000 +1000 @@ -58,6 +58,12 @@ struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; + /* Incremented by the number of congested pages that we encountered */ + unsigned long nr_congested; + + /* Number of dirty pages we're putting on the inactive list */ + unsigned long nr_dirty_inactive; + /* Incremented by the number of pages reclaimed */ unsigned long nr_reclaimed; @@ -66,12 +72,13 @@ struct scan_control { /* How many pages shrink_cache() should reclaim */ int nr_to_reclaim; - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; + /* Are all zones in the current scan unreclaimable? */ + int all_unreclaimable; /* This context's GFP mask */ unsigned int gfp_mask; + int preserve_active; /* Don't eat into the active list */ int may_writepage; }; @@ -117,10 +124,9 @@ struct shrinker { #endif /* - * From 0 .. 100. Higher means more swappy. + * From 1 .. 100. Higher means less swappy. */ -int vm_swappiness = 60; -static long total_memory; +int vm_mapped_page_cost = 32; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -130,16 +136,16 @@ static DECLARE_RWSEM(shrinker_rwsem); */ struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) { - struct shrinker *shrinker; + struct shrinker *shrinker; - shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); - if (shrinker) { - shrinker->shrinker = theshrinker; - shrinker->seeks = seeks; - shrinker->nr = 0; - down_write(&shrinker_rwsem); - list_add(&shrinker->list, &shrinker_list); - up_write(&shrinker_rwsem); + shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); + if (shrinker) { + shrinker->shrinker = theshrinker; + shrinker->seeks = seeks; + shrinker->nr = 0; + down_write(&shrinker_rwsem); + list_add(&shrinker->list, &shrinker_list); + up_write(&shrinker_rwsem); } return shrinker; } @@ -157,6 +163,81 @@ void remove_shrinker(struct shrinker *sh } EXPORT_SYMBOL(remove_shrinker); +static unsigned int zone_shrinker_idx; + +/* + * Add a shrinker callback to be called from the vm + */ +int set_zone_shrinker(zone_shrinker_fn fn, int seeks) +{ + int idx; + struct zone_shrinker *zs; + struct zone *zone; + + down_write(&shrinker_rwsem); + idx = zone_shrinker_idx++; + + for_each_zone(zone) { + zs = kmalloc(sizeof(*zs), GFP_KERNEL); + if (!zs) { + up_write(&shrinker_rwsem); + remove_zone_shrinker(idx); + return -ENOMEM; + } + INIT_LIST_HEAD(&zs->lru); + zs->shrinker = fn; + zs->seeks = seeks; + zs->nr = 0; + zs->idx = idx; + spin_lock_irq(&zone->lru_lock); + list_add(&zs->list, &zone->zone_shrinker_list); + spin_unlock_irq(&zone->lru_lock); + } + up_write(&shrinker_rwsem); + return idx; +} +EXPORT_SYMBOL(set_zone_shrinker); + +struct zone_shrinker *get_zone_shrinker(struct zone *zone, int idx) +{ + struct zone_shrinker *zs; + struct zone_shrinker *ret = NULL; + + spin_lock_irq(&zone->lru_lock); + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { + if (zs->idx == idx) { + ret = zs; + break; + } + } + spin_unlock_irq(&zone->lru_lock); + return ret; +} +EXPORT_SYMBOL(get_zone_shrinker); + +/* + * Remove one + */ +void remove_zone_shrinker(int idx) +{ + struct zone *zone; + + down_write(&shrinker_rwsem); + for_each_zone(zone) { + struct zone_shrinker 
*zs; + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { + if (zs->idx == idx) { + spin_lock_irq(&zone->lru_lock); + list_del(&zs->list); + spin_unlock_irq(&zone->lru_lock); + kfree(zs); + } + } + } + up_write(&shrinker_rwsem); +} +EXPORT_SYMBOL(remove_zone_shrinker); + #define SHRINK_BATCH 128 /* * Call the shrink functions to age shrinkable caches @@ -175,36 +256,36 @@ EXPORT_SYMBOL(remove_shrinker); * are eligible for the caller's allocation attempt. It is used for balancing * slab reclaim versus page reclaim. */ -static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, - unsigned long lru_pages) +static int shrink_slab(struct zone *zone, unsigned long scanned, unsigned long lru_pages, unsigned int gfp_mask) { + struct zone_shrinker *zs; struct shrinker *shrinker; if (scanned == 0) - return 0; + scanned = 1; if (!down_read_trylock(&shrinker_rwsem)) return 0; - list_for_each_entry(shrinker, &shrinker_list, list) { + list_for_each_entry(zs, &zone->zone_shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - delta = (4 * scanned) / shrinker->seeks; - delta *= (*shrinker->shrinker)(0, gfp_mask); - do_div(delta, lru_pages + 1); - shrinker->nr += delta; - if (shrinker->nr < 0) - shrinker->nr = LONG_MAX; /* It wrapped! */ + delta = (4 * scanned) / zs->seeks; + delta *= (*zs->shrinker)(zs, 0, gfp_mask); + do_div(delta, zone->nr_inactive + zone->nr_active_mapped + zone->nr_active_unmapped + 1); + zs->nr_scan += delta; + if (zs->nr_scan < 0) + zs->nr_scan = LONG_MAX; /* It wrapped! */ - total_scan = shrinker->nr; - shrinker->nr = 0; + total_scan = zs->nr_scan; + zs->nr_scan = 0; while (total_scan >= SHRINK_BATCH) { long this_scan = SHRINK_BATCH; int shrink_ret; - shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + shrink_ret = (*zs->shrinker)(zs, this_scan, gfp_mask); if (shrink_ret == -1) break; mod_page_state(slabs_scanned, this_scan); @@ -213,31 +294,43 @@ static int shrink_slab(unsigned long sca cond_resched(); } - shrinker->nr += total_scan; + zs->nr_scan += total_scan; } - up_read(&shrinker_rwsem); - return 0; -} -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) -{ - struct address_space *mapping; + list_for_each_entry(shrinker, &shrinker_list, list) { + unsigned long long delta = 0; + unsigned long nr_slab; + unsigned long total_scan; - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; + nr_slab = (*shrinker->shrinker)(0, gfp_mask); + if (nr_slab > shrinker->nr) { + delta = (scanned / shrinker->seeks) + 1; + delta *= (nr_slab - shrinker->nr); + do_div(delta, lru_pages + 1); + } + shrinker->nr += delta + 1; - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; + total_scan = shrinker->nr; + shrinker->nr = 0; - mapping = page_mapping(page); - if (!mapping) - return 0; + while (total_scan >= SHRINK_BATCH) { + long this_scan = SHRINK_BATCH; + int shrink_ret; + + shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask); + if (shrink_ret == -1) + break; + mod_page_state(slabs_scanned, this_scan); + total_scan -= this_scan; - /* File is mmap'd by somebody? 
*/ - return mapping_mapped(mapping); + cond_resched(); + } + + shrinker->nr += total_scan; + } + + up_read(&shrinker_rwsem); + return 0; } static inline int is_page_cache_freeable(struct page *page) @@ -245,13 +338,17 @@ static inline int is_page_cache_freeable return page_count(page) - !!PagePrivate(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, struct scan_control *sc) { + int congested = bdi_write_congested(bdi); + if (congested) + sc->nr_congested++; + if (current_is_kswapd()) return 1; if (current_is_pdflush()) /* This is unlikely, but why not... */ return 1; - if (!bdi_write_congested(bdi)) + if (!congested) return 1; if (bdi == current->backing_dev_info) return 1; @@ -284,9 +381,10 @@ static void handle_write_error(struct ad } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +static pageout_t pageout(struct page *page, struct address_space *mapping, + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -311,7 +409,7 @@ static pageout_t pageout(struct page *pa return PAGE_KEEP; if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -359,7 +457,7 @@ static int shrink_list(struct list_head struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; + int referenced, dirty, mapped; page = lru_to_page(page_list); list_del(&page->lru); @@ -367,20 +465,34 @@ static int shrink_list(struct list_head if (TestSetPageLocked(page)) goto keep; - BUG_ON(PageActive(page)); + BUG_ON(PageActiveMapped(page) || PageActiveUnmapped(page)); if (PageWriteback(page)) goto keep_locked; + mapped = page_mapped(page); sc->nr_scanned++; - /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) - sc->nr_scanned++; - - referenced = page_referenced(page, 1); - /* In active use or really unfreeable? Activate it. */ - if (referenced && page_mapping_inuse(page)) - goto activate_locked; + /* Increase the slab pressure for mapped pages */ + if (mapped) + sc->nr_scanned += vm_mapped_page_cost; + + page_gather(page, 1, &referenced, &dirty); + /* Has been referenced. Activate it. */ + if (referenced) { + /* + * Has been referenced.
Activate used twice or + * mapped pages, otherwise give it another chance + * on the inactive list + */ + if (TestSetPageUsedOnce(page) || mapped) + goto activate_locked; + if (dirty) { + set_page_dirty(page); + sc->nr_dirty_inactive++; + } + sc->nr_scanned--; /* Don't count pages' first round */ + goto keep_locked; + } #ifdef CONFIG_SWAP /* @@ -413,16 +525,16 @@ static int shrink_list(struct list_head } if (PageDirty(page)) { - if (referenced) - goto keep_locked; - if (!may_enter_fs) - goto keep_locked; - if (laptop_mode && !sc->may_writepage) + if (!may_enter_fs || + (laptop_mode && !sc->may_writepage)) { + sc->nr_dirty_inactive++; goto keep_locked; + } /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { + switch(pageout(page, mapping, sc)) { case PAGE_KEEP: + sc->nr_dirty_inactive++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -484,7 +596,7 @@ static int shrink_list(struct list_head /* * The non-racy check for busy page. It is critical to check * PageDirty _after_ making sure that the page is freeable and - * not in use by anybody. (pagecache + us == 2) + * not in use by anybody. (pagecache + us == 2) */ if (page_count(page) != 2 || PageDirty(page)) { write_unlock_irq(&mapping->tree_lock); @@ -514,7 +626,10 @@ free_it: continue; activate_locked: - SetPageActive(page); + if (page_mapped(page)) + SetPageActiveMapped(page); + else + SetPageActiveUnmapped(page); pgactivate++; keep_locked: unlock_page(page); @@ -579,13 +694,14 @@ static void shrink_cache(struct zone *zo nr_taken++; } zone->nr_inactive -= nr_taken; - zone->pages_scanned += nr_taken; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - max_scan -= nr_scan; + if (nr_taken == 0) { + spin_lock_irq(&zone->lru_lock); + continue; + } + if (current_is_kswapd()) mod_page_state_zone(zone, pgscan_kswapd, nr_scan); else @@ -605,9 +721,13 @@ static void shrink_cache(struct zone *zo if (TestSetPageLRU(page)) BUG(); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else + if (PageActiveMapped(page)) { + ClearPageUsedOnce(page); + add_page_to_active_mapped_list(zone, page); + } else if (PageActiveUnmapped(page)) { + ClearPageUsedOnce(page); + add_page_to_active_unmapped_list(zone, page); + } else add_page_to_inactive_list(zone, page); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -615,9 +735,8 @@ static void shrink_cache(struct zone *zo spin_lock_irq(&zone->lru_lock); } } - } + } spin_unlock_irq(&zone->lru_lock); -done: pagevec_release(&pvec); } @@ -639,9 +758,9 @@ done: * But we had to alter page->flags anyway. 
*/ static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +shrink_active_list(struct zone *zone, struct list_head *list, unsigned long *nr_list_pages, struct scan_control *sc) { - int pgmoved; + int pgmoved, pgmoved_unmapped; int pgdeactivate = 0; int pgscanned = 0; int nr_pages = sc->nr_to_scan; @@ -650,17 +769,14 @@ refill_inactive_zone(struct zone *zone, LIST_HEAD(l_active); /* Pages to go onto the active_list */ struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - long mapped_ratio; - long distress; - long swap_tendency; lru_add_drain(); pgmoved = 0; + spin_lock_irq(&zone->lru_lock); - while (pgscanned < nr_pages && !list_empty(&zone->active_list)) { - page = lru_to_page(&zone->active_list); - prefetchw_prev_lru_page(page, &zone->active_list, flags); + while (pgscanned < nr_pages && !list_empty(list)) { + page = lru_to_page(list); + prefetchw_prev_lru_page(page, list, flags); if (!TestClearPageLRU(page)) BUG(); list_del(&page->lru); @@ -673,58 +789,37 @@ refill_inactive_zone(struct zone *zone, */ __put_page(page); SetPageLRU(page); - list_add(&page->lru, &zone->active_list); + list_add(&page->lru, list); } else { list_add(&page->lru, &l_hold); pgmoved++; } pgscanned++; } - zone->nr_active -= pgmoved; + *nr_list_pages -= pgmoved; + zone->pages_scanned += pgmoved; spin_unlock_irq(&zone->lru_lock); - /* - * `distress' is a measure of how much trouble we're having reclaiming - * pages. 0 -> no problems. 100 -> great trouble. - */ - distress = 100 >> zone->prev_priority; - - /* - * The point of this algorithm is to decide when to start reclaiming - * mapped memory instead of just pagecache. Work out how much memory - * is mapped. - */ - mapped_ratio = (sc->nr_mapped * 100) / total_memory; - - /* - * Now decide how much we really want to unmap some pages. The mapped - * ratio is downgraded - just because there's a lot of mapped memory - * doesn't necessarily mean that page reclaim isn't succeeding. - * - * The distress ratio is important - we don't want to start going oom. - * - * A 100% value of vm_swappiness overrides this algorithm altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; - - /* - * Now use this metric to decide whether to start moving mapped memory - * onto the inactive list. 
- */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - while (!list_empty(&l_hold)) { + int referenced, dirty; + page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0)) { - list_add(&page->lru, &l_active); - continue; - } + + if ((total_swap_pages == 0 && PageAnon(page))) { + list_add(&page->lru, &l_active); + continue; + } + page_gather(page, 0, &referenced, &dirty); + if (referenced) { + list_add(&page->lru, &l_active); + continue; + } + if (dirty) { + set_page_dirty(page); + sc->nr_dirty_inactive++; } + list_add(&page->lru, &l_inactive); } @@ -736,7 +831,8 @@ refill_inactive_zone(struct zone *zone, prefetchw_prev_lru_page(page, &l_inactive, flags); if (TestSetPageLRU(page)) BUG(); - if (!TestClearPageActive(page)) + if (!TestClearPageActiveMapped(page) + && !TestClearPageActiveUnmapped(page)) BUG(); list_move(&page->lru, &zone->inactive_list); pgmoved++; @@ -760,23 +856,37 @@ refill_inactive_zone(struct zone *zone, } pgmoved = 0; + pgmoved_unmapped = 0; while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); if (TestSetPageLRU(page)) BUG(); - BUG_ON(!PageActive(page)); - list_move(&page->lru, &zone->active_list); - pgmoved++; + if(!TestClearPageActiveMapped(page) + && !TestClearPageActiveUnmapped(page)) + BUG(); + if (page_mapped(page)) { + SetPageActiveMapped(page); + list_move(&page->lru, &zone->active_mapped_list); + pgmoved++; + } else { + SetPageActiveUnmapped(page); + list_move(&page->lru, &zone->active_unmapped_list); + pgmoved_unmapped++; + } + if (!pagevec_add(&pvec, page)) { - zone->nr_active += pgmoved; + zone->nr_active_mapped += pgmoved; pgmoved = 0; + zone->nr_active_unmapped += pgmoved_unmapped; + pgmoved_unmapped = 0; spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); spin_lock_irq(&zone->lru_lock); } } - zone->nr_active += pgmoved; + zone->nr_active_mapped += pgmoved; + zone->nr_active_unmapped += pgmoved_unmapped; spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); @@ -784,52 +894,121 @@ refill_inactive_zone(struct zone *zone, mod_page_state(pgdeactivate, pgdeactivate); } +#define SCAN_MASK 0x00000fff +#define SCAN_SHIFT 7 + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static void shrink_zone(struct zone *zone, struct scan_control *sc) { + unsigned long long tmp; + unsigned long scan_active, scan_active_mapped, scan_active_unmapped; + unsigned long scan_inactive; unsigned long nr_active; - unsigned long nr_inactive; + int count; + + if (sc->preserve_active) { + if (zone->nr_inactive * 8 >= + zone->nr_active_mapped + zone->nr_active_unmapped) + sc->all_unreclaimable = 0; + } else if (!zone->all_unreclaimable) + sc->all_unreclaimable = 0; + if (zone->all_unreclaimable) { + scan_inactive = zone->nr_inactive; + scan_active_mapped = 1; + scan_active_unmapped = vm_mapped_page_cost; + goto scan; + } + + nr_active = zone->nr_active_mapped + zone->nr_active_unmapped; + scan_inactive = (nr_active + zone->nr_inactive); + + if (nr_active >= (zone->nr_inactive + 1) && !sc->preserve_active) { + /* + * Add one to `nr_to_scan' just to make sure that the kernel + * will slowly sift through the active list. 
+ */ + if (nr_active >= 4*(zone->nr_inactive*2 + 1)) { + /* Don't scan more than 4 times inactive list scan */ + scan_active = 4*scan_inactive; + } else { + /* Cast to long long so the multiply doesn't overflow */ + tmp = (unsigned long long)scan_inactive * nr_active; + do_div(tmp, zone->nr_inactive*2 + 1); + scan_active = (unsigned long)tmp; + } + scan_active *= 2; + + tmp = scan_active * zone->nr_active_mapped; + do_div(tmp, nr_active + 1); + scan_active_mapped = ((unsigned long)tmp + 1) + / vm_mapped_page_cost; + scan_active_unmapped = scan_active - tmp + 1; + } else { + /* Don't scan the active list if the inactive list is large */ + scan_active_mapped = zone->nr_active_mapped / 32; + scan_active_unmapped = zone->nr_active_unmapped * vm_mapped_page_cost / 32; + } + +scan: + /* zero this before scanning */ + sc->nr_dirty_inactive = 0; + sc->nr_to_scan = SWAP_CLUSTER_MAX; + + count = (zone->nr_scan_active_mapped + scan_active_mapped); + zone->nr_scan_active_mapped = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + count -= SWAP_CLUSTER_MAX; + shrink_active_list(zone, &zone->active_mapped_list, + &zone->nr_active_mapped, sc); + } + + count = (zone->nr_scan_active_unmapped + scan_active_unmapped); + zone->nr_scan_active_unmapped = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + count -= SWAP_CLUSTER_MAX; + shrink_active_list(zone, &zone->active_unmapped_list, + &zone->nr_active_unmapped, sc); + } + + count = (zone->nr_scan_inactive + scan_inactive); + zone->nr_scan_inactive = count & SCAN_MASK; + count >>= SCAN_SHIFT; + while (count >= SWAP_CLUSTER_MAX) { + if (sc->nr_to_reclaim <= 0) + break; + count -= SWAP_CLUSTER_MAX; + shrink_cache(zone, sc); + } /* - * Add one to `nr_to_scan' just to make sure that the kernel will - * slowly sift through the active list. + * Try to write back as many pages as the number of dirty ones + * we're adding to the inactive list. This tends to cause slow + * streaming writers to write data to the disk smoothly, at the + * dirtying rate, which is nice. But that's undesirable in + * laptop mode, where we *want* lumpy writeout. So in laptop + * mode, write out the whole world. */ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; - nr_active = zone->nr_scan_active; - if (nr_active >= SWAP_CLUSTER_MAX) - zone->nr_scan_active = 0; - else - nr_active = 0; - - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= SWAP_CLUSTER_MAX) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - sc->nr_to_reclaim = SWAP_CLUSTER_MAX; - - while (nr_active || nr_inactive) { - if (nr_active) { - sc->nr_to_scan = min(nr_active, - (unsigned long)SWAP_CLUSTER_MAX); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); - } - - if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, - (unsigned long)SWAP_CLUSTER_MAX); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); - if (sc->nr_to_reclaim <= 0) - break; - } + zone->nr_dirty_inactive += sc->nr_dirty_inactive; + count = zone->nr_dirty_inactive; + if (count > zone->nr_inactive / 2 + || (!(laptop_mode && !sc->may_writepage) + && count > SWAP_CLUSTER_MAX)) { + zone->nr_dirty_inactive = 0; + wakeup_bdflush(laptop_mode ? 
0 : count*2); + sc->may_writepage = 1; } + + if (sc->nr_reclaimed) { + zone->all_unreclaimable = 0; + zone->pages_scanned = 0; + } + if (zone->pages_scanned > zone->present_pages) + zone->all_unreclaimable = 1; } /* @@ -849,24 +1028,25 @@ shrink_zone(struct zone *zone, struct sc * scan then give up on it. */ static void -shrink_caches(struct zone **zones, struct scan_control *sc) +shrink_caches(struct zone **zones, struct scan_control *sc, unsigned long lru_pages) { + struct reclaim_state *reclaim_state = current->reclaim_state; int i; + sc->all_unreclaimable = 1; for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; - - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) - continue; /* Let kswapd poll it */ - + if (sc->preserve_active && zone->zone_pgdat->node_id != numa_node_id()) + break; shrink_zone(zone, sc); + shrink_slab(zone, sc->nr_scanned, lru_pages, sc->gfp_mask); + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } } } - + /* * This is the main entry point to direct page reclaim. * @@ -881,67 +1061,54 @@ shrink_caches(struct zone **zones, struc * allocation attempt will fail. */ int try_to_free_pages(struct zone **zones, - unsigned int gfp_mask, unsigned int order) + unsigned int gfp_mask, unsigned int order, int local) { - int priority; int ret = 0; int total_scanned = 0, total_reclaimed = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc; unsigned long lru_pages = 0; int i; + sc.nr_to_reclaim = SWAP_CLUSTER_MAX; sc.gfp_mask = gfp_mask; sc.may_writepage = 0; + sc.preserve_active = local; inc_page_state(allocstall); for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; - - zone->temp_priority = DEF_PRIORITY; - lru_pages += zone->nr_active + zone->nr_inactive; + if (local && zone->zone_pgdat->node_id != numa_node_id()) + break; + lru_pages += zone->nr_active_mapped + + zone->nr_active_unmapped + zone->nr_inactive; } - for (priority = DEF_PRIORITY; priority >= 0; priority--) { + for (;;) { sc.nr_mapped = read_page_state(nr_mapped); + sc.nr_congested = 0; sc.nr_scanned = 0; sc.nr_reclaimed = 0; - sc.priority = priority; - shrink_caches(zones, &sc); - shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); - if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) { - ret = 1; - goto out; - } + shrink_caches(zones, &sc, lru_pages); total_scanned += sc.nr_scanned; total_reclaimed += sc.nr_reclaimed; - - /* - * Try to write back as many pages as we just scanned. This - * tends to cause slow streaming writers to write data to the - * disk smoothly, at the dirtying rate, which is nice. But - * that's undesirable in laptop mode, where we *want* lumpy - * writeout. So in laptop mode, write out the whole world. - */ - if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) { - wakeup_bdflush(laptop_mode ? 
0 : total_scanned); - sc.may_writepage = 1; + if (total_reclaimed >= SWAP_CLUSTER_MAX) { + ret = 1; + goto out; } /* Take a nap, wait for some writeback to complete */ - if (sc.nr_scanned && priority < DEF_PRIORITY - 2) + if (sc.all_unreclaimable) + break; + if (sc.nr_congested * 10 > sc.nr_scanned) { + if (local) + break; blk_congestion_wait(WRITE, HZ/10); + } } - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY) && !local) out_of_memory(gfp_mask); out: - for (i = 0; zones[i] != 0; i++) - zones[i]->prev_priority = zones[i]->temp_priority; return ret; } @@ -972,8 +1139,8 @@ out: */ static int balance_pgdat(pg_data_t *pgdat, int nr_pages) { + int all_zones_ok; int to_free = nr_pages; - int priority; int i; int total_scanned = 0, total_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -981,92 +1148,62 @@ static int balance_pgdat(pg_data_t *pgda sc.gfp_mask = GFP_KERNEL; sc.may_writepage = 0; + sc.preserve_active = 0; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->temp_priority = DEF_PRIORITY; - } - - for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int all_zones_ok = 1; - int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ + for (;;) { unsigned long lru_pages = 0; + int first_low = 0; + all_zones_ok = 1; - if (nr_pages == 0) { - /* - * Scan in the highmem->dma direction for the highest - * zone which needs scanning - */ - for (i = pgdat->nr_zones - 1; i >= 0; i--) { - struct zone *zone = pgdat->node_zones + i; - - if (zone->all_unreclaimable && - priority != DEF_PRIORITY) - continue; + sc.nr_scanned = 0; + sc.nr_congested = 0; + sc.nr_reclaimed = 0; + sc.all_unreclaimable = 1; - if (zone->free_pages <= zone->pages_high) { - end_zone = i; - goto scan; - } - } - goto out; - } else { - end_zone = pgdat->nr_zones - 1; - } -scan: - for (i = 0; i <= end_zone; i++) { + for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - - lru_pages += zone->nr_active + zone->nr_inactive; + if (is_highmem(zone)) + continue; + lru_pages += zone->nr_active_mapped + + zone->nr_active_unmapped + + zone->nr_inactive; } - /* - * Now scan the zone in the dma->highmem direction, stopping - * at the last zone which needs scanning. - * - * We do this because the page allocator works in the opposite - * direction. This prevents the page allocator from allocating - * pages behind kswapd's direction of progress, which would - * cause too much scanning of the lower zones. 
- */ - for (i = 0; i <= end_zone; i++) { + /* Scan in the highmem->dma direction */ + for (i = pgdat->nr_zones - 1; i >= 0; i--) { struct zone *zone = pgdat->node_zones + i; - if (zone->all_unreclaimable && priority != DEF_PRIORITY) - continue; - if (nr_pages == 0) { /* Not software suspend */ - if (zone->free_pages <= zone->pages_high) - all_zones_ok = 0; - } - zone->temp_priority = priority; - if (zone->prev_priority > priority) - zone->prev_priority = priority; - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; + unsigned long pgfree = zone->free_pages; + unsigned long pghigh = zone->pages_high; + + /* + * This satisfies the "incremental min" or + * lower zone protection logic in the allocator + */ + if (first_low > i) + pghigh += zone->protection[first_low]; + if (pgfree >= pghigh) + continue; + if (first_low < i) + first_low = i; + + all_zones_ok = 0; + sc.nr_to_reclaim = pghigh - pgfree; + } else + sc.nr_to_reclaim = INT_MAX; /* Software susp */ + shrink_zone(zone, &sc); reclaim_state->reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); + shrink_slab(zone, sc.nr_scanned, GFP_KERNEL, lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; - if (zone->all_unreclaimable) - continue; - if (zone->pages_scanned > zone->present_pages * 2) - zone->all_unreclaimable = 1; - /* - * If we've done a decent amount of scanning and - * the reclaim ratio is low, start doing writepage - * even in laptop mode - */ - if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) - sc.may_writepage = 1; } + total_reclaimed += sc.nr_reclaimed; + total_scanned += sc.nr_scanned; + if (nr_pages && to_free > total_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) @@ -1075,21 +1212,17 @@ scan: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) + if (sc.all_unreclaimable) + schedule_timeout(HZ/10); + else if (sc.nr_congested * 10 > sc.nr_scanned) blk_congestion_wait(WRITE, HZ/10); } -out: - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = zone->temp_priority; - } return total_reclaimed; } /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1213,7 +1346,6 @@ static int __init kswapd_init(void) for_each_pgdat(pgdat) pgdat->kswapd = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); - total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; } _
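
For illustration only, not part of the patch: a minimal out-of-tree user of the per-zone shrinker interface added in mm/vmscan.c might look like the sketch below. It assumes the declarations introduced here (struct zone_shrinker with its lru/nr fields, set_zone_shrinker(), get_zone_shrinker(), remove_zone_shrinker()) are visible through linux/mm.h, and that the callback prototype matches the way shrink_slab() invokes it, i.e. (*zs->shrinker)(zs, nr_to_scan, gfp_mask) returning the number of objects left on that zone's list, or -1 to abort. The "foo" names are invented, and a real cache would serialize access to the per-zone lists with its own lock, which is omitted here for brevity.

#include <linux/init.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/slab.h>

struct foo_object {
	struct list_head lru;		/* links the object into zs->lru */
	/* ... cached data ... */
};

static int foo_shrinker_idx;

/*
 * Called from shrink_slab(): with nr_to_scan == 0 it only reports the
 * pool size for this zone, otherwise it frees up to nr_to_scan objects
 * from the cold end of the zone-local LRU.
 */
static int foo_shrink(struct zone_shrinker *zs, int nr_to_scan,
			unsigned int gfp_mask)
{
	while (nr_to_scan-- > 0 && !list_empty(&zs->lru)) {
		struct foo_object *obj;

		obj = list_entry(zs->lru.prev, struct foo_object, lru);
		list_del(&obj->lru);
		zs->nr--;
		kfree(obj);
	}
	return zs->nr;
}

/* New objects go onto the LRU of the zone their memory came from. */
static void foo_cache_object(struct foo_object *obj)
{
	struct zone_shrinker *zs;

	zs = get_zone_shrinker(page_zone(virt_to_page(obj)),
				foo_shrinker_idx);
	list_add(&obj->lru, &zs->lru);
	zs->nr++;
}

static int __init foo_init(void)
{
	foo_shrinker_idx = set_zone_shrinker(foo_shrink, DEFAULT_SEEKS);
	return foo_shrinker_idx < 0 ? foo_shrinker_idx : 0;
}

static void __exit foo_exit(void)
{
	remove_zone_shrinker(foo_shrinker_idx);
}

module_init(foo_init);
module_exit(foo_exit);

Keeping the objects on per-zone lists is what lets the new shrink_slab() apply pressure to such a cache in proportion to the LRU size of the particular zone being reclaimed, rather than against one global pool.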
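As a second illustration, the fixed-point batching that shrink_zone() now uses for its scan counts can be written out on its own. This is a stand-alone mirror of the arithmetic only, with SWAP_CLUSTER_MAX taken as 32 as defined in this tree; the helper name and the *carry parameter (standing in for zone->nr_scan_*) are invented for the example.

#define SCAN_MASK		0x00000fff
#define SCAN_SHIFT		7
#define SWAP_CLUSTER_MAX	32

/*
 * Scan credit accumulates in *carry; the low SCAN_MASK bits are kept
 * for the next invocation, the rest is scaled down by SCAN_SHIFT and
 * paid out as whole SWAP_CLUSTER_MAX-page batches, each corresponding
 * to one shrink_active_list()/shrink_cache() call in shrink_zone().
 */
static int nr_scan_batches(unsigned long *carry, unsigned long scan)
{
	unsigned long count = *carry + scan;
	int batches = 0;

	*carry = count & SCAN_MASK;	/* fractional work, saved in the zone */
	count >>= SCAN_SHIFT;		/* scale the request down */
	while (count >= SWAP_CLUSTER_MAX) {
		count -= SWAP_CLUSTER_MAX;
		batches++;		/* one list-shrinking pass each */
	}
	return batches;
}

With these constants a list is not touched until roughly SWAP_CLUSTER_MAX << SCAN_SHIFT (4096) units of scan credit have built up, so lightly pressured lists are left alone rather than being nibbled at on every reclaim pass.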