Index: linux-2.6.8.1-ck9/include/linux/mmzone.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/mmzone.h	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/mmzone.h	2004-10-02 11:40:04.000000000 +1000
@@ -125,7 +125,7 @@ struct zone {
 	 */
 	spinlock_t		lock;
 	unsigned long		free_pages;
-	unsigned long		pages_min, pages_low, pages_high;
+	unsigned long		pages_min, pages_low, pages_high, pages_unmapped;
 	/*
 	 * protection[] is a pre-calculated number of extra pages that must be
 	 * available in a zone in order for __alloc_pages() to allocate memory
@@ -276,6 +276,7 @@ typedef struct pglist_data {
 	struct pglist_data *pgdat_next;
 	wait_queue_head_t kswapd_wait;
 	struct task_struct *kswapd;
+	unsigned long mapped_nrpages;
 } pg_data_t;

 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
Index: linux-2.6.8.1-ck9/include/linux/swap.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/swap.h	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/swap.h	2004-10-02 11:40:04.000000000 +1000
@@ -174,7 +174,8 @@ extern void swap_setup(void);
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
-extern int vm_swappiness;
+extern int vm_mapped;
+extern int vm_hardmaplimit;

 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
Index: linux-2.6.8.1-ck9/include/linux/sysctl.h
===================================================================
--- linux-2.6.8.1-ck9.orig/include/linux/sysctl.h	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/include/linux/sysctl.h	2004-10-02 11:40:04.000000000 +1000
@@ -159,7 +159,7 @@ enum
 	VM_OVERCOMMIT_RATIO=16, /* percent of RAM to allow overcommit in */
 	VM_PAGEBUF=17,		/* struct: Control pagebuf parameters */
 	VM_HUGETLB_PAGES=18,	/* int: Number of available Huge Pages */
-	VM_SWAPPINESS=19,	/* Tendency to steal mapped memory */
+	VM_MAPPED=19,		/* percent mapped min while evicting cache */
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
@@ -167,6 +167,7 @@ enum
 	VM_BLOCK_DUMP=24,	/* block dump mode */
 	VM_HUGETLB_GROUP=25,	/* permitted hugetlb group */
 	VM_VFS_CACHE_PRESSURE=26, /* dcache/icache reclaim pressure */
+	VM_HARDMAPLIMIT=27,	/* Make mapped a hard limit */
 };
Index: linux-2.6.8.1-ck9/kernel/sysctl.c
===================================================================
--- linux-2.6.8.1-ck9.orig/kernel/sysctl.c	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/kernel/sysctl.c	2004-10-02 11:40:04.000000000 +1000
@@ -717,16 +717,24 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
-		.ctl_name	= VM_SWAPPINESS,
-		.procname	= "swappiness",
-		.data		= &vm_swappiness,
-		.maxlen		= sizeof(vm_swappiness),
+		.ctl_name	= VM_MAPPED,
+		.procname	= "mapped",
+		.data		= &vm_mapped,
+		.maxlen		= sizeof(vm_mapped),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec_minmax,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 		.extra2		= &one_hundred,
 	},
+	{
+		.ctl_name	= VM_HARDMAPLIMIT,
+		.procname	= "hardmaplimit",
+		.data		= &vm_hardmaplimit,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_HUGETLB_PAGE
 	{
 		.ctl_name	= VM_HUGETLB_PAGES,
Index: linux-2.6.8.1-ck9/mm/page_alloc.c
===================================================================
--- linux-2.6.8.1-ck9.orig/mm/page_alloc.c	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/mm/page_alloc.c	2004-10-02 11:40:04.000000000 +1000
@@ -628,6 +628,13 @@ __alloc_pages(unsigned int gfp_mask, uns
 		 */
 		if (rt_task(p))
 			min -= z->pages_low >> 1;
+		else if (vm_mapped && wait &&
+				z->free_pages < z->pages_unmapped &&
+				z->free_pages > z->pages_low) {
+			z->zone_pgdat->mapped_nrpages =
+				z->pages_unmapped - z->free_pages;
+			wakeup_kswapd(z);
+		}

 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
@@ -1905,6 +1912,7 @@ static void setup_per_zone_pages_min(voi

 		zone->pages_low = zone->pages_min * 2;
 		zone->pages_high = zone->pages_min * 3;
+		zone->pages_unmapped = zone->pages_min * 4;
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
 }
Index: linux-2.6.8.1-ck9/mm/vmscan.c
===================================================================
--- linux-2.6.8.1-ck9.orig/mm/vmscan.c	2004-10-02 11:39:51.000000000 +1000
+++ linux-2.6.8.1-ck9/mm/vmscan.c	2004-10-02 12:00:56.040907228 +1000
@@ -115,10 +115,8 @@ struct shrinker {
 #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 #endif

-/*
- * From 0 .. 100.  Higher means more swappy.
- */
-int vm_swappiness = 60;
+int vm_mapped = 66;
+int vm_hardmaplimit = 1;
 static long total_memory;

 static LIST_HEAD(shrinker_list);
@@ -338,7 +336,8 @@ static pageout_t pageout(struct page *pa
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static int shrink_list(struct list_head *page_list, struct scan_control *sc,
+	int maplimit)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -366,6 +365,8 @@ static int shrink_list(struct list_head
 			goto keep_locked;

 		sc->nr_scanned++;
+		if (maplimit && page_mapped(page))
+			goto keep_locked;
 		/* Double the slab pressure for mapped and swapcache pages */
 		if (page_mapped(page) || PageSwapCache(page))
 			sc->nr_scanned++;
@@ -543,6 +544,7 @@ static void shrink_cache(struct zone *zo
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	int max_scan = sc->nr_to_scan;
+	int maplimit = 0;

 	pagevec_init(&pvec, 1);

@@ -584,11 +586,12 @@ static void shrink_cache(struct zone *zo
 			goto done;

 		max_scan -= nr_scan;
-		if (current_is_kswapd())
+		if (current_is_kswapd()) {
 			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
-		else
+			maplimit = !!zone->zone_pgdat->mapped_nrpages;
+		} else
 			mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		nr_freed = shrink_list(&page_list, sc);
+		nr_freed = shrink_list(&page_list, sc, maplimit);
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
 		mod_page_state_zone(zone, pgsteal, nr_freed);
@@ -643,15 +646,12 @@ refill_inactive_zone(struct zone *zone,
 	int pgdeactivate = 0;
 	int pgscanned = 0;
 	int nr_pages = sc->nr_to_scan;
+	unsigned int mapped_ratio;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
 	struct page *page;
 	struct pagevec pvec;
-	int reclaim_mapped = 0;
-	long mapped_ratio;
-	long distress;
-	long swap_tendency;

 	lru_add_drain();
 	pgmoved = 0;
@@ -681,44 +681,16 @@ refill_inactive_zone(struct zone *zone,
 	zone->nr_active -= pgmoved;
 	spin_unlock_irq(&zone->lru_lock);

-	/*
-	 * `distress' is a measure of how much trouble we're having reclaiming
-	 * pages.  0 -> no problems.  100 -> great trouble.
-	 */
-	distress = 100 >> zone->prev_priority;
-
-	/*
-	 * The point of this algorithm is to decide when to start reclaiming
-	 * mapped memory instead of just pagecache.  Work out how much memory
-	 * is mapped.
-	 */
 	mapped_ratio = (sc->nr_mapped * 100) / total_memory;

-	/*
-	 * Now decide how much we really want to unmap some pages.  The mapped
-	 * ratio is downgraded - just because there's a lot of mapped memory
-	 * doesn't necessarily mean that page reclaim isn't succeeding.
-	 *
-	 * The distress ratio is important - we don't want to start going oom.
-	 *
-	 * A 100% value of vm_swappiness overrides this algorithm altogether.
-	 */
-	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
-	/*
-	 * Now use this metric to decide whether to start moving mapped memory
-	 * onto the inactive list.
-	 */
-	if (swap_tendency >= 100)
-		reclaim_mapped = 1;
-
 	while (!list_empty(&l_hold)) {
 		page = lru_to_page(&l_hold);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			if (!reclaim_mapped) {
-				list_add(&page->lru, &l_active);
-				continue;
+			if (zone->zone_pgdat->mapped_nrpages ||
+			    (vm_hardmaplimit && mapped_ratio < vm_mapped)) {
+				list_add(&page->lru, &l_active);
+				continue;
 			}
 			page_map_lock(page);
 			if (page_referenced(page)) {
@@ -981,11 +953,12 @@ out:
 * the page allocator fallback scheme to ensure that aging of pages is balanced
 * across the zones.
 */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages)
+static int
+balance_pgdat(pg_data_t *pgdat, int nr_pages, unsigned long mapped_nrpages)
 {
 	int to_free = nr_pages;
 	int priority;
-	int i;
+	int i, maplimit = 0;
 	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc;
@@ -994,6 +967,23 @@ static int balance_pgdat(pg_data_t *pgda
 	sc.may_writepage = 0;
 	sc.nr_mapped = read_page_state(nr_mapped);

+	/*
+	 * Sanity check to ensure we don't have a stale maplimit set
+	 * and are calling balance_pgdat for a different reason.
+	 */
+	if (!nr_pages && mapped_nrpages) {
+		maplimit = 1;
+		nr_pages = mapped_nrpages;
+	}
+
+	/*
+	 * kswapd does a light balance_pgdat() when there is less than 1/3
+	 * ram free provided there is less than vm_mapped % of that ram
+	 * mapped.
+	 */
+	if (maplimit && sc.nr_mapped * 100 / total_memory > vm_mapped)
+		return 0;
+
 	inc_page_state(pageoutrun);

 	for (i = 0; i < pgdat->nr_zones; i++) {
@@ -1007,6 +997,12 @@ static int balance_pgdat(pg_data_t *pgda
 		int end_zone = 0;	/* Inclusive.  0 = ZONE_DMA */
 		unsigned long lru_pages = 0;

+		/*
+		 * Only do low priority scanning if we're here due to
+		 * mapped watermark.
+		 */
+		if (maplimit && priority < DEF_PRIORITY)
+			goto out;
 		if (nr_pages == 0) {
 			/*
 			 * Scan in the highmem->dma direction for the highest
@@ -1019,10 +1015,13 @@ static int balance_pgdat(pg_data_t *pgda
 						priority != DEF_PRIORITY)
 					continue;

-				if (zone->free_pages <= zone->pages_high) {
-					end_zone = i;
-					goto scan;
+				if (zone->free_pages <= zone->pages_high ||
+					(maplimit && zone->free_pages <=
+						zone->pages_unmapped)) {
+					end_zone = i;
+					goto scan;
 				}
+
 			}
 			goto out;
 		} else {
@@ -1148,7 +1147,7 @@ static int kswapd(void *p)
 		schedule();
 		finish_wait(&pgdat->kswapd_wait, &wait);

-		balance_pgdat(pgdat, 0);
+		balance_pgdat(pgdat, 0, pgdat->mapped_nrpages);
 	}
 	return 0;
 }
@@ -1158,8 +1157,10 @@ static int kswapd(void *p)
 */
void wakeup_kswapd(struct zone *zone)
{
-	if (zone->free_pages > zone->pages_low)
+	if (zone->free_pages > zone->pages_unmapped)
 		return;
+	if (zone->free_pages <= zone->pages_low)
+		zone->zone_pgdat->mapped_nrpages = 0;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;
 	wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
@@ -1182,7 +1183,7 @@ int shrink_all_memory(int nr_pages)
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
 		int freed;
-		freed = balance_pgdat(pgdat, nr_to_free);
+		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
 		if (nr_to_free <= 0)
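
The hunks above replace the swappiness heuristic with a fourth per-zone watermark, pages_unmapped, sitting one step above pages_high. For readers following the arithmetic, here is a minimal standalone sketch of the wakeup decision added to __alloc_pages(). This is a toy userspace model under the multipliers set in setup_per_zone_pages_min(), not kernel code; struct zone_model, setup_watermarks() and light_scan_target() are invented names for illustration.

#include <stdio.h>

/* Toy model of the patched zone watermarks; the multipliers mirror
 * setup_per_zone_pages_min() above.  All names here are invented. */
struct zone_model {
	unsigned long free_pages;
	unsigned long pages_min, pages_low, pages_high, pages_unmapped;
};

static void setup_watermarks(struct zone_model *z, unsigned long pages_min)
{
	z->pages_min = pages_min;
	z->pages_low = pages_min * 2;		/* classic kswapd wakeup */
	z->pages_high = pages_min * 3;		/* kswapd sleep target */
	z->pages_unmapped = pages_min * 4;	/* new: light-scan trigger */
}

/*
 * Mirrors the __alloc_pages() hunk: a blocking (__GFP_WAIT) allocation
 * that finds free_pages inside the (pages_low, pages_unmapped) window
 * asks kswapd to reclaim just enough unmapped pagecache to climb back
 * to pages_unmapped.  Returns that target, or 0 when no light scan is
 * due (at or below pages_low the normal wakeup path takes over).
 */
static unsigned long light_scan_target(const struct zone_model *z,
				       int vm_mapped, int can_wait)
{
	if (vm_mapped && can_wait &&
	    z->free_pages < z->pages_unmapped &&
	    z->free_pages > z->pages_low)
		return z->pages_unmapped - z->free_pages;
	return 0;
}

int main(void)
{
	struct zone_model z;

	setup_watermarks(&z, 1000);	/* pages_unmapped = 4000 */
	z.free_pages = 3500;		/* above pages_high, below unmapped */

	/* Prints 500: kswapd would run a light, unmapped-only pass. */
	printf("light scan target: %lu pages\n",
	       light_scan_target(&z, 66, 1));
	return 0;
}

The two knobs land in /proc/sys/vm/: "mapped" (default 66) is the mapped-percentage ceiling below which those light kswapd passes evict only unmapped pagecache, and "hardmaplimit" (default 1) additionally keeps mapped pages on the active list in refill_inactive_zone() while the mapped ratio stays under that ceiling.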