Having separate hot and cold pcp lists means that freshly freed cold pages can go to waste: when a hot page is needed but none is available, the cold pages sitting on the other list cannot be used. Merge the two lists into a single one, with hot pages at the head and cold pages at the tail, and track the cold pages with a new cold_count field. Hot allocations take from the head and fall back on cold pages once the hot ones are exhausted; cold allocations take from the tail and are disallowed from taking hot pages: if no cold pages are left and the list is not close to full, a cold request refills the list from the buddy allocator instead.
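For reference, the list discipline this introduces can be pictured with a small standalone program. This is an illustrative userspace sketch, not the patched kernel code: the node type, the *_sim helpers and the NULL-on-empty shortcut are invented for the example (on that path the kernel refills the list via rmqueue_bulk() instead).

/*
 * Toy model of the merged pcp list: hot pages live at the head, cold
 * pages at the tail, cold_count tracks how many cold pages remain.
 * Illustration only; not kernel code.
 */
#include <stdio.h>

struct node {
	struct node *prev, *next;
	int id;
};

static struct node list = { &list, &list, -1 };	/* circular dummy head */
static int count, cold_count;

static void list_add(struct node *new, struct node *at)
{
	new->next = at->next;
	new->prev = at;
	at->next->prev = new;
	at->next = new;
}

/* Mirrors free_hot_cold_page(): hot at the head, cold at the tail. */
static void free_page_sim(struct node *n, int cold)
{
	list_add(n, cold ? list.prev : &list);
	count++;
	if (cold)
		cold_count++;
}

/* Mirrors the order-0 path of buffered_rmqueue(). */
static struct node *alloc_page_sim(int cold)
{
	struct node *n;

	if (!count)
		return NULL;	/* the kernel refills via rmqueue_bulk() here */
	n = cold ? list.prev : list.next;
	n->prev->next = n->next;
	n->next->prev = n->prev;
	count--;
	if (cold && cold_count)
		cold_count--;
	if (cold_count > count)	/* a hot alloc may have taken a cold page */
		cold_count = count;
	return n;
}

int main(void)
{
	struct node n[4] = { {0, 0, 0}, {0, 0, 1}, {0, 0, 2}, {0, 0, 3} };

	free_page_sim(&n[0], 0);	/* hot */
	free_page_sim(&n[1], 0);	/* hot */
	free_page_sim(&n[2], 1);	/* cold */
	free_page_sim(&n[3], 1);	/* cold */

	printf("hot  alloc -> page %d\n", alloc_page_sim(0)->id);	/* 1 */
	printf("cold alloc -> page %d\n", alloc_page_sim(1)->id);	/* 3 */
	printf("count=%d cold_count=%d\n", count, cold_count);		/* 2 1 */
	return 0;
}

Keeping cold pages at the tail means a hot allocation, which always takes from the head, only reaches them once every hot page is gone, while a cold allocation takes from the tail without disturbing the cache-warm pages at the head.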
Index: linux-2.6/include/linux/mmzone.h
===================================================================
--- linux-2.6.orig/include/linux/mmzone.h
+++ linux-2.6/include/linux/mmzone.h
@@ -43,15 +43,13 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
-struct per_cpu_pages {
+struct per_cpu_pageset {
+	struct list_head list;	/* the list of pages */
 	int count;		/* number of pages in the list */
+	int cold_count;		/* number of cold pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
-	struct list_head list;	/* the list of pages */
-};
 
-struct per_cpu_pageset {
-	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
 #ifdef CONFIG_NUMA
 	unsigned long numa_hit;		/* allocated in intended node */
 	unsigned long numa_miss;	/* allocated in non intended node */
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -508,10 +508,8 @@ static int rmqueue_bulk(struct zone *zon
 void drain_remote_pages(void)
 {
 	struct zone *zone;
-	int i;
 	unsigned long flags;
 
-	local_irq_save(flags);
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
@@ -519,17 +517,14 @@ void drain_remote_pages(void)
 		if (zone->zone_pgdat->node_id == numa_node_id())
 			continue;
 
+		local_irq_save(flags);
 		pset = zone->pageset[smp_processor_id()];
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			if (pcp->count)
-				pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
-		}
+		if (pset->count)
+			pset->count -= free_pages_bulk(zone, pset->count,
+						&pset->list, 0);
+		pset->cold_count = min(pset->cold_count, pset->count);
+		local_irq_restore(flags);
 	}
-	local_irq_restore(flags);
 }
 #endif
 
@@ -538,21 +533,16 @@ static void __drain_pages(unsigned int c
 {
 	unsigned long flags;
 	struct zone *zone;
-	int i;
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pset;
 
 		pset = zone_pcp(zone, cpu);
-		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
-			struct per_cpu_pages *pcp;
-
-			pcp = &pset->pcp[i];
-			local_irq_save(flags);
-			pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
-			local_irq_restore(flags);
-		}
+		local_irq_save(flags);
+		pset->count -= free_pages_bulk(zone, pset->count,
+					&pset->list, 0);
+		pset->cold_count = min(pset->cold_count, pset->count);
+		local_irq_restore(flags);
 	}
 }
 #endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
@@ -630,7 +620,8 @@ static void FASTCALL(free_hot_cold_page(
 static void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
-	struct per_cpu_pages *pcp;
+	struct per_cpu_pageset *pset;
+	struct list_head *entry;
 	unsigned long flags;
 
 	arch_free_page(page, 0);
@@ -639,13 +630,21 @@ static void fastcall free_hot_cold_page(
 	if (PageAnon(page))
 		page->mapping = NULL;
 	free_pages_check(page);
-	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+	pset = zone_pcp(zone, get_cpu());
 	local_irq_save(flags);
 	page_state(pgfree)++;
-	list_add(&page->lru, &pcp->list);
-	pcp->count++;
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	pset->count++;
+	entry = &pset->list;
+	if (cold) {
+		pset->cold_count++;
+		entry = entry->prev;	/* tail */
+	}
+	list_add(&page->lru, entry);
+	if (pset->count > pset->high) {
+		pset->count -= free_pages_bulk(zone, pset->batch,
+						&pset->list, 0);
+		pset->cold_count = min(pset->cold_count, pset->count);
+	}
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -683,19 +682,31 @@ buffered_rmqueue(struct zone *zone, int
 	int cpu = get_cpu();
 
 	if (order == 0) {
-		struct per_cpu_pages *pcp;
+		struct per_cpu_pageset *pset;
+		struct list_head *entry;
 
-		pcp = &zone_pcp(zone, cpu)->pcp[cold];
+		pset = zone_pcp(zone, cpu);
 		local_irq_save(flags);
-		if (!pcp->count) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, &pcp->list);
-			if (unlikely(!pcp->count))
+		if (!pset->count || (cold && !pset->cold_count &&
+			pset->count <= pset->high - (pset->high>>2))) {
+			int count;
+			count = rmqueue_bulk(zone, 0, pset->batch, &pset->list);
+			if (unlikely(!count))
 				goto failed;
+			pset->count += count;
+			pset->cold_count += count;
 		}
-		page = list_entry(pcp->list.next, struct page, lru);
+
+		pset->count--;
+		entry = pset->list.next;
+		if (cold) {
+			if (pset->cold_count)
+				pset->cold_count--;
+			entry = pset->list.prev;
+		}
+		pset->cold_count = min(pset->cold_count, pset->count);
+		page = list_entry(entry, struct page, lru);
 		list_del(&page->lru);
-		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
@@ -1293,7 +1304,7 @@ void si_meminfo_node(struct sysinfo *val
 void show_free_areas(void)
 {
 	struct page_state ps;
-	int cpu, temperature;
+	int cpu;
 	unsigned long active;
 	unsigned long inactive;
 	unsigned long free;
@@ -1310,20 +1321,15 @@ void show_free_areas(void)
 			printk("\n");
 
 		for (cpu = 0; cpu < NR_CPUS; ++cpu) {
-			struct per_cpu_pageset *pageset;
+			struct per_cpu_pageset *pset;
 
 			if (!cpu_possible(cpu))
 				continue;
 
-			pageset = zone_pcp(zone, cpu);
+			pset = zone_pcp(zone, cpu);
 
-			for (temperature = 0; temperature < 2; temperature++)
-				printk("cpu %d %s: high %d, batch %d used:%d\n",
-					cpu,
-					temperature ? "cold" : "hot",
"cold" : "hot", - pageset->pcp[temperature].high, - pageset->pcp[temperature].batch, - pageset->pcp[temperature].count); + printk("cpu %d: high %d, batch %d used:%d\n", + cpu, pset->high, pset->batch, pset->count); } } @@ -1751,19 +1757,11 @@ static int __devinit zone_batchsize(stru inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { - struct per_cpu_pages *pcp; - - pcp = &p->pcp[0]; /* hot */ - pcp->count = 0; - pcp->high = 4 * batch; - pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); - - pcp = &p->pcp[1]; /* cold*/ - pcp->count = 0; - pcp->high = 2 * batch; - pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + INIT_LIST_HEAD(&p->list); + p->count = 0; + p->cold_count = 0; + p->high = 6 * batch; + p->batch = max(1UL, 1 * batch); } #ifdef CONFIG_NUMA @@ -2123,27 +2121,15 @@ static int zoneinfo_show(struct seq_file ")" "\n pagesets"); for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { - struct per_cpu_pageset *pageset; - int j; + struct per_cpu_pageset *pset; - pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - if (pageset->pcp[j].count) - break; - } - if (j == ARRAY_SIZE(pageset->pcp)) - continue; - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } + pset = zone_pcp(zone, i); + seq_printf(m, + "\n cpu: %i" + "\n count: %i" + "\n high: %i" + "\n batch: %i", + i, pset->count, pset->high, pset->batch); #ifdef CONFIG_NUMA seq_printf(m, "\n numa_hit: %lu" @@ -2152,12 +2138,12 @@ static int zoneinfo_show(struct seq_file "\n interleave_hit: %lu" "\n local_node: %lu" "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); + pset->numa_hit, + pset->numa_miss, + pset->numa_foreign, + pset->interleave_hit, + pset->local_node, + pset->other_node); #endif } seq_printf(m,