Global counter updates

Sadly, there was still contention at 512 processors even after both of the
prior measures were taken. This is another special measure that is only
applicable to NUMA machines: for machines with very high contention, we can
update all counters whenever a single counter overflows. This reduces the
number of times the shared cacheline with the global VM counters needs to be
acquired.

Signed-off-by: Christoph Lameter

Index: linux-2.6.18-rc4/mm/vmstat.c
===================================================================
--- linux-2.6.18-rc4.orig/mm/vmstat.c	2006-08-18 18:00:25.815019031 -0700
+++ linux-2.6.18-rc4/mm/vmstat.c	2006-08-18 18:02:30.845379991 -0700
@@ -178,6 +178,44 @@ static void refresh_zone_stat_thresholds
 	}
 }
 
+#ifdef CONFIG_NUMA
+#define UPDATE_ALL(__z) (unlikely((__z)->stat_threshold > 32))
+#else
+#define UPDATE_ALL(__z) 0
+#endif
+
+/*
+ * Update all counters that have some differentials. This is called
+ * when the counter threshold becomes very large to limit the number
+ * of updates of the cacheline.
+ */
+static void update_all_counters(struct per_cpu_pageset *pcp,
+					struct zone *zone)
+{
+	int i, j;
+	unsigned long flags;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i += sizeof(long)) {
+		/*
+		 * Use a long fetch to check a group of counters
+		 * at a time. This works because the pcp structure
+		 * is cacheline aligned. Extraneous counters at the
+		 * end are always zero.
+		 */
+		if (! *(long *)(pcp->vm_stat_diff + i))
+			continue;
+
+		local_irq_save(flags);
+
+		for (j = i; j < i + sizeof(long); j++) {
+			zone_page_state_add(pcp->vm_stat_diff[j],
+						zone, j);
+			pcp->vm_stat_diff[j] = 0;
+		}
+		local_irq_restore(flags);
+	}
+}
+
 /*
  * For use when we know that interrupts are disabled.
  */
@@ -243,7 +281,12 @@ static void __inc_zone_state(struct zone
 	if (unlikely(*p > zone->stat_threshold)) {
 		int overstep = zone->stat_threshold / 2;
 
-		zone_page_state_add(*p + overstep, zone, item);
+		if (UPDATE_ALL(zone)) {
+			update_all_counters(pcp, zone);
+			zone_page_state_add(overstep, zone, item);
+		} else
+			zone_page_state_add(*p + overstep, zone, item);
+
 		*p = -overstep;
 	}
 }
@@ -262,10 +305,15 @@ void __dec_zone_page_state(struct page *
 
 	(*p)--;
 
-	if (unlikely(*p < - zone->stat_threshold)) {
+	if (unlikely(*p < -zone->stat_threshold)) {
 		int overstep = zone->stat_threshold / 2;
 
-		zone_page_state_add(*p - overstep, zone, item);
+		if (UPDATE_ALL(zone)) {
+			update_all_counters(pcp, zone);
+			zone_page_state_add(-overstep, zone, item);
+		} else
+			zone_page_state_add(*p - overstep, zone, item);
+
 		*p = overstep;
 	}
 }
@@ -308,8 +356,6 @@ EXPORT_SYMBOL(dec_zone_page_state);
 void refresh_cpu_vm_stats(int cpu)
 {
 	struct zone *zone;
-	int i,j;
-	unsigned long flags;
 
 	for_each_zone(zone) {
 		struct per_cpu_pageset *pcp;
@@ -319,26 +365,8 @@ void refresh_cpu_vm_stats(int cpu)
 
 		pcp = zone_pcp(zone, cpu);
 
-		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i += sizeof(long)) {
-			/*
-			 * Use a long fetch to check a group of counters
-			 * at a time. This works because the pcp structure
-			 * is cacheline aligned. Extraneous counters at the
-			 * end are always zero.
-			 */
-			if (! *(long *)(pcp->vm_stat_diff + i))
-				continue;
-
-			local_irq_save(flags);
-			for (j = i; j < i + sizeof(long); j++)
-				if (pcp->vm_stat_diff[j]) {
-					zone_page_state_add(pcp->vm_stat_diff[j],
-						zone, j);
-					pcp->vm_stat_diff[j] = 0;
-				}
-			local_irq_restore(flags);
-		}
-	}
+		update_all_counters(zone_pcp(zone, cpu), zone);
+	}
 }
 
 static void __refresh_cpu_vm_stats(void *dummy)
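
For reference, here is a minimal userspace sketch of the counter scheme the
patch extends (not part of the patch; the names global_counter, cpu_diff,
inc_counter, NR_ITEMS and THRESHOLD are simplified stand-ins, not the kernel's
per_cpu_pageset/zone machinery). Each CPU accumulates small differentials
locally and only folds them into the shared counters when a threshold is
exceeded; with the UPDATE_ALL behaviour above, one overflow flushes every
pending differential, so the shared cacheline is dirtied once rather than
once per counter:

/*
 * Standalone illustration only: no per-CPU data, no IRQ disabling,
 * no atomics. Names here are simplified stand-ins, not kernel symbols.
 */
#include <stdio.h>

#define NR_ITEMS	4
#define THRESHOLD	8

static long global_counter[NR_ITEMS];	/* shared, contended counters */
static signed char cpu_diff[NR_ITEMS];	/* local differentials ("pcp") */

/* Fold every non-zero differential into the shared counters at once. */
static void update_all_counters(void)
{
	int i;

	for (i = 0; i < NR_ITEMS; i++) {
		if (cpu_diff[i]) {
			global_counter[i] += cpu_diff[i];
			cpu_diff[i] = 0;
		}
	}
}

/* Increment one item; on overflow, flush everything in one go. */
static void inc_counter(int item)
{
	if (++cpu_diff[item] > THRESHOLD) {
		int overstep = THRESHOLD / 2;

		update_all_counters();		/* one visit to the shared data */
		global_counter[item] += overstep;
		cpu_diff[item] = -overstep;	/* leave headroom for increments */
	}
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		inc_counter(i % NR_ITEMS);

	update_all_counters();			/* final flush */
	for (i = 0; i < NR_ITEMS; i++)
		printf("item %d = %ld\n", i, global_counter[i]);
	return 0;
}

Running the sketch shows the global totals always match the number of
increments after the final flush, regardless of how the folding is batched.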