CPU OPS: Optimize per cpu handling throughout the kernel

Go through the kernel and use the new cpu ops. This covers:

SLUB: Avoid offset calculations and preempt disable/enable in the fast path
Network layer: General counter handling
RCU: Optimize read lock/unlock
Disks: Optimize counter handling

Currently this only works for CPU_ALLOC/allocpercpu. However, at the end of
this patchset these operations can be applied to percpu data as well. See the
notes after the genhd.h section and at the end of the diff for the assumed
semantics of the new ops.

Signed-off-by: Christoph Lameter

---
 drivers/net/loopback.c  |    8 +++-----
 include/linux/genhd.h   |    8 ++------
 include/linux/vmstat.h  |   10 ++++------
 include/net/neighbour.h |    6 +-----
 include/net/snmp.h      |   15 +++++++--------
 kernel/srcu.c           |   13 ++++---------
 mm/slub.c               |   42 ++++++++++++++++++++----------------------
 net/core/sock.c         |    2 +-
 8 files changed, 42 insertions(+), 62 deletions(-)

Index: linux-2.6/kernel/srcu.c
===================================================================
--- linux-2.6.orig/kernel/srcu.c	2007-11-17 18:07:44.753772545 -0800
+++ linux-2.6/kernel/srcu.c	2007-11-17 18:07:54.853772706 -0800
@@ -111,12 +111,9 @@ int srcu_read_lock(struct srcu_struct *s
 {
         int idx;
 
-        preempt_disable();
         idx = sp->completed & 0x1;
-        barrier();  /* ensure compiler looks -once- at sp->completed. */
-        THIS_CPU(sp->per_cpu_ref)->c[idx]++;
-        srcu_barrier();  /* ensure compiler won't misorder critical section. */
-        preempt_enable();
+        srcu_barrier();
+        _CPU_INC(sp->per_cpu_ref->c[idx]);
         return idx;
 }
 
@@ -132,10 +129,8 @@ int srcu_read_lock(struct srcu_struct *s
  */
 void srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
-        preempt_disable();
-        srcu_barrier();  /* ensure compiler won't misorder critical section. */
-        THIS_CPU(sp->per_cpu_ref)->c[idx]--;
-        preempt_enable();
+        srcu_barrier();
+        _CPU_DEC(sp->per_cpu_ref->c[idx]);
 }
 
 /**
Index: linux-2.6/drivers/net/loopback.c
===================================================================
--- linux-2.6.orig/drivers/net/loopback.c	2007-11-17 18:07:44.761773015 -0800
+++ linux-2.6/drivers/net/loopback.c	2007-11-17 18:07:54.853772706 -0800
@@ -134,7 +134,7 @@ static void emulate_large_send_offload(s
  */
 static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
 {
-        struct pcpu_lstats *pcpu_lstats, *lb_stats;
+        struct pcpu_lstats *pcpu_lstats;
 
         skb_orphan(skb);
 
@@ -154,11 +154,9 @@ static int loopback_xmit(struct sk_buff
 #endif
         dev->last_rx = jiffies;
 
-        /* it's OK to use per_cpu_ptr() because BHs are off */
         pcpu_lstats = netdev_priv(dev);
-        lb_stats = THIS_CPU(pcpu_lstats);
-        lb_stats->bytes += skb->len;
-        lb_stats->packets++;
+        __CPU_ADD(pcpu_lstats->bytes, skb->len);
+        __CPU_INC(pcpu_lstats->packets);
 
         netif_rx(skb);
Index: linux-2.6/include/linux/genhd.h
===================================================================
--- linux-2.6.orig/include/linux/genhd.h	2007-11-17 18:07:44.765772981 -0800
+++ linux-2.6/include/linux/genhd.h	2007-11-17 18:07:54.853772706 -0800
@@ -158,7 +158,7 @@ struct disk_attribute {
  */
 #ifdef CONFIG_SMP
 #define __disk_stat_add(gendiskp, field, addnd) \
-        (THIS_CPU(gendiskp->dkstats)->field += addnd)
+        __CPU_ADD(gendiskp->dkstats->field, addnd)
 
 #define disk_stat_read(gendiskp, field) \
 ({ \
@@ -187,11 +187,7 @@ static inline void disk_stat_set_all(str
 #endif
 
 #define disk_stat_add(gendiskp, field, addnd) \
-        do { \
-                preempt_disable(); \
-                __disk_stat_add(gendiskp, field, addnd); \
-                preempt_enable(); \
-        } while (0)
+        _CPU_ADD(gendiskp->dkstats->field, addnd)
 
 #define __disk_stat_dec(gendiskp, field) __disk_stat_add(gendiskp, field, -1)
 #define disk_stat_dec(gendiskp, field) disk_stat_add(gendiskp, field, -1)
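[Note: the conversions in this patch assume roughly the following semantics
for the new ops. This is only an illustrative generic fallback sketch; the
actual definitions are introduced earlier in the patchset and may be
overridden by optimized per arch versions (e.g. a single segment prefixed
instruction on x86):

        /*
         * __CPU_xxx: the caller already runs on a fixed cpu (preemption
         * or interrupts off, or softirq context), so a plain this-cpu
         * read-modify-write is sufficient.
         */
        #define __CPU_ADD(var, value)   (*THIS_CPU(&(var)) += (value))
        #define __CPU_INC(var)          __CPU_ADD((var), 1)
        #define __CPU_DEC(var)          __CPU_ADD((var), -1)

        /*
         * _CPU_xxx: usable from preemptible context. The generic fallback
         * brackets the operation with preempt_disable()/preempt_enable();
         * an architecture that can do the whole read-modify-write in one
         * instruction needs no such bracketing.
         */
        #define _CPU_ADD(var, value)                    \
        do {                                            \
                preempt_disable();                      \
                __CPU_ADD((var), (value));              \
                preempt_enable();                       \
        } while (0)
        #define _CPU_INC(var)   _CPU_ADD((var), 1)
        #define _CPU_DEC(var)   _CPU_ADD((var), -1)
]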
Index: linux-2.6/include/net/neighbour.h
===================================================================
--- linux-2.6.orig/include/net/neighbour.h	2007-11-17 18:07:44.777773095 -0800
+++ linux-2.6/include/net/neighbour.h	2007-11-17 18:07:54.853772706 -0800
@@ -81,11 +81,7 @@ struct neigh_statistics
 };
 
 #define NEIGH_CACHE_STAT_INC(tbl, field) \
-        do { \
-                preempt_disable(); \
-                (THIS_CPU((tbl)->stats)->field)++; \
-                preempt_enable(); \
-        } while (0)
+        _CPU_INC(tbl->stats->field)
 
 struct neighbour
 {
Index: linux-2.6/include/net/snmp.h
===================================================================
--- linux-2.6.orig/include/net/snmp.h	2007-11-17 18:07:44.785772810 -0800
+++ linux-2.6/include/net/snmp.h	2007-11-17 18:07:54.853772706 -0800
@@ -132,19 +132,18 @@ struct linux_mib {
 #define SNMP_STAT_BHPTR(name) (name[0])
 #define SNMP_STAT_USRPTR(name) (name[1])
 
-#define SNMP_INC_STATS_BH(mib, field) \
-        (__THIS_CPU(mib[0])->mibs[field]++)
+#define SNMP_INC_STATS_BH(mib, field) __CPU_INC(mib[0]->mibs[field])
 #define SNMP_INC_STATS_OFFSET_BH(mib, field, offset) \
-        (__THIS_CPU(mib[0])->mibs[field + (offset)]++)
+        __CPU_INC(mib[0]->mibs[field + (offset)])
 #define SNMP_INC_STATS_USER(mib, field) \
-        (__THIS_CPU(mib[1])->mibs[field]++)
+        __CPU_INC(mib[1]->mibs[field])
 #define SNMP_INC_STATS(mib, field) \
-        (__THIS_CPU(mib[!in_softirq()])->mibs[field]++)
+        __CPU_INC(mib[!in_softirq()]->mibs[field])
 #define SNMP_DEC_STATS(mib, field) \
-        (__THIS_CPU(mib[!in_softirq()])->mibs[field]--)
+        __CPU_DEC(mib[!in_softirq()]->mibs[field])
 #define SNMP_ADD_STATS_BH(mib, field, addend) \
-        (__THIS_CPU(mib[0])->mibs[field] += addend)
+        __CPU_ADD(mib[0]->mibs[field], addend)
 #define SNMP_ADD_STATS_USER(mib, field, addend) \
-        (__THIS_CPU(mib[1])->mibs[field] += addend)
+        __CPU_ADD(mib[1]->mibs[field], addend)
 #endif
Index: linux-2.6/net/core/sock.c
===================================================================
--- linux-2.6.orig/net/core/sock.c	2007-11-17 18:07:44.793773145 -0800
+++ linux-2.6/net/core/sock.c	2007-11-17 18:07:54.853772706 -0800
@@ -1809,7 +1809,7 @@ static LIST_HEAD(proto_list);
  */
 static void inuse_add(struct proto *prot, int inc)
 {
-        THIS_CPU(prot->inuse_ptr)[0] += inc;
+        __CPU_ADD(prot->inuse_ptr[0], inc);
 }
 
 static int inuse_get(const struct proto *prot)
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c	2007-11-17 18:07:44.801773353 -0800
+++ linux-2.6/mm/slub.c	2007-11-17 18:07:54.857772593 -0800
@@ -1478,10 +1478,10 @@ static void flush_all(struct kmem_cache
  * Check if the objects in a per cpu structure fit numa
  * locality expectations.
  */
-static inline int node_match(struct kmem_cache_cpu *c, int node)
+static inline int node_match(int cnode, int node)
 {
 #ifdef CONFIG_NUMA
-        if (node != -1 && c->node != node)
+        if (node != -1 && cnode != node)
                 return 0;
 #endif
         return 1;
@@ -1530,20 +1530,22 @@ static noinline unsigned long get_new_sl
  * we need to allocate a new slab. This is slowest path since we may sleep.
  */
 static void *__slab_alloc(struct kmem_cache *s,
-                gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
+                gfp_t gfpflags, int node, void *addr)
 {
         void **object;
         unsigned long state;
+        struct kmem_cache_cpu *c;
 #ifdef CONFIG_FAST_CMPXCHG_LOCAL
         unsigned long flags;
 
         local_irq_save(flags);
         preempt_enable_no_resched();
 #endif
+        c = THIS_CPU(s->cpu_slab);
         if (likely(c->page)) {
                 state = slab_lock(c->page);
 
-                if (unlikely(node_match(c, node) &&
+                if (unlikely(node_match(c->node, node) &&
                         c->page->freelist != c->page->end))
                         goto load_freelist;
 
@@ -1617,29 +1619,26 @@ static void __always_inline *slab_alloc(
         struct kmem_cache_cpu *c;
 
 #ifdef CONFIG_FAST_CMPXCHG_LOCAL
-        preempt_disable();
-        c = THIS_CPU(s->cpu_slab);
+        c = s->cpu_slab;
         do {
-                object = c->freelist;
-                if (unlikely(is_end(object) || !node_match(c, node))) {
-                        object = __slab_alloc(s, gfpflags, node, addr, c);
-                        if (unlikely(!object)) {
-                                preempt_enable();
+                object = _CPU_READ(c->freelist);
+                if (unlikely(is_end(object) ||
+                                !node_match(_CPU_READ(c->node), node))) {
+                        object = __slab_alloc(s, gfpflags, node, addr);
+                        if (unlikely(!object))
                                 goto out;
-                        }
                         break;
                 }
-        } while (cmpxchg_local(&c->freelist, object,
+        } while (_CPU_CMPXCHG(c->freelist, object,
                         get_freepointer(s, object)) != object);
-        preempt_enable();
 #else
         unsigned long flags;
 
         local_irq_save(flags);
         c = THIS_CPU(s->cpu_slab);
-        if (unlikely((is_end(c->freelist)) || !node_match(c, node))) {
+        if (unlikely((is_end(c->freelist)) || !node_match(c->node, node))) {
 
-                object = __slab_alloc(s, gfpflags, node, addr, c);
+                object = __slab_alloc(s, gfpflags, node, addr);
                 if (unlikely(!object)) {
                         local_irq_restore(flags);
                         goto out;
@@ -1762,11 +1761,10 @@ static void __always_inline slab_free(st
 #ifdef CONFIG_FAST_CMPXCHG_LOCAL
         void **freelist;
 
-        preempt_disable();
-        c = THIS_CPU(s->cpu_slab);
+        c = s->cpu_slab;
         debug_check_no_locks_freed(object, s->objsize);
         do {
-                freelist = c->freelist;
+                freelist = _CPU_READ(c->freelist);
                 barrier();
                 /*
                  * If the compiler would reorder the retrieval of c->page to
@@ -1779,13 +1777,13 @@ static void __always_inline slab_free(st
                  * then any change of cpu_slab will cause the cmpxchg to fail
                  * since the freelist pointers are unique per slab.
                  */
-                if (unlikely(page != c->page || c->node < 0)) {
+                if (unlikely(page != _CPU_READ(c->page) ||
+                                _CPU_READ(c->node) < 0)) {
                         __slab_free(s, page, x, addr);
                         break;
                 }
                 set_freepointer(s, object, freelist);
-        } while (cmpxchg_local(&c->freelist, freelist, object) != freelist);
-        preempt_enable();
+        } while (_CPU_CMPXCHG(c->freelist, freelist, object) != freelist);
 #else
         unsigned long flags;
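[Note: as a before/after illustration of the pattern applied throughout this
patch (hypothetical example, not taken from the diff): a per cpu counter in a
CPU_ALLOC'ed structure no longer needs explicit preemption handling around
its read-modify-write at each call site.

        struct my_stats {
                unsigned long events;
        };

        static struct my_stats *my_stats;      /* from CPU_ALLOC() */

        static void count_event_old(void)
        {
                /*
                 * Old style: pin the cpu so that the per cpu pointer
                 * calculation and the increment happen on the same
                 * processor.
                 */
                preempt_disable();
                THIS_CPU(my_stats)->events++;
                preempt_enable();
        }

        static void count_event_new(void)
        {
                /*
                 * New style: the cpu op performs the this-cpu increment
                 * as one unit, so the preempt_disable()/preempt_enable()
                 * pair disappears from the fast path; an architecture may
                 * implement the whole operation as a single instruction.
                 */
                _CPU_INC(my_stats->events);
        }
]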