GIT 7516ee624d01e35a28b9278306ff172c3699d6dd rsync://oss.sgi.com/git/xfs-2.6.git

commit 7516ee624d01e35a28b9278306ff172c3699d6dd
Author: Nathan Scott
Date:   Mon Feb 6 15:04:57 2006 +1100

    [XFS] Fix a mutex_destroy diagnostic about a locked-mutex-on-destroy
    from quota code.

    SGI-PV: 949149
    SGI-Modid: xfs-linux-melb:xfs-kern:25123a

    Signed-off-by: Nathan Scott

commit ea3123c8c891a9b47bb6132b28a30b7562917308
Author: Nathan Scott
Date:   Mon Feb 6 15:03:51 2006 +1100

    [XFS] Cleanup the use of zones/slabs, more consistent and allows flags
    to be passed.

    SGI-PV: 949073
    SGI-Modid: xfs-linux-melb:xfs-kern:25122a

    Signed-off-by: Nathan Scott

commit c555402e86b258ef8f8fefc3b0ebfa2e3d743f05
Author: David Chinner
Date:   Mon Feb 6 14:28:45 2006 +1100

    [XFS] On machines with more than 8 cpus, when running parallel I/O
    threads, the incore superblock lock becomes the limiting factor for
    buffered write throughput. Make the contended fields in the incore
    superblock use per-cpu counters so that there is no global lock to
    limit scalability.

    SGI-PV: 946630
    SGI-Modid: xfs-linux-melb:xfs-kern:25106a

    Signed-off-by: David Chinner
    Signed-off-by: Nathan Scott

commit 8b94de4ce671f3b01d253e4a09833b15ca67db28
Author: Christoph Hellwig
Date:   Mon Feb 6 14:23:25 2006 +1100

    [XFS] XFS propagates MS_NOATIME through two levels internally but
    doesn't actually use it. Kill this dead code.

    SGI-PV: 904196
    SGI-Modid: xfs-linux-melb:xfs-kern:25086a

    Signed-off-by: Christoph Hellwig
    Signed-off-by: Nathan Scott

---
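The per-cpu counter scheme in David Chinner's patch below is easier to digest in miniature. The following is a minimal standalone sketch of the idea, not XFS code: each CPU updates a private, padded slot under its own lock, so the hot path never contends, while an accurate read has to visit every slot. The pthread-based names are illustrative assumptions.

/* Sketch only: a distributed counter in the style of the patch below. */
#include <pthread.h>
#include <stdint.h>

#define NCPUS 8

struct counter_slot {
	pthread_mutex_t	lock;		/* stands in for icsb_lock */
	int64_t		value;		/* this CPU's share of the total */
	char		pad[64];	/* keep slots on separate cache lines */
};

static struct counter_slot slot[NCPUS];

void counter_init(void)
{
	for (int i = 0; i < NCPUS; i++) {
		pthread_mutex_init(&slot[i].lock, NULL);
		slot[i].value = 0;
	}
}

/* Fast path: touch only the local slot; CPUs never contend here. */
void counter_add(int cpu, int64_t delta)
{
	pthread_mutex_lock(&slot[cpu].lock);
	slot[cpu].value += delta;
	pthread_mutex_unlock(&slot[cpu].lock);
}

/* Slow path: an accurate total must take every slot lock, in order. */
int64_t counter_sum(void)
{
	int64_t total = 0;

	for (int i = 0; i < NCPUS; i++)
		pthread_mutex_lock(&slot[i].lock);
	for (int i = 0; i < NCPUS; i++)
		total += slot[i].value;
	for (int i = 0; i < NCPUS; i++)
		pthread_mutex_unlock(&slot[i].lock);
	return total;
}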
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index c64a29c..b266e6f 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -23,15 +23,8 @@
 #include

 /*
- * memory management routines
+ * Process flags handling
  */
-#define KM_SLEEP	0x0001u
-#define KM_NOSLEEP	0x0002u
-#define KM_NOFS		0x0004u
-#define KM_MAYFAIL	0x0008u
-
-#define kmem_zone	kmem_cache
-#define kmem_zone_t	struct kmem_cache

 typedef unsigned long xfs_pflags_t;

@@ -67,74 +60,102 @@ typedef unsigned long xfs_pflags_t;
 	*(NSTATEP) = *(OSTATEP);	\
 } while (0)

-static __inline gfp_t kmem_flags_convert(unsigned int __nocast flags)
+/*
+ * General memory allocation interfaces
+ */
+
+#define KM_SLEEP	0x0001u
+#define KM_NOSLEEP	0x0002u
+#define KM_NOFS		0x0004u
+#define KM_MAYFAIL	0x0008u
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions.  We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(unsigned int __nocast flags)
 {
-	gfp_t lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
+	gfp_t lflags;

-#ifdef DEBUG
-	if (unlikely(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL))) {
-		printk(KERN_WARNING
-		    "XFS: memory allocation with wrong flags (%x)\n", flags);
-		BUG();
-	}
-#endif
+	BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));

 	if (flags & KM_NOSLEEP) {
-		lflags |= GFP_ATOMIC;
+		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
-		lflags |= GFP_KERNEL;
-
-		/* avoid recusive callbacks to filesystem during transactions */
+		lflags = GFP_KERNEL | __GFP_NOWARN;
 		if (PFLAGS_TEST_FSTRANS() || (flags & KM_NOFS))
 			lflags &= ~__GFP_FS;
 	}
-
-	return lflags;
+	return lflags;
 }

-static __inline kmem_zone_t *
+extern void *kmem_alloc(size_t, unsigned int __nocast);
+extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_zalloc(size_t, unsigned int __nocast);
+extern void  kmem_free(void *, size_t);
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD	0
+
+#define kmem_zone	kmem_cache
+#define kmem_zone_t	struct kmem_cache
+
+static inline kmem_zone_t *
 kmem_zone_init(int size, char *zone_name)
 {
 	return kmem_cache_create(zone_name, size, 0, 0, NULL, NULL);
 }

-static __inline void
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+		     void (*construct)(void *, kmem_zone_t *, unsigned long))
+{
+	return kmem_cache_create(zone_name, size, 0, flags, construct, NULL);
+}
+
+static inline void
 kmem_zone_free(kmem_zone_t *zone, void *ptr)
 {
 	kmem_cache_free(zone, ptr);
 }

-static __inline void
+static inline void
 kmem_zone_destroy(kmem_zone_t *zone)
 {
 	if (zone && kmem_cache_destroy(zone))
 		BUG();
 }

-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
 extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);

-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(void *, size_t, size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void  kmem_free(void *, size_t);
+/*
+ * Low memory cache shrinkers
+ */

 typedef struct shrinker *kmem_shaker_t;
 typedef int (*kmem_shake_func_t)(int, gfp_t);

-static __inline kmem_shaker_t
+static inline kmem_shaker_t
 kmem_shake_register(kmem_shake_func_t sfunc)
 {
 	return set_shrinker(DEFAULT_SEEKS, sfunc);
 }

-static __inline void
+static inline void
 kmem_shake_deregister(kmem_shaker_t shrinker)
 {
 	remove_shrinker(shrinker);
 }

-static __inline int
+static inline int
 kmem_shake_allow(gfp_t gfp_mask)
 {
 	return (gfp_mask & __GFP_WAIT);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bfb4f29..cdb905a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1805,13 +1805,12 @@ xfs_flush_buftarg(
 int __init
 xfs_buf_init(void)
 {
-	int		error = -ENOMEM;
-
 #ifdef XFS_BUF_TRACE
 	xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_SLEEP);
 #endif

-	xfs_buf_zone = kmem_zone_init(sizeof(xfs_buf_t), "xfs_buf");
+	xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+						KM_ZONE_HWALIGN, NULL);
 	if (!xfs_buf_zone)
 		goto out_free_trace_buf;

@@ -1839,7 +1838,7 @@ xfs_buf_init(void)
 #ifdef XFS_BUF_TRACE
 	ktrace_free(xfs_buf_trace_buf);
 #endif
-	return error;
+	return -ENOMEM;
 }

 void
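For context on the flag split above: KM_SLEEP maps to GFP_KERNEL, KM_NOSLEEP to GFP_ATOMIC, and KM_NOFS (or an active transaction, detected via PFLAGS_TEST_FSTRANS()) strips __GFP_FS so the allocator cannot recurse into the filesystem. A hedged illustration of a caller follows; the foo_* helpers and their sizes are invented, while the kmem_* interfaces are the real ones declared above.

/* Illustration only: typical use of the KM_* flags declared above. */
STATIC void *
foo_alloc_in_transaction(size_t size)	/* hypothetical helper */
{
	/*
	 * Inside a transaction, reclaim must not re-enter XFS, so pass
	 * KM_NOFS (kmem_flags_convert() clears __GFP_FS for it).
	 */
	return kmem_zalloc(size, KM_SLEEP | KM_NOFS);
}

STATIC void *
foo_alloc_atomic(size_t size)		/* hypothetical helper */
{
	/* KM_NOSLEEP becomes GFP_ATOMIC and can return NULL */
	return kmem_alloc(size, KM_NOSLEEP);
}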
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 67389b7..377a9f5 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -100,6 +100,11 @@
  */
 #undef  HAVE_REFCACHE	/* reference cache not needed for NFS in 2.6 */
 #define HAVE_SENDFILE	/* sendfile(2) exists in 2.6, but not in 2.4 */
+#ifdef CONFIG_SMP
+#define HAVE_PERCPU_SB	/* per cpu superblock counters are a 2.6 feature */
+#else
+#undef  HAVE_PERCPU_SB	/* per cpu superblock counters are a 2.6 feature */
+#endif

 /*
  * State flag for unwritten extent buffers.
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f22e426..0c7ed4b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -76,8 +76,6 @@ xfs_args_allocate(
 	strncpy(args->fsname, sb->s_id, MAXNAMELEN);

 	/* Copy the already-parsed mount(2) flags we're interested in */
-	if (sb->s_flags & MS_NOATIME)
-		args->flags |= XFSMNT_NOATIME;
 	if (sb->s_flags & MS_DIRSYNC)
 		args->flags |= XFSMNT_DIRSYNC;
 	if (sb->s_flags & MS_SYNCHRONOUS)
@@ -339,8 +337,8 @@ linvfs_alloc_inode(
 {
 	vnode_t			*vp;

-	vp = kmem_cache_alloc(xfs_vnode_zone, kmem_flags_convert(KM_SLEEP));
-	if (!vp)
+	vp = kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
+	if (unlikely(!vp))
 		return NULL;
 	return LINVFS_GET_IP(vp);
 }
@@ -354,23 +352,21 @@ linvfs_destroy_inode(

 STATIC void
 linvfs_inode_init_once(
-	void			*data,
-	kmem_cache_t		*cachep,
+	void			*vnode,
+	kmem_zone_t		*zonep,
 	unsigned long		flags)
 {
-	vnode_t			*vp = (vnode_t *)data;
-
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
-		SLAB_CTOR_CONSTRUCTOR)
-		inode_init_once(LINVFS_GET_IP(vp));
+		      SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(LINVFS_GET_IP((vnode_t *)vnode));
 }

 STATIC int
-linvfs_init_zones(void)
+xfs_init_zones(void)
 {
-	xfs_vnode_zone = kmem_cache_create("xfs_vnode",
-				sizeof(vnode_t), 0, SLAB_RECLAIM_ACCOUNT,
-				linvfs_inode_init_once, NULL);
+	xfs_vnode_zone = kmem_zone_init_flags(sizeof(vnode_t), "xfs_vnode_t",
+					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM,
+					linvfs_inode_init_once);
 	if (!xfs_vnode_zone)
 		goto out;

@@ -379,14 +375,12 @@ linvfs_init_zones(void)
 		goto out_destroy_vnode_zone;

 	xfs_ioend_pool = mempool_create(4 * MAX_BUF_PER_PAGE,
-			mempool_alloc_slab, mempool_free_slab,
-			xfs_ioend_zone);
+				  mempool_alloc_slab, mempool_free_slab,
+				  xfs_ioend_zone);
 	if (!xfs_ioend_pool)
 		goto out_free_ioend_zone;
-
 	return 0;
-
  out_free_ioend_zone:
 	kmem_zone_destroy(xfs_ioend_zone);
  out_destroy_vnode_zone:
@@ -396,7 +390,7 @@
 }

 STATIC void
-linvfs_destroy_zones(void)
+xfs_destroy_zones(void)
 {
 	mempool_destroy(xfs_ioend_pool);
 	kmem_zone_destroy(xfs_vnode_zone);
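The new kmem_zone_init_flags() wrapper takes an optional constructor, as the xfs_vnode zone above demonstrates with linvfs_inode_init_once. A hedged sketch of registering a hypothetical zone the same way (the "foo" names are invented; the interfaces are the real ones from kmem.h):

/* Illustration only: a zone with a one-time object constructor. */
struct foo {
	int	f_state;
};

STATIC void
foo_init_once(void *obj, kmem_zone_t *zonep, unsigned long flags)
{
	/* the ctor also runs for SLAB_CTOR_VERIFY; act only on construct */
	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
		      SLAB_CTOR_CONSTRUCTOR)
		memset(obj, 0, sizeof(struct foo));
}

static kmem_zone_t *foo_zone;

STATIC int
foo_zone_setup(void)
{
	foo_zone = kmem_zone_init_flags(sizeof(struct foo), "foo",
					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM,
					foo_init_once);
	return foo_zone ? 0 : ENOMEM;
}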
@@ -407,7 +401,7 @@
  * Attempt to flush the inode, this will actually fail
  * if the inode is pinned, but we dirty the inode again
  * at the point when it is unpinned after a log write,
- * since this is when the inode itself becomes flushable. 
+ * since this is when the inode itself becomes flushable.
  */
 STATIC int
 linvfs_write_inode(
@@ -965,7 +959,7 @@ init_xfs_fs( void )

 	ktrace_init(64);

-	error = linvfs_init_zones();
+	error = xfs_init_zones();
 	if (error < 0)
 		goto undo_zones;

@@ -988,7 +982,7 @@ undo_register:
 	xfs_buf_terminate();

 undo_buffers:
-	linvfs_destroy_zones();
+	xfs_destroy_zones();

 undo_zones:
 	return error;
@@ -1002,7 +996,7 @@ exit_xfs_fs( void )
 	unregister_filesystem(&xfs_fs_type);
 	xfs_cleanup();
 	xfs_buf_terminate();
-	linvfs_destroy_zones();
+	xfs_destroy_zones();
 	ktrace_uninit();
 }

diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 53a00fb..22d41f8 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -2789,9 +2789,7 @@ xfs_qm_freelist_destroy(xfs_frlist_t *ql
 		xfs_qm_dqdestroy(dqp);
 		dqp = nextdqp;
 	}
-	/*
-	 * Don't bother about unlocking.
-	 */
+	mutex_unlock(&ql->qh_lock);
 	mutex_destroy(&ql->qh_lock);
 	ASSERT(ql->qh_nelems == 0);
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index f57cc9a..022fff6 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -68,8 +68,6 @@ struct xfs_mount_args {
 						 * enforcement */
 #define XFSMNT_PQUOTAENF	0x00000040	/* IRIX project quota limit
 						 * enforcement */
-#define XFSMNT_NOATIME		0x00000100	/* don't modify access
-						 * times on reads */
 #define XFSMNT_NOALIGN		0x00000200	/* don't allocate at
 						 * stripe boundaries*/
 #define XFSMNT_RETERR		0x00000400	/* return error to user */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b4d971b..56caa88 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -462,6 +462,7 @@ xfs_fs_counts(
 {
 	unsigned long	s;

+	xfs_icsb_sync_counters_lazy(mp);
 	s = XFS_SB_LOCK(mp);
 	cnt->freedata = mp->m_sb.sb_fdblocks;
 	cnt->freertx = mp->m_sb.sb_frextents;
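The xfs_qm.c hunk above is the whole of the first fix: with mutex debugging enabled, destroying a mutex that is still held triggers a diagnostic, so the freelist teardown now unlocks before destroying. The general pattern, as a hedged sketch with invented "foo" names:

/* Illustration only: never destroy a mutex you still hold. */
struct foo_list {
	struct mutex	fl_lock;
	int		fl_nelems;
};

void
foo_list_destroy(struct foo_list *fl)
{
	mutex_lock(&fl->fl_lock);
	/* ... free the entries while the list is locked ... */
	fl->fl_nelems = 0;
	mutex_unlock(&fl->fl_lock);	/* required before destroy */
	mutex_destroy(&fl->fl_lock);
}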
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 62188ea..d56bbdb 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -51,11 +51,31 @@ STATIC int	xfs_uuid_mount(xfs_mount_t *)
 STATIC void	xfs_uuid_unmount(xfs_mount_t *mp);
 STATIC void	xfs_unmountfs_wait(xfs_mount_t *);

+
+#ifdef HAVE_PERCPU_SB
+STATIC void	xfs_icsb_destroy_counters(xfs_mount_t *);
+STATIC void	xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, int);
+STATIC void	xfs_icsb_sync_counters(xfs_mount_t *);
+STATIC int	xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
+			int, int);
+STATIC int	xfs_icsb_modify_counters_locked(xfs_mount_t *, xfs_sb_field_t,
+			int, int);
+
+#else
+
+#define xfs_icsb_destroy_counters(mp)			do { } while (0)
+#define xfs_icsb_balance_counter(mp, a, b)		do { } while (0)
+#define xfs_icsb_sync_counters(mp)			do { } while (0)
+#define xfs_icsb_modify_counters(mp, a, b, c)		do { } while (0)
+#define xfs_icsb_modify_counters_locked(mp, a, b, c)	do { } while (0)
+
+#endif
+
 static const struct {
-	short offset;
-	short type;	/* 0 = integer
-			 * 1 = binary / string (no translation)
-			 */
+	short	offset;
+	short	type;	/* 0 = integer
+			 * 1 = binary / string (no translation)
+			 */
 } xfs_sb_info[] = {
     { offsetof(xfs_sb_t, sb_magicnum),	0 },
     { offsetof(xfs_sb_t, sb_blocksize),	0 },
@@ -113,7 +133,11 @@ xfs_mount_init(void)
 {
 	xfs_mount_t *mp;

-	mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
+	mp = kmem_zalloc(sizeof(xfs_mount_t), KM_SLEEP);
+
+	if (xfs_icsb_init_counters(mp)) {
+		mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
+	}

 	AIL_LOCKINIT(&mp->m_ail_lock, "xfs_ail");
 	spinlock_init(&mp->m_sb_lock, "xfs_sb");
@@ -136,8 +160,8 @@ xfs_mount_init(void)
  */
 void
 xfs_mount_free(
-	xfs_mount_t *mp,
-	int	     remove_bhv)
+	xfs_mount_t	*mp,
+	int		remove_bhv)
 {
 	if (mp->m_ihash)
 		xfs_ihash_free(mp);
@@ -177,6 +201,7 @@ xfs_mount_free(
 		VFS_REMOVEBHV(vfsp, &mp->m_bhv);
 	}

+	xfs_icsb_destroy_counters(mp);
 	kmem_free(mp, sizeof(xfs_mount_t));
 }

@@ -527,6 +552,10 @@ xfs_readsb(xfs_mount_t *mp)
 		ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 	}

+	xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0);
+	xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0);
+	xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0);
+
 	mp->m_sb_bp = bp;
 	xfs_buf_relse(bp);
 	ASSERT(XFS_BUF_VALUSEMA(bp) > 0);
@@ -1154,6 +1183,9 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
 	sbp = xfs_getsb(mp, 0);
 	if (!(XFS_MTOVFS(mp)->vfs_flag & VFS_RDONLY ||
 		XFS_FORCED_SHUTDOWN(mp))) {
+
+		xfs_icsb_sync_counters(mp);
+
 		/*
 		 * mark shared-readonly if desired
 		 */
@@ -1227,7 +1259,6 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fi
 	xfs_trans_log_buf(tp, bp, first, last);
 }

-
 /*
  * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply
  * a delta to a specified field in the in-core superblock.  Simply
@@ -1237,7 +1268,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fi
  *
  * The SB_LOCK must be held when this routine is called.
  */
-STATIC int
+int
 xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
 			int delta, int rsvd)
 {
@@ -1405,13 +1436,30 @@ xfs_mod_incore_sb(xfs_mount_t *mp, xfs_s
 {
 	unsigned long	s;
 	int	status;
+
+	/* check for per-cpu counters */
+	switch (field) {
+#ifdef HAVE_PERCPU_SB
+	case XFS_SBS_ICOUNT:
+	case XFS_SBS_IFREE:
+	case XFS_SBS_FDBLOCKS:
+		if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+			status = xfs_icsb_modify_counters(mp, field,
+							delta, rsvd);
+			break;
+		}
+		/* FALLTHROUGH */
+#endif
+	default:
+		s = XFS_SB_LOCK(mp);
+		status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+		XFS_SB_UNLOCK(mp, s);
+		break;
+	}

-	s = XFS_SB_LOCK(mp);
-	status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
-	XFS_SB_UNLOCK(mp, s);
 	return status;
 }
-
+
 /*
  * xfs_mod_incore_sb_batch() is used to change more than one field
  * in the in-core superblock structure at a time.  This modification
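The dispatch above keeps the caller interface unchanged: on SMP kernels the three hot fields route to the per-cpu fast path, while everything else (and uniprocessor builds) falls through to the locked slow path. A hedged sketch of a caller — foo_reserve_blocks() is invented, xfs_mod_incore_sb() is the real interface:

/* Illustration only: reserving free blocks via the unchanged API. */
STATIC int
foo_reserve_blocks(xfs_mount_t *mp, int nblks, int rsvd)
{
	int	error;

	/* negative delta consumes blocks; fails rather than go below zero */
	error = xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, -nblks, rsvd);
	if (error)
		return error;		/* treat as ENOSPC */
	/* ... on any later failure, give the blocks back: */
	/* xfs_mod_incore_sb(mp, XFS_SBS_FDBLOCKS, nblks, rsvd); */
	return 0;
}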
@@ -1445,8 +1493,26 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp,
 		 * from the loop so we'll fall into the undo loop
 		 * below.
 		 */
-		status = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field,
-						msbp->msb_delta, rsvd);
+		switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+		case XFS_SBS_ICOUNT:
+		case XFS_SBS_IFREE:
+		case XFS_SBS_FDBLOCKS:
+			if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+				status = xfs_icsb_modify_counters_locked(mp,
+							msbp->msb_field,
+							msbp->msb_delta, rsvd);
+				break;
+			}
+			/* FALLTHROUGH */
+#endif
+		default:
+			status = xfs_mod_incore_sb_unlocked(mp,
+						msbp->msb_field,
+						msbp->msb_delta, rsvd);
+			break;
+		}
+
 		if (status != 0) {
 			break;
 		}
@@ -1463,8 +1529,28 @@ xfs_mod_incore_sb_batch(xfs_mount_t *mp,
 	if (status != 0) {
 		msbp--;
 		while (msbp >= msb) {
-			status = xfs_mod_incore_sb_unlocked(mp,
-				msbp->msb_field, -(msbp->msb_delta), rsvd);
+			switch (msbp->msb_field) {
+#ifdef HAVE_PERCPU_SB
+			case XFS_SBS_ICOUNT:
+			case XFS_SBS_IFREE:
+			case XFS_SBS_FDBLOCKS:
+				if (!(mp->m_flags & XFS_MOUNT_NO_PERCPU_SB)) {
+					status =
+					    xfs_icsb_modify_counters_locked(mp,
+							msbp->msb_field,
+							-(msbp->msb_delta),
+							rsvd);
+					break;
+				}
+				/* FALLTHROUGH */
+#endif
+			default:
+				status = xfs_mod_incore_sb_unlocked(mp,
+							msbp->msb_field,
+							-(msbp->msb_delta),
+							rsvd);
+				break;
+			}
 			ASSERT(status == 0);
 			msbp--;
 		}
@@ -1577,3 +1663,445 @@ xfs_mount_log_sbunit(
 	xfs_mod_sb(tp, fields);
 	xfs_trans_commit(tp, 0, NULL);
 }
+
+
+#ifdef HAVE_PERCPU_SB
+/*
+ * Per-cpu incore superblock counters
+ *
+ * Simple concept, difficult implementation
+ *
+ * Basically, replace the incore superblock counters with a distributed per cpu
+ * counter for contended fields (e.g. free block count).
+ *
+ * Difficulties arise in that the incore sb is used for ENOSPC checking, and
+ * hence needs to be accurately read when we are running low on space. Hence
+ * there is a method to enable and disable the per-cpu counters based on how
+ * much "stuff" is available in them.
+ *
+ * Basically, a counter is enabled if there is enough free resource to justify
+ * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local
+ * ENOSPC), then we disable the counters to synchronise all callers and
+ * re-distribute the available resources.
+ *
+ * If, once we redistributed the available resources, we still get a failure,
+ * we disable the per-cpu counter and go through the slow path.
+ *
+ * The slow path is the current xfs_mod_incore_sb() function.  This means that
+ * when we disable a per-cpu counter, we need to drain its resources back to
+ * the global superblock. We do this after disabling the counter to prevent
+ * more threads from queueing up on the counter.
+ *
+ * Essentially, this means that we still need a lock in the fast path to enable
+ * synchronisation between the global counters and the per-cpu counters. This
+ * is not a problem because the lock will be local to a CPU almost all the time
+ * and have little contention except when we get to ENOSPC conditions.
+ *
+ * Basically, this lock becomes a barrier that enables us to lock out the fast
+ * path while we do things like enabling and disabling counters and
+ * synchronising the counters.
+ *
+ * Locking rules:
+ *
+ *	1. XFS_SB_LOCK() before picking up per-cpu locks
+ *	2. per-cpu locks always picked up via for_each_online_cpu() order
+ *	3. accurate counter sync requires XFS_SB_LOCK + per cpu locks
+ *	4. modifying per-cpu counters requires holding per-cpu lock
+ *	5. modifying global counters requires holding XFS_SB_LOCK
+ *	6. enabling or disabling a counter requires holding the XFS_SB_LOCK
+ *	   and _none_ of the per-cpu locks.
+ *
+ * Disabled counters are only ever re-enabled by a balance operation
+ * that results in more free resources per CPU than a given threshold.
+ * To ensure counters don't remain disabled, they are rebalanced when
+ * the global resource goes above a higher threshold (i.e. some hysteresis
+ * is present to prevent thrashing).
+ *
+ * Note: hotplug CPUs not yet supported
+ */
+int
+xfs_icsb_init_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t);
+	if (mp->m_sb_cnts == NULL)
+		return -ENOMEM;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_lock_init(&cntp->icsb_lock);
+	}
+	/*
+	 * start with all counters disabled so that the
+	 * initial balance kicks us off correctly
+	 */
+	mp->m_icsb_counters = -1;
+	return 0;
+}
+
+STATIC void
+xfs_icsb_destroy_counters(
+	xfs_mount_t	*mp)
+{
+	if (mp->m_sb_cnts)
+		free_percpu(mp->m_sb_cnts);
+}
+
+
+STATIC inline void
+xfs_icsb_lock_all_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_lock(&cntp->icsb_lock);
+	}
+}
+
+STATIC inline void
+xfs_icsb_unlock_all_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		spin_unlock(&cntp->icsb_lock);
+	}
+}
+
+STATIC void
+xfs_icsb_count(
+	xfs_mount_t	*mp,
+	xfs_icsb_cnts_t	*cnt,
+	int		flags)
+{
+	xfs_icsb_cnts_t *cntp;
+	int		i;
+
+	memset(cnt, 0, sizeof(xfs_icsb_cnts_t));
+
+	if (!(flags & XFS_ICSB_LAZY_COUNT))
+		xfs_icsb_lock_all_counters(mp);
+
+	for_each_online_cpu(i) {
+		cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i);
+		cnt->icsb_icount += cntp->icsb_icount;
+		cnt->icsb_ifree += cntp->icsb_ifree;
+		cnt->icsb_fdblocks += cntp->icsb_fdblocks;
+	}
+
+	if (!(flags & XFS_ICSB_LAZY_COUNT))
+		xfs_icsb_unlock_all_counters(mp);
+}
+
+STATIC int
+xfs_icsb_counter_disabled(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field)
+{
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+	return test_bit(field, &mp->m_icsb_counters);
+}
+
+STATIC int
+xfs_icsb_disable_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field)
+{
+	xfs_icsb_cnts_t	cnt;
+
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+	xfs_icsb_lock_all_counters(mp);
+	if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
+		/* drain back to superblock */
+
+		xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT);
+		switch(field) {
+		case XFS_SBS_ICOUNT:
+			mp->m_sb.sb_icount = cnt.icsb_icount;
+			break;
+		case XFS_SBS_IFREE:
+			mp->m_sb.sb_ifree = cnt.icsb_ifree;
+			break;
+		case XFS_SBS_FDBLOCKS:
+			mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+			break;
+		default:
+			BUG();
+		}
+	}
+
+	xfs_icsb_unlock_all_counters(mp);
+
+	return 0;
+}
+
+STATIC void
+xfs_icsb_enable_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	uint64_t	count,
+	uint64_t	resid)
+{
+	xfs_icsb_cnts_t	*cntp;
+	int		i;
+
+	ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS));
+
+	xfs_icsb_lock_all_counters(mp);
+	for_each_online_cpu(i) {
+		cntp = per_cpu_ptr(mp->m_sb_cnts, i);
+		switch (field) {
+		case XFS_SBS_ICOUNT:
+			cntp->icsb_icount = count + resid;
+			break;
+		case XFS_SBS_IFREE:
+			cntp->icsb_ifree = count + resid;
+			break;
+		case XFS_SBS_FDBLOCKS:
+			cntp->icsb_fdblocks = count + resid;
+			break;
+		default:
+			BUG();
+			break;
+		}
+		resid = 0;
+	}
+	clear_bit(field, &mp->m_icsb_counters);
+	xfs_icsb_unlock_all_counters(mp);
+}
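Locking rules 1 and 2 in the design comment above are a standard total-order scheme: deadlock requires two threads acquiring the same pair of locks in opposite orders, which a fixed global order makes impossible. A standalone model of the invariant, not XFS code (the pthread names are illustrative):

/* Sketch only: the lock ordering from rules 1 and 2 above. */
#include <pthread.h>

#define NCPUS 4

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER; /* XFS_SB_LOCK stand-in */
static pthread_mutex_t cpu_lock[NCPUS];	/* icsb_lock stand-ins; pthread_mutex_init() them first */

void lock_all(void)
{
	pthread_mutex_lock(&sb_lock);		/* rule 1: global lock first */
	for (int i = 0; i < NCPUS; i++)		/* rule 2: fixed cpu order */
		pthread_mutex_lock(&cpu_lock[i]);
}

void unlock_all(void)
{
	for (int i = NCPUS - 1; i >= 0; i--)
		pthread_mutex_unlock(&cpu_lock[i]);
	pthread_mutex_unlock(&sb_lock);
}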
+
+STATIC void
+xfs_icsb_sync_counters_int(
+	xfs_mount_t	*mp,
+	int		flags)
+{
+	xfs_icsb_cnts_t	cnt;
+	int		s;
+
+	/* lock out the fast path unless the caller already holds the sb lock */
+	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+		s = XFS_SB_LOCK(mp);
+
+	xfs_icsb_count(mp, &cnt, flags);
+
+	/* update mp->m_sb fields from the aggregated per-cpu totals */
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT))
+		mp->m_sb.sb_icount = cnt.icsb_icount;
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE))
+		mp->m_sb.sb_ifree = cnt.icsb_ifree;
+	if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS))
+		mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks;
+
+	if ((flags & XFS_ICSB_SB_LOCKED) == 0)
+		XFS_SB_UNLOCK(mp, s);
+}
+
+/*
+ * Accurate update of per-cpu counters to incore superblock
+ */
+STATIC void
+xfs_icsb_sync_counters(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_sync_counters_int(mp, 0);
+}
+
+/*
+ * lazy addition used for things like df, background sb syncs, etc
+ */
+void
+xfs_icsb_sync_counters_lazy(
+	xfs_mount_t	*mp)
+{
+	xfs_icsb_sync_counters_int(mp, XFS_ICSB_LAZY_COUNT);
+}
+
+/*
+ * Balance and enable/disable counters as necessary.
+ *
+ * Thresholds for re-enabling counters are somewhat magic.
+ * Inode counts are chosen to be the same number as a single
+ * on-disk allocation chunk per CPU, and free blocks is
+ * something far enough from zero that we aren't going to thrash
+ * when we get near ENOSPC.
+ */
+#define XFS_ICSB_INO_CNTR_REENABLE	64
+#define XFS_ICSB_FDBLK_CNTR_REENABLE	512
+STATIC void
+xfs_icsb_balance_counter(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		flags)
+{
+	uint64_t	count, resid = 0;
+	int		weight = num_online_cpus();
+	int		s;
+
+	if (!(flags & XFS_ICSB_SB_LOCKED))
+		s = XFS_SB_LOCK(mp);
+
+	/* disable counter and sync counter */
+	xfs_icsb_disable_counter(mp, field);
+
+	/* update counters - first CPU gets residual */
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		count = mp->m_sb.sb_icount;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_INO_CNTR_REENABLE)
+			goto out;
+		break;
+	case XFS_SBS_IFREE:
+		count = mp->m_sb.sb_ifree;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_INO_CNTR_REENABLE)
+			goto out;
+		break;
+	case XFS_SBS_FDBLOCKS:
+		count = mp->m_sb.sb_fdblocks;
+		resid = do_div(count, weight);
+		if (count < XFS_ICSB_FDBLK_CNTR_REENABLE)
+			goto out;
+		break;
+	default:
+		BUG();
+		break;
+	}
+
+	xfs_icsb_enable_counter(mp, field, count, resid);
+out:
+	if (!(flags & XFS_ICSB_SB_LOCKED))
+		XFS_SB_UNLOCK(mp, s);
+}
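To make the balance arithmetic concrete with invented numbers: with sb_fdblocks = 10007 and four online CPUs, do_div() leaves count = 2501 and returns resid = 3, so CPU 0 is seeded with 2504 blocks and CPUs 1-3 with 2501 each; because 2501 >= XFS_ICSB_FDBLK_CNTR_REENABLE (512), the counter is re-enabled. The same arithmetic as a standalone sketch:

/* Sketch only: the distribution arithmetic of the balance above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t count = 10007;			/* global free-block count */
	unsigned int weight = 4;		/* num_online_cpus() */
	uint64_t resid = count % weight;	/* do_div() remainder */

	count /= weight;			/* do_div() quotient */
	for (unsigned int cpu = 0; cpu < weight; cpu++)
		printf("cpu%u starts with %llu blocks\n", cpu,
		       (unsigned long long)(count + (cpu == 0 ? resid : 0)));
	/* re-enable only while count >= 512 (XFS_ICSB_FDBLK_CNTR_REENABLE) */
	return 0;
}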
+
+STATIC int
+xfs_icsb_modify_counters_int(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd,
+	int		flags)
+{
+	xfs_icsb_cnts_t	*icsbp;
+	long long	lcounter;	/* long counter for 64 bit fields */
+	int		cpu, s, locked = 0;
+	int		ret = 0, balance_done = 0;
+
+again:
+	cpu = get_cpu();
+	icsbp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, cpu);
+	spin_lock(&icsbp->icsb_lock);
+	if (unlikely(xfs_icsb_counter_disabled(mp, field)))
+		goto slow_path;
+
+	switch (field) {
+	case XFS_SBS_ICOUNT:
+		lcounter = icsbp->icsb_icount;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_icount = lcounter;
+		break;
+
+	case XFS_SBS_IFREE:
+		lcounter = icsbp->icsb_ifree;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_ifree = lcounter;
+		break;
+
+	case XFS_SBS_FDBLOCKS:
+		BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0);
+
+		lcounter = icsbp->icsb_fdblocks;
+		lcounter += delta;
+		if (unlikely(lcounter < 0))
+			goto slow_path;
+		icsbp->icsb_fdblocks = lcounter;
+		break;
+	default:
+		BUG();
+		break;
+	}
+	spin_unlock(&icsbp->icsb_lock);
+	put_cpu();
+	if (locked)
+		XFS_SB_UNLOCK(mp, s);
+	return 0;
+
+	/*
+	 * The slow path needs to be run with the SB_LOCK
+	 * held so that we prevent other threads from
+	 * attempting to run this path at the same time.
+	 * This provides exclusion for the balancing code,
+	 * and exclusive fallback if the balance does not
+	 * provide enough resources to continue in an unlocked
+	 * manner.
+	 */
+slow_path:
+	spin_unlock(&icsbp->icsb_lock);
+	put_cpu();
+
+	/* need to hold superblock lock in case we need
+	 * to disable a counter */
+	if (!(flags & XFS_ICSB_SB_LOCKED)) {
+		s = XFS_SB_LOCK(mp);
+		locked = 1;
+		flags |= XFS_ICSB_SB_LOCKED;
+	}
+	if (!balance_done) {
+		xfs_icsb_balance_counter(mp, field, flags);
+		balance_done = 1;
+		goto again;
+	} else {
+		/*
+		 * we might not have enough on this local
+		 * cpu to allocate for a bulk request.
+		 * We need to drain this field from all CPUs
+		 * and disable the counter fastpath
+		 */
+		xfs_icsb_disable_counter(mp, field);
+	}
+
+	ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd);
+
+	if (locked)
+		XFS_SB_UNLOCK(mp, s);
+	return ret;
+}
+
+STATIC int
+xfs_icsb_modify_counters(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd)
+{
+	return xfs_icsb_modify_counters_int(mp, field, delta, rsvd, 0);
+}
+
+/*
+ * Called when superblock is already locked
+ */
+STATIC int
+xfs_icsb_modify_counters_locked(
+	xfs_mount_t	*mp,
+	xfs_sb_field_t	field,
+	int		delta,
+	int		rsvd)
+{
+	return xfs_icsb_modify_counters_int(mp, field, delta,
+						rsvd, XFS_ICSB_SB_LOCKED);
+}
+#endif
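The two sync variants differ only in whether xfs_icsb_count() takes the per-cpu locks: the lazy form tolerates slightly stale totals, which is why the xfs_fs_counts() and xfs_statvfs() hunks elsewhere in this patch call it before reading m_sb. A hedged sketch of that reader pattern (foo_report_free() is invented; the other interfaces are real):

/* Illustration only: statfs-style readers use the lazy sync. */
STATIC void
foo_report_free(xfs_mount_t *mp)
{
	unsigned long	s;
	__uint64_t	free;

	xfs_icsb_sync_counters_lazy(mp);	/* fold per-cpu values in */
	s = XFS_SB_LOCK(mp);
	free = mp->m_sb.sb_fdblocks;		/* read a stable snapshot */
	XFS_SB_UNLOCK(mp, s);
	printk(KERN_DEBUG "free blocks: %llu\n", (unsigned long long)free);
}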
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index cd3cf96..7cca511 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -267,6 +267,32 @@ typedef struct xfs_ioops {
 #define XFS_IODONE(vfsp) \
 	(*(mp)->m_io_ops.xfs_iodone)(vfsp)

+#ifdef HAVE_PERCPU_SB
+
+/*
+ * Valid per-cpu incore superblock counters. Note that if you add new counters,
+ * you may need to define new counter disabled bit field descriptors as there
+ * are more possible fields in the superblock that can fit in a bitfield on a
+ * 32 bit platform. The XFS_SBS_* values for the current counters just fit.
+ */
+typedef struct xfs_icsb_cnts {
+	uint64_t	icsb_fdblocks;
+	uint64_t	icsb_ifree;
+	uint64_t	icsb_icount;
+	spinlock_t	icsb_lock;
+} xfs_icsb_cnts_t;
+
+#define XFS_ICSB_SB_LOCKED	(1 << 0)	/* sb already locked */
+#define XFS_ICSB_LAZY_COUNT	(1 << 1)	/* accuracy not needed */
+
+extern int	xfs_icsb_init_counters(struct xfs_mount *);
+extern void	xfs_icsb_sync_counters_lazy(struct xfs_mount *);
+
+#else
+#define xfs_icsb_init_counters(mp)	(0)
+#define xfs_icsb_sync_counters_lazy(mp)	do { } while (0)
+#endif

 typedef struct xfs_mount {
 	bhv_desc_t		m_bhv;		/* vfs xfs behavior */
@@ -372,6 +398,10 @@ typedef struct xfs_mount {
 	struct xfs_qmops	m_qm_ops;	/* vector of XQM ops */
 	struct xfs_ioops	m_io_ops;	/* vector of I/O ops */
 	atomic_t		m_active_trans;	/* number trans frozen */
+#ifdef HAVE_PERCPU_SB
+	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
+	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
+#endif
 } xfs_mount_t;

 /*
@@ -386,8 +416,6 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
						   operations, typically for
						   disk errors in metadata */
-#define XFS_MOUNT_NOATIME	(1ULL << 5)	/* don't modify inode access
-						   times on reads */
 #define XFS_MOUNT_RETERR	(1ULL << 6)	/* return alignment errors to
						   user */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
@@ -411,6 +439,8 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
 #define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
						 * I/O size in stat() */
+#define XFS_MOUNT_NO_PERCPU_SB	(1ULL << 23)	/* don't use per-cpu superblock
+						   counters */

 /*
@@ -548,6 +578,8 @@ extern void	xfs_unmountfs_close(xfs_moun
 extern int	xfs_unmountfs_writesb(xfs_mount_t *);
 extern int	xfs_unmount_flush(xfs_mount_t *, int);
 extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int, int);
+extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
+			int, int);
 extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
			uint, int);
 extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index d77901c..e48befa 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -380,7 +380,7 @@ typedef struct xfs_trans {
 	xfs_trans_header_t	t_header;	/* header for in-log trans */
 	unsigned int		t_busy_free;	/* busy descs free */
 	xfs_log_busy_chunk_t	t_busy;		/* busy/async free blocks */
-	xfs_pflags_t		t_pflags;	/* saved pflags state */
+	unsigned long		t_pflags;	/* saved process flags state */
 } xfs_trans_t;

 #endif	/* __KERNEL__ */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index b6ad370..5dd84fe 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -55,7 +55,7 @@
 #include "xfs_clnt.h"
 #include "xfs_fsops.h"

-STATIC int xfs_sync(bhv_desc_t *, int, cred_t *);
+STATIC int	xfs_sync(bhv_desc_t *, int, cred_t *);

 int
 xfs_init(void)
@@ -77,11 +77,12 @@ xfs_init(void)
						"xfs_bmap_free_item");
 	xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
						"xfs_btree_cur");
-	xfs_inode_zone = kmem_zone_init(sizeof(xfs_inode_t), "xfs_inode");
 	xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
 	xfs_da_state_zone =
		kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
 	xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+	xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");

 	/*
 	 * The size of the zone allocated buf log item is the maximum
@@ -93,17 +94,30 @@ xfs_init(void)
			(((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
			  NBWORD) * sizeof(int))),
			"xfs_buf_item");
-	xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
-			((XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+	xfs_efd_zone =
+		kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+				((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+				 sizeof(xfs_extent_t))),
			      "xfs_efd_item");
-	xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
-			((XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(xfs_extent_t))),
+	xfs_efi_zone =
+		kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+				((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+				 sizeof(xfs_extent_t))),
			      "xfs_efi_item");
-	xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
-	xfs_ili_zone = kmem_zone_init(sizeof(xfs_inode_log_item_t), "xfs_ili");
-	xfs_chashlist_zone = kmem_zone_init(sizeof(xfs_chashlist_t),
-					    "xfs_chashlist");
-	xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
+
+	/*
+	 * These zones warrant special memory allocator hints
+	 */
+	xfs_inode_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+					KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
+					KM_ZONE_SPREAD, NULL);
+	xfs_ili_zone =
+		kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+					KM_ZONE_SPREAD, NULL);
+	xfs_chashlist_zone =
+		kmem_zone_init_flags(sizeof(xfs_chashlist_t), "xfs_chashlist",
+					KM_ZONE_SPREAD, NULL);

 	/*
 	 * Allocate global trace buffers.
@@ -258,8 +272,6 @@ xfs_start_flags(
 		mp->m_inoadd = XFS_INO64_OFFSET;
 	}
 #endif
-	if (ap->flags & XFSMNT_NOATIME)
-		mp->m_flags |= XFS_MOUNT_NOATIME;
 	if (ap->flags & XFSMNT_RETERR)
 		mp->m_flags |= XFS_MOUNT_RETERR;
 	if (ap->flags & XFSMNT_NOALIGN)
@@ -654,11 +666,6 @@ xfs_mntupdate(
 	xfs_mount_t	*mp = XFS_BHVTOM(bdp);
 	int		error;

-	if (args->flags & XFSMNT_NOATIME)
-		mp->m_flags |= XFS_MOUNT_NOATIME;
-	else
-		mp->m_flags &= ~XFS_MOUNT_NOATIME;
-
 	if (args->flags & XFSMNT_BARRIER)
 		mp->m_flags |= XFS_MOUNT_BARRIER;
 	else
@@ -814,6 +821,7 @@ xfs_statvfs(

 	statp->f_type = XFS_SB_MAGIC;

+	xfs_icsb_sync_counters_lazy(mp);
 	s = XFS_SB_LOCK(mp);
 	statp->f_bsize = sbp->sb_blocksize;
 	lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;