GIT 23547ca541933404e08d6ba06091452ff7420739 git://oss.sgi.com:8090/xfs/xfs-2.6

commit 23547ca541933404e08d6ba06091452ff7420739
Author: David Chinner
Date:   Mon May 14 18:24:23 2007 +1000

    [XFS] Barriers need to be dynamically checked and switched off

    If the underlying block device suddenly stops supporting barriers, we
    need to handle the -EOPNOTSUPP error in a sane manner rather than
    shutting down the filesystem. If we get this error, clear the barrier
    flag, reissue the I/O, and tell the world bad things are occurring.

    SGI-PV: 964544
    SGI-Modid: xfs-linux-melb:xfs-kern:28568a

    Signed-off-by: David Chinner
    Signed-off-by: Christoph Hellwig
    Signed-off-by: Tim Shimmin

commit 82ac53af75bfe6918c4826733728a62f72119e07
Author: David Chinner
Date:   Mon May 14 18:24:16 2007 +1000

    [XFS] Fix use-after-free during log unmount.

    Don't reference the log buffer after running the callbacks, as the
    callbacks can trigger the log buffers to be freed during unmount.

    SGI-PV: 964545
    SGI-Modid: xfs-linux-melb:xfs-kern:28567a

    Signed-off-by: David Chinner
    Signed-off-by: Christoph Hellwig
    Signed-off-by: Tim Shimmin

commit 9aac40e94eb4b738de85f57bb2410d2bed2f3612
Author: David Chinner
Date:   Mon May 14 18:24:09 2007 +1000

    [XFS] Sleeping with the ilock waiting for I/O completion is Bad.

    Recent fixes to the filesystem freezing code introduced a vn_iowait
    call in the middle of the sync code. Unfortunately, at the point where
    this call was added we are holding the ilock. The ilock is needed by
    I/O completion for unwritten extent conversion and, now, for updating
    the file size. Hence I/O cannot complete if we hold the ilock while
    waiting for I/O completion. Fix up the bug and clean up the code
    around it.

    SGI-PV: 963674
    SGI-Modid: xfs-linux-melb:xfs-kern:28566a

    Signed-off-by: David Chinner
    Signed-off-by: Christoph Hellwig
    Signed-off-by: Tim Shimmin

commit aafe3b78c46087b5634568288f98a30da38cc852
Author: Nathan Scott
Date:   Mon May 14 18:24:02 2007 +1000

    [XFS] Don't grow filesystems past the size they can index.

    When growing a filesystem we don't check to see if the new size
    overflows the page cache index range, so we can do silly things like
    grow a filesystem past 16TB on a 32-bit machine (with 4KiB pages, a
    32-bit page cache index can address at most 2^32 * 4KiB = 16TiB).
    Check new filesystem sizes against the limits the kernel can support.

    SGI-PV: 957886
    SGI-Modid: xfs-linux-melb:xfs-kern:28563a

    Signed-Off-By: Nathan Scott
    Signed-off-by: David Chinner
    Signed-off-by: Tim Shimmin

commit 5ebcebedcb4323e665c75ad03faa6e014bfa44c5
Author: Christoph Hellwig
Date:   Mon May 14 18:23:50 2007 +1000

    [XFS] Only use refcounted pages for I/O

    Many block drivers (aoe, iscsi) really want refcountable pages in
    bios, which is what almost everyone sends down. XFS unfortunately has
    a few places where it sends down buffers that may come from kmalloc,
    which breaks them. Fix the places that use kmalloc()d buffers.

    SGI-PV: 964546
    SGI-Modid: xfs-linux-melb:xfs-kern:28562a

    Signed-Off-By: Christoph Hellwig
    Signed-off-by: David Chinner
    Signed-off-by: Tim Shimmin
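For readers unfamiliar with the barrier fallback in the first patch, the
sketch below distills the pattern outside the kernel: issue the write with
an ordered/barrier flag, and if completion reports -EOPNOTSUPP, clear the
flag, reissue the I/O, and record that barriers are off so upper layers
stop requesting them. This is a minimal illustrative sketch only; the
io_request structure, submit_io()/io_done() helpers, and IO_ORDERED flag
are hypothetical stand-ins for the xfs_buf/bio machinery in the patch, not
kernel APIs.

/*
 * Minimal sketch of the -EOPNOTSUPP barrier fallback.  All names here
 * are hypothetical; only the clear-flag-and-reissue logic mirrors the
 * patch below.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define IO_ORDERED	(1 << 0)	/* request a barrier (ordered) write */

struct io_request {
	unsigned int	flags;
	int		error;		/* 0 or a positive errno, XFS-style */
};

static bool barriers_supported = true;	/* what upper layers consult */

/* Pretend device: rejects ordered writes, accepts plain ones. */
static void submit_io(struct io_request *req)
{
	req->error = (req->flags & IO_ORDERED) ? EOPNOTSUPP : 0;
}

static void io_done(struct io_request *req)
{
	if (req->error == EOPNOTSUPP && (req->flags & IO_ORDERED)) {
		/*
		 * The device stopped honouring barriers mid-flight.
		 * Clear the ordered flag and reissue the I/O rather than
		 * failing it; warn so the change is visible, and let
		 * upper layers see that the flag was stripped.
		 */
		fprintf(stderr, "barriers no longer supported, disabling\n");
		barriers_supported = false;
		req->flags &= ~IO_ORDERED;
		submit_io(req);
		io_done(req);		/* complete the reissued write */
		return;
	}

	/* normal completion handling would run here */
}

int main(void)
{
	struct io_request req = { .flags = IO_ORDERED };

	submit_io(&req);
	io_done(&req);
	return req.error;	/* 0: reissue succeeded without barriers */
}

Note that, as in the patch, the retry lives in the completion path, since
only completion sees the -EOPNOTSUPP; that is also why xlog_iodone() below
must separately notice that the ordered flag was stripped and disable
barriers for the whole mount.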
 fs/xfs/linux-2.6/xfs_buf.c   |   62 +++++++++++++++++++-----------------
 fs/xfs/linux-2.6/xfs_buf.h   |    2 +
 fs/xfs/linux-2.6/xfs_super.c |    8 -----
 fs/xfs/xfs_fsops.c           |    2 +
 fs/xfs/xfs_log.c             |   48 ++++++++++++++++++----------
 fs/xfs/xfs_mount.c           |   35 +++++++++++++-------
 fs/xfs/xfs_mount.h           |    1 +
 fs/xfs/xfs_rtalloc.c         |    4 ++
 fs/xfs/xfs_vfsops.c          |   73 ++++++++++++++++--------------------------
 9 files changed, 123 insertions(+), 112 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index fe4f66a..34049b3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -314,7 +314,7 @@ xfs_buf_free(
 	ASSERT(list_empty(&bp->b_hash_list));
 
-	if (bp->b_flags & _XBF_PAGE_CACHE) {
+	if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 		uint		i;
 
 		if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
@@ -323,18 +323,11 @@ xfs_buf_free(
 		for (i = 0; i < bp->b_page_count; i++) {
 			struct page	*page = bp->b_pages[i];
 
-			ASSERT(!PagePrivate(page));
+			if (bp->b_flags & _XBF_PAGE_CACHE)
+				ASSERT(!PagePrivate(page));
 			page_cache_release(page);
 		}
 		_xfs_buf_free_pages(bp);
-	} else if (bp->b_flags & _XBF_KMEM_ALLOC) {
-		/*
-		 * XXX(hch): bp->b_count_desired might be incorrect (see
-		 * xfs_buf_associate_memory for details), but fortunately
-		 * the Linux version of kmem_free ignores the len argument..
-		 */
-		kmem_free(bp->b_addr, bp->b_count_desired);
-		_xfs_buf_free_pages(bp);
 	}
 
 	xfs_buf_deallocate(bp);
@@ -764,41 +757,41 @@ xfs_buf_get_noaddr(
 	size_t			len,
 	xfs_buftarg_t		*target)
 {
-	size_t			malloc_len = len;
+	unsigned long		page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+	int			error, i;
 	xfs_buf_t		*bp;
-	void			*data;
-	int			error;
 
 	bp = xfs_buf_allocate(0);
 	if (unlikely(bp == NULL))
 		goto fail;
 	_xfs_buf_initialize(bp, target, 0, len, 0);
 
- try_again:
-	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL | KM_LARGE);
-	if (unlikely(data == NULL))
+	error = _xfs_buf_get_pages(bp, page_count, 0);
+	if (error)
 		goto fail_free_buf;
 
-	/* check whether alignment matches.. */
-	if ((__psunsigned_t)data !=
-	    ((__psunsigned_t)data & ~target->bt_smask)) {
-		/* .. else double the size and try again */
-		kmem_free(data, malloc_len);
-		malloc_len <<= 1;
-		goto try_again;
+	for (i = 0; i < page_count; i++) {
+		bp->b_pages[i] = alloc_page(GFP_KERNEL);
+		if (!bp->b_pages[i])
+			goto fail_free_mem;
 	}
+	bp->b_flags |= _XBF_PAGES;
 
-	error = xfs_buf_associate_memory(bp, data, len);
-	if (error)
+	error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+	if (unlikely(error)) {
+		printk(KERN_WARNING "%s: failed to map pages\n",
+				__FUNCTION__);
 		goto fail_free_mem;
-	bp->b_flags |= _XBF_KMEM_ALLOC;
+	}
 
 	xfs_buf_unlock(bp);
 
-	XB_TRACE(bp, "no_daddr", data);
+	XB_TRACE(bp, "no_daddr", len);
 	return bp;
+
  fail_free_mem:
-	kmem_free(data, malloc_len);
+	while (--i >= 0)
+		__free_page(bp->b_pages[i]);
  fail_free_buf:
 	xfs_buf_free(bp);
  fail:
@@ -1000,7 +993,18 @@ xfs_buf_iodone_work(
 	xfs_buf_t		*bp =
 		container_of(work, xfs_buf_t, b_iodone_work);
 
-	if (bp->b_iodone)
+	/*
+	 * We can get an EOPNOTSUPP for ordered writes.  Here we clear the
+	 * ordered flag and reissue them.  Because we can't tell the higher
+	 * layers directly that they should not issue ordered I/O anymore, they
+	 * need to check if the ordered flag was cleared during I/O completion.
+	 */
+	if ((bp->b_error == EOPNOTSUPP) &&
+	    (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
+		XB_TRACE(bp, "ordered_retry", bp->b_iodone);
+		bp->b_flags &= ~XBF_ORDERED;
+		xfs_buf_iorequest(bp);
+	} else if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);

diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index b6241f6..b5908a3 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -63,7 +63,7 @@ typedef enum {
 
 	/* flags used only internally */
 	_XBF_PAGE_CACHE = (1 << 17),/* backed by pagecache		   */
-	_XBF_KMEM_ALLOC = (1 << 18),/* backed by kmem_alloc()		   */
+	_XBF_PAGES = (1 << 18),	    /* backed by refcounted pages	   */
 	_XBF_RUN_QUEUES = (1 << 19),/* run block device task queue	   */
 	_XBF_DELWRI_Q = (1 << 21),  /* buffer on delwri queue		   */
 } xfs_buf_flags_t;

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bf9a9d5..dad8239 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -304,14 +304,6 @@ xfs_mountfs_check_barriers(xfs_mount_t *
 		return;
 	}
 
-	if (mp->m_ddev_targp->bt_bdev->bd_disk->queue->ordered ==
-					QUEUE_ORDERED_NONE) {
-		xfs_fs_cmn_err(CE_NOTE, mp,
-		  "Disabling barriers, not supported by the underlying device");
-		mp->m_flags &= ~XFS_MOUNT_BARRIER;
-		return;
-	}
-
 	if (xfs_readonly_buftarg(mp->m_ddev_targp)) {
 		xfs_fs_cmn_err(CE_NOTE, mp,
 		  "Disabling barriers, underlying device is readonly");

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index b599e6b..25e5eae 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -140,6 +140,8 @@ xfs_growfs_data_private(
 	pct = in->imaxpct;
 	if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100)
 		return XFS_ERROR(EINVAL);
+	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
+		return error;
 	dpct = pct - mp->m_sb.sb_imax_pct;
 	error = xfs_read_buf(mp, mp->m_ddev_targp,
 			XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c48bf61..39598ab 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -951,6 +951,19 @@ xlog_iodone(xfs_buf_t *bp)
 	l = iclog->ic_log;
 
 	/*
+	 * If the ordered flag has been removed by a lower
+	 * layer, it means the underlying device no longer supports
+	 * barrier I/O. Warn loudly and turn off barriers.
+	 */
+	if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) {
+		l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER;
+		xfs_fs_cmn_err(CE_WARN, l->l_mp,
+				"xlog_iodone: Barriers are no longer supported"
+				" by device. Disabling barriers\n");
+		xfs_buftrace("XLOG_IODONE BARRIERS OFF", bp);
+	}
+
+	/*
 	 * Race to shutdown the filesystem if we see an error.
 	 */
 	if (XFS_TEST_ERROR((XFS_BUF_GETERROR(bp)), l->l_mp,
@@ -967,14 +980,16 @@ xlog_iodone(xfs_buf_t *bp)
 	} else if (iclog->ic_state & XLOG_STATE_IOERROR) {
 		aborted = XFS_LI_ABORTED;
 	}
+
+	/* log I/O is always issued ASYNC */
+	ASSERT(XFS_BUF_ISASYNC(bp));
 	xlog_state_done_syncing(iclog, aborted);
-	if (!(XFS_BUF_ISASYNC(bp))) {
-		/*
-		 * Corresponding psema() will be done in bwrite().  If we don't
-		 * vsema() here, panic.
-		 */
-		XFS_BUF_V_IODONESEMA(bp);
-	}
+	/*
+	 * do not reference the buffer (bp) here as we could race
+	 * with it being freed after writing the unmount record to the
+	 * log.
+	 */
+
 }	/* xlog_iodone */
 
 /*
@@ -1199,11 +1214,18 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		*iclogp = (xlog_in_core_t *)
 			  kmem_zalloc(sizeof(xlog_in_core_t), KM_SLEEP);
 		iclog = *iclogp;
-		iclog->hic_data = (xlog_in_core_2_t *)
-			  kmem_zalloc(iclogsize, KM_SLEEP | KM_LARGE);
-
 		iclog->ic_prev = prev_iclog;
 		prev_iclog = iclog;
+
+		bp = xfs_buf_get_noaddr(log->l_iclog_size, mp->m_logdev_targp);
+		if (!XFS_BUF_CPSEMA(bp))
+			ASSERT(0);
+		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
+		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
+		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
+		iclog->ic_bp = bp;
+		iclog->hic_data = bp->b_addr;
+
 		log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
 
 		head = &iclog->ic_header;
@@ -1216,11 +1238,6 @@ xlog_alloc_log(xfs_mount_t	*mp,
 		INT_SET(head->h_fmt, ARCH_CONVERT, XLOG_FMT);
 		memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
 
-		bp = xfs_buf_get_empty(log->l_iclog_size, mp->m_logdev_targp);
-		XFS_BUF_SET_IODONE_FUNC(bp, xlog_iodone);
-		XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
-		XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
-		iclog->ic_bp = bp;
-
 		iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
 		iclog->ic_state = XLOG_STATE_ACTIVE;
@@ -1528,7 +1545,6 @@ #ifdef XFS_LOG_TRACE
 		}
 #endif
 		next_iclog = iclog->ic_next;
-		kmem_free(iclog->hic_data, log->l_iclog_size);
 		kmem_free(iclog, sizeof(xlog_in_core_t));
 		iclog = next_iclog;
 	}

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index a96bde6..5de1f39 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -202,6 +202,27 @@ xfs_mount_free(
 	kmem_free(mp, sizeof(xfs_mount_t));
 }
 
+/*
+ * Check size of device based on the (data/realtime) block count.
+ * Note: this check is used by the growfs code as well as mount.
+ */
+int
+xfs_sb_validate_fsb_count(
+	xfs_sb_t	*sbp,
+	__uint64_t	nblocks)
+{
+	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
+	ASSERT(sbp->sb_blocklog >= BBSHIFT);
+
+#if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
+	if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
+		return E2BIG;
+#else                  /* Limited by UINT_MAX of sectors */
+	if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
+		return E2BIG;
+#endif
+	return 0;
+}
 
 /*
  * Check the validity of the SB found.
@@ -284,18 +305,8 @@ xfs_mount_validate_sb(
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
-	ASSERT(PAGE_SHIFT >= sbp->sb_blocklog);
-	ASSERT(sbp->sb_blocklog >= BBSHIFT);
-
-#if XFS_BIG_BLKNOS     /* Limited by ULONG_MAX of page cache index */
-	if (unlikely(
-	    (sbp->sb_dblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX ||
-	    (sbp->sb_rblocks >> (PAGE_SHIFT - sbp->sb_blocklog)) > ULONG_MAX)) {
-#else                  /* Limited by UINT_MAX of sectors */
-	if (unlikely(
-	    (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX ||
-	    (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) {
-#endif
+	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
+	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
 		xfs_fs_mount_cmn_err(flags,
 			"file system too large to be mounted on this system.");
 		return XFS_ERROR(E2BIG);

diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 82304b9..871a5bf 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -624,6 +624,7 @@ extern int	xfs_sync_inodes(xfs_mount_t *
 extern xfs_agnumber_t	xfs_initialize_perag(struct bhv_vfs *, xfs_mount_t *,
 						xfs_agnumber_t);
 extern void	xfs_xlatesb(void *, struct xfs_sb *, int, __int64_t);
+extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
 
 extern struct xfs_dmops xfs_dmcore_stub;
 extern struct xfs_qmops xfs_qmcore_stub;

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b3a5f07..47082c0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1882,11 +1882,13 @@ xfs_growfs_rt(
 	    (nrblocks = in->newblocks) <= sbp->sb_rblocks ||
 	    (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
 		return XFS_ERROR(EINVAL);
+	if ((error = xfs_sb_validate_fsb_count(sbp, nrblocks)))
+		return error;
 	/*
 	 * Read in the last block of the device, make sure it exists.
 	 */
 	error = xfs_read_buf(mp, mp->m_rtdev_targp,
-			XFS_FSB_TO_BB(mp, in->newblocks - 1),
+			XFS_FSB_TO_BB(mp, nrblocks - 1),
 			XFS_FSB_TO_BB(mp, 1), 0, &bp);
 	if (error)
 		return error;

diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 65c5612..92c1425 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -1128,58 +1128,41 @@ #define XFS_PREEMPT_MASK	0x7f
 		 * in the inode list.
 		 */
 
-		if ((flags & SYNC_CLOSE)  && (vp != NULL)) {
-			/*
-			 * This is the shutdown case.  We just need to
-			 * flush and invalidate all the pages associated
-			 * with the inode.  Drop the inode lock since
-			 * we can't hold it across calls to the buffer
-			 * cache.
-			 *
-			 * We don't set the VREMAPPING bit in the vnode
-			 * here, because we don't hold the vnode lock
-			 * exclusively.  It doesn't really matter, though,
-			 * because we only come here when we're shutting
-			 * down anyway.
-			 */
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-			if (XFS_FORCED_SHUTDOWN(mp)) {
-				bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
-			} else {
-				error = bhv_vop_flushinval_pages(vp, 0, -1, FI_REMAPF);
+		/*
+		 * If we have to flush data or wait for I/O completion
+		 * we need to drop the ilock that we currently hold.
+		 * If we need to drop the lock, insert a marker if we
+		 * have not already done so.
+		 */
+		if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
+		    ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
+			if (mount_locked) {
+				IPOINTER_INSERT(ip, mp);
 			}
+			xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-			xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-		} else if ((flags & SYNC_DELWRI) && (vp != NULL)) {
-			if (VN_DIRTY(vp)) {
-				/* We need to have dropped the lock here,
-				 * so insert a marker if we have not already
-				 * done so.
-				 */
-				if (mount_locked) {
-					IPOINTER_INSERT(ip, mp);
-				}
-
-				/*
-				 * Drop the inode lock since we can't hold it
-				 * across calls to the buffer cache.
-				 */
-				xfs_iunlock(ip, XFS_ILOCK_SHARED);
+			if (flags & SYNC_CLOSE) {
+				/* Shutdown case. Flush and invalidate. */
+				if (XFS_FORCED_SHUTDOWN(mp))
+					bhv_vop_toss_pages(vp, 0, -1, FI_REMAPF);
+				else
+					error = bhv_vop_flushinval_pages(vp, 0,
+							-1, FI_REMAPF);
+			} else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
 				error = bhv_vop_flush_pages(vp, (xfs_off_t)0,
 							-1, fflag, FI_NONE);
-				xfs_ilock(ip, XFS_ILOCK_SHARED);
 			}
 
+			/*
+			 * When freezing, we need to wait to ensure all I/O
+			 * (including direct I/O) is complete to ensure no
+			 * further data modification can take place after
+			 * this point.
+			 */
+			if (flags & SYNC_IOWAIT)
+				vn_iowait(vp);
+
+			xfs_ilock(ip, XFS_ILOCK_SHARED);
 		}
 
-		/*
-		 * When freezing, we need to wait ensure all I/O (including direct
-		 * I/O) is complete to ensure no further data modification can take
-		 * place after this point
-		 */
-		if (flags & SYNC_IOWAIT)
-			vn_iowait(vp);
-
 		if (flags & SYNC_BDFLUSH) {
 			if ((flags & SYNC_ATTR) &&