ext4: journal credit fix for the delayed allocation's writepages() function From: Mingming Cao Previous delalloc writepages implementation started a new transaction outside of a loop which called get_block() to do the block allocation. Since we didn't know exactly how many blocks would need to be allocated, the estimated journal credits required was very conservative and caused many issues. With the reworked delayed allocation, a new transaction is created for each get_block(), thus we don't need to guess how many credits for the multiple chunk of allocation. We start every transaction with enough credits for inserting a single exent. When estimate the credits for indirect blocks to allocate a chunk of blocks, we need to know the number of data blocks to allocate. We use the total number of reserved delalloc datablocks; if that is too big, for non-extent files, we need to limit the number of blocks to EXT4_MAX_TRANS_BLOCKS. Code cleanup from Aneesh. Signed-off-by: Mingming Cao Reviewed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 8 ++--- fs/ext4/inode.c | 74 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 58 insertions(+), 24 deletions(-) Index: linux-2.6.27-rc3/fs/ext4/inode.c =================================================================== --- linux-2.6.27-rc3.orig/fs/ext4/inode.c 2008-08-19 16:03:30.000000000 -0700 +++ linux-2.6.27-rc3/fs/ext4/inode.c 2008-08-19 16:16:33.000000000 -0700 @@ -1848,29 +1848,53 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *bh) { - struct buffer_head *lbh = &mpd->lbh; sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } /* * First block in the extent */ if (lbh->b_size == 0) { lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; + lbh->b_size = b_size; lbh->b_state = bh->b_state & BH_FLAGS; return; } + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; + lbh->b_size += b_size; return; } +flush_it: /* * We couldn't merge the block to our extent, so we * need to flush current extent and start new one @@ -2231,17 +2255,29 @@ } /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS 25 + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} static int ext4_da_writepages(struct address_space *mapping, struct writeback_control *wbc) @@ -2283,7 +2319,7 @@ * by delalloc */ BUG_ON(ext4_should_journal_data(inode)); - needed_blocks = EXT4_DATA_TRANS_BLOCKS(inode->i_sb); + needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); @@ -4461,11 +4497,9 @@ * the modification of a single pages into a single transaction, * which may include multile chunk of block allocations. * - * This could be called via ext4_write_begin() or later - * ext4_da_writepages() in delalyed allocation case. + * This could be called via ext4_write_begin() * - * In both case it's possible that we could allocating multiple - * chunks of blocks. We need to consider the worse case, when + * We need to consider the worse case, when * one new block per extent. */ int ext4_writepage_trans_blocks(struct inode *inode) Index: linux-2.6.27-rc3/fs/ext4/extents.c =================================================================== --- linux-2.6.27-rc3.orig/fs/ext4/extents.c 2008-08-19 16:15:15.000000000 -0700 +++ linux-2.6.27-rc3/fs/ext4/extents.c 2008-08-19 16:16:33.000000000 -0700 @@ -1753,7 +1753,7 @@ * When pass the actual path, the caller should calculate credits * under i_data_sem. */ -int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { if (path) { @@ -1772,12 +1772,12 @@ * and other metadat blocks still need to be * accounted. */ - /* 1 one bitmap, 1 block group descriptor */ + /* 1 bitmap, 1 block group descriptor */ ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); } } - return ext4_chunk_trans_blocks(inode, num); + return ext4_chunk_trans_blocks(inode, nrblocks); } /* @@ -1791,7 +1791,7 @@ * If the nrblocks are discontiguous, they could cause * the whole tree split more than once, but this is really rare. */ -int ext4_ext_index_trans_blocks(struct inode *inode, int num, int chunk) +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) { int index; int depth = ext_depth(inode);