ext4: Add percpu dirty block accounting.

From: Aneesh Kumar K.V

This patch adds dirty block accounting using percpu_counters. Delayed
allocation block reservation is now done by updating the dirty block
counter. A later patch switches to non-delalloc mode if the filesystem's
free blocks are greater than 150% of the total filesystem dirty blocks.

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: Mingming Cao
Signed-off-by: "Theodore Ts'o"
---
 fs/ext4/balloc.c  |   59 +++++++++++++++++++++++++++++++++---------------------
 fs/ext4/ext4_sb.h |    1 
 fs/ext4/inode.c   |   22 ++++++++++----------
 fs/ext4/mballoc.c |   17 ++------------
 fs/ext4/super.c   |    8 ++++++-
 5 files changed, 59 insertions(+), 48 deletions(-)

Index: linux-2.6.27-rc3/fs/ext4/balloc.c
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/balloc.c	2008-08-27 13:54:26.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/balloc.c	2008-08-27 13:59:34.000000000 -0700
@@ -1603,26 +1603,38 @@
 int ext4_claim_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks)
 {
-	s64 free_blocks;
+	s64 free_blocks, dirty_blocks;
 	ext4_fsblk_t root_blocks = 0;
 	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read(fbc);
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
-		free_blocks = percpu_counter_sum(&sbi->s_freeblocks_counter);
-
-	if (free_blocks < (root_blocks + nblocks))
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum(fbc);
+		dirty_blocks = percpu_counter_sum(dbc);
+		if (dirty_blocks < 0) {
+			printk(KERN_CRIT "Dirty block accounting "
+					"went wrong %lld\n",
+					dirty_blocks);
+		}
+	}
+	/* Check whether we have space after
+	 * accounting for current dirty blocks
+	 */
+	if (free_blocks < ((s64)(root_blocks + nblocks) + dirty_blocks))
 		/* we don't have free space */
 		return -ENOSPC;
 
-	/* reduce fs free blocks counter */
-	percpu_counter_sub(fbc, nblocks);
+	/* Add the blocks to nblocks */
+	percpu_counter_add(dbc, nblocks);
 	return 0;
 }
 
@@ -1638,23 +1650,28 @@
 ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 						ext4_fsblk_t nblocks)
 {
-	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t free_blocks, dirty_blocks;
 	ext4_fsblk_t root_blocks = 0;
+	struct percpu_counter *fbc = &sbi->s_freeblocks_counter;
+	struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter;
 
-	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
+	free_blocks = percpu_counter_read_positive(fbc);
+	dirty_blocks = percpu_counter_read_positive(dbc);
 
 	if (!capable(CAP_SYS_RESOURCE) &&
 		sbi->s_resuid != current->fsuid &&
 		(sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
 		root_blocks = ext4_r_blocks_count(sbi->s_es);
 
-	if (free_blocks - (nblocks + root_blocks) < EXT4_FREEBLOCKS_WATERMARK)
-		free_blocks = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
-
-	if (free_blocks <= root_blocks)
+	if (free_blocks - (nblocks + root_blocks + dirty_blocks) <
+						EXT4_FREEBLOCKS_WATERMARK) {
+		free_blocks = percpu_counter_sum_positive(fbc);
+		dirty_blocks = percpu_counter_sum_positive(dbc);
+	}
+	if (free_blocks <= (root_blocks + dirty_blocks))
 		/* we don't have free space */
 		return 0;
-	if (free_blocks - root_blocks < nblocks)
+	if (free_blocks - (root_blocks + dirty_blocks) < nblocks)
 		return free_blocks - root_blocks;
 	return nblocks;
 }
@@ -1941,13 +1958,11 @@
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	if (!EXT4_I(inode)->i_delalloc_reserved_flag && (*count != num)) {
-		/*
-		 * we allocated less blocks than we
-		 * claimed. Add the difference back.
-		 */
-		percpu_counter_add(&sbi->s_freeblocks_counter, *count - num);
-	}
+	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	/*
+	 * Now reduce the dirty block count also. Should not go negative
+	 */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, num);
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
 		spin_lock(sb_bgl_lock(sbi, flex_group));
Index: linux-2.6.27-rc3/fs/ext4/ext4_sb.h
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/ext4_sb.h	2008-08-12 18:55:39.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/ext4_sb.h	2008-08-27 13:59:34.000000000 -0700
@@ -59,6 +59,7 @@
 	struct percpu_counter s_freeblocks_counter;
 	struct percpu_counter s_freeinodes_counter;
 	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
 	struct blockgroup_lock s_blockgroup_lock;
 
 	/* root of the per fs reservation window tree */
Index: linux-2.6.27-rc3/fs/ext4/inode.c
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/inode.c	2008-08-27 13:54:26.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/inode.c	2008-08-27 13:59:34.000000000 -0700
@@ -1030,19 +1030,20 @@
 	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
 	mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb;
 
-	/* Account for allocated meta_blocks */
-	mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
-
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free);
+	if (mdb_free) {
+		/* Account for allocated meta_blocks */
+		mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks;
+
+		/* update fs dirty blocks counter */
+		percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
+		EXT4_I(inode)->i_allocated_meta_blocks = 0;
+		EXT4_I(inode)->i_reserved_meta_blocks = mdb;
+	}
 
 	/* update per-inode reservations */
 	BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks);
 	EXT4_I(inode)->i_reserved_data_blocks -= used;
 
-	BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks);
-	EXT4_I(inode)->i_reserved_meta_blocks = mdb;
-	EXT4_I(inode)->i_allocated_meta_blocks = 0;
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 }
 
@@ -1588,8 +1589,8 @@
 
 	release = to_free + mdb_free;
 
-	/* update fs free blocks counter for truncate case */
-	percpu_counter_add(&sbi->s_freeblocks_counter, release);
+	/* update fs dirty blocks counter for truncate case */
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, release);
 
 	/* update per-inode reservations */
 	BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks);
@@ -2471,7 +2472,6 @@
 	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
-
 retry:
 	/*
 	 * With delayed allocation, we don't log the i_disksize update
Index: linux-2.6.27-rc3/fs/ext4/mballoc.c
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/mballoc.c	2008-08-27 13:54:26.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/mballoc.c	2008-08-27 13:59:34.000000000 -0700
@@ -2968,22 +2968,11 @@
 	le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
 	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-
+	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
-	 * free blocks account has already be reduced/reserved
-	 * at write_begin() time for delayed allocation
-	 * do not double accounting
+	 * Now reduce the dirty block count also. Should not go negative
 	 */
-	if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED) &&
-			ac->ac_o_ex.fe_len != ac->ac_b_ex.fe_len) {
-		/*
-		 * we allocated less blocks than we calimed
-		 * Add the difference back
-		 */
-		percpu_counter_add(&sbi->s_freeblocks_counter,
-				ac->ac_o_ex.fe_len - ac->ac_b_ex.fe_len);
-	}
-
+	percpu_counter_sub(&sbi->s_dirtyblocks_counter, ac->ac_b_ex.fe_len);
 	if (sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group = ext4_flex_group(sbi,
							  ac->ac_b_ex.fe_group);
Index: linux-2.6.27-rc3/fs/ext4/super.c
===================================================================
--- linux-2.6.27-rc3.orig/fs/ext4/super.c	2008-08-27 09:31:21.000000000 -0700
+++ linux-2.6.27-rc3/fs/ext4/super.c	2008-08-27 13:59:34.000000000 -0700
@@ -520,6 +520,7 @@
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -2257,6 +2258,9 @@
 		err = percpu_counter_init(&sbi->s_dirs_counter,
 				ext4_count_dirs(sb));
 	}
+	if (!err) {
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	}
 	if (err) {
 		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
 		goto failed_mount3;
@@ -2489,6 +2493,7 @@
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3162,7 +3167,8 @@
 	buf->f_type = EXT4_SUPER_MAGIC;
 	buf->f_bsize = sb->s_blocksize;
 	buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
-	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+	buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
+		percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
 	ext4_free_blocks_count_set(es, buf->f_bfree);
 	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
 	if (buf->f_bfree < ext4_r_blocks_count(es))
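
The net effect of the patch is a simple invariant: write_begin-time reservations
only ever touch s_dirtyblocks_counter, the allocators (ext4_new_blocks and
ext4_mb_mark_diskspace_used) subtract the blocks actually allocated from both
s_freeblocks_counter and s_dirtyblocks_counter, ext4_da_release_space undoes an
unused reservation against s_dirtyblocks_counter alone, and statfs reports free
minus dirty. The following is a minimal userspace sketch of that invariant using
plain 64-bit counters; the percpu_counter machinery, the root-block reservation
and the EXT4_FREEBLOCKS_WATERMARK slow path are intentionally omitted, and the
function names (claim_blocks, allocate_blocks, release_unused) are invented for
illustration, not kernel APIs.

/*
 * Userspace model of the dirty-block reservation protocol (illustrative
 * only; not kernel code).  free_blocks stands in for s_freeblocks_counter
 * and dirty_blocks for s_dirtyblocks_counter.
 */
#include <stdio.h>
#include <stdint.h>

static int64_t free_blocks  = 1000;	/* models s_freeblocks_counter  */
static int64_t dirty_blocks = 0;	/* models s_dirtyblocks_counter */

/* write_begin: reserve by bumping the dirty counter, never free_blocks */
static int claim_blocks(int64_t nblocks)
{
	if (free_blocks < dirty_blocks + nblocks)
		return -1;			/* would be -ENOSPC */
	dirty_blocks += nblocks;
	return 0;
}

/* writeback: blocks become real, so both counters drop by what was used */
static void allocate_blocks(int64_t used)
{
	free_blocks  -= used;
	dirty_blocks -= used;
}

/* truncate of an unwritten reservation: only the dirty counter is undone */
static void release_unused(int64_t nblocks)
{
	dirty_blocks -= nblocks;
}

int main(void)
{
	if (claim_blocks(100))		/* delayed allocation reserves 100 */
		return 1;
	allocate_blocks(60);		/* 60 blocks actually written back */
	release_unused(40);		/* the rest released at truncate   */
	printf("free=%lld dirty=%lld\n",
	       (long long)free_blocks, (long long)dirty_blocks);
	/* free=940 dirty=0: statfs reports free - dirty, as in the patch */
	return 0;
}

The reason the kernel code reads the counters with percpu_counter_read_positive()
first and only falls back to percpu_counter_sum() is visible in
ext4_claim_free_blocks() above: the approximate per-cpu read is cheap, and the
exact (and expensive) sum is taken only when the filesystem is within
EXT4_FREEBLOCKS_WATERMARK of running out of space.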