ext4: Add basic delayed allocation support From: Alex Tomas Two special ->get_block() methods are introduced: * ext4_da_get_block_prep() to be used with ->prepare_write(), defers allocation till flush * ext4_da_get_block_write() to be used with mpage_da_writepages(), allocate blocks and correct on-disk size Current implementation works with data=writeback only, you should mount filesystem with delalloc,data=writeback options. TODO: * reservation * data=ordered * quota * bmap Signed-off-by: Alex Tomas Signed-off-by: Dave Kleikamp --- fs/ext4/inode.c | 160 +++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/super.c | 6 +- include/linux/ext4_fs.h | 1 3 files changed, 166 insertions(+), 1 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2947800..6bb788d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -39,6 +39,8 @@ #include "xattr.h" #include "acl.h" +static void ext4_invalidatepage(struct page *page, unsigned long offset); + /* * Test whether an inode is a fast symlink. */ @@ -1352,6 +1354,146 @@ static int ext4_journalled_write_end(struct file *file, } /* + * this is a special callback for ->prepare_write() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* first, we need to know whether the block is allocated already + * XXX: when the filesystem has a lot of free blocks, we could + * reserve even allocated blocks to save this lookup */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); + if (ret >= 0) { + if (buffer_mapped(bh_result)) { + bh_result->b_size = (ret << inode->i_blkbits); + } else { + /* the block isn't allocated yet, let's reserve space */ + /* XXX: call reservation here */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } + ret = 0; + } + + return ret; +} + + +static int ext4_da_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + return block_prepare_write(page, from, to, ext4_da_get_block_prep); +} + +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + if (create) { + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + } + + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + if (handle == NULL) + handle = ext4_journal_start(inode, 1); + if (!IS_ERR(handle)) + ext4_mark_inode_dirty(handle, inode); + } + } + + ret = 0; + } + +out: + if (handle && !IS_ERR(handle)) + ext4_journal_stop(handle); + + return ret; +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off && buffer_delay(bh)) { + clear_buffer_delay(bh); + /* XXX: add real stuff here */ + } + curr_off = next_off; + bh = bh->b_this_page; + } while (bh != head); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + +/* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * @@ -1802,10 +1944,28 @@ static const struct address_space_operations ext4_journalled_aops = { .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writeback_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .prepare_write = ext4_da_prepare_write, + .commit_write = generic_commit_write, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 092775f..6822f58 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -886,7 +886,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, }; static match_table_t tokens = { @@ -944,6 +944,7 @@ static match_table_t tokens = { {Opt_mballoc, "mballoc"}, {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, + {Opt_delalloc, "delalloc"}, {Opt_err, NULL}, {Opt_resize, "resize"}, }; @@ -1306,6 +1307,9 @@ clear_qf_name: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h index 1852313..adc16a9 100644 --- a/include/linux/ext4_fs.h +++ b/include/linux/ext4_fs.h @@ -521,6 +521,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt