ext4: Add basic delayed allocation support From: Alex Tomas Two special ->get_block() methods are introduced: * ext4_da_get_block_prep() to be used with ->write_begin(), defers allocation till flush * ext4_da_get_block_write() to be used with mpage_da_writepages(), allocate blocks and correct on-disk size Current implementation works with data=writeback only, you should mount filesystem with delalloc,data=writeback options. TODO: * reservation * data=ordered * quota * bmap Signed-off-by: Alex Tomas Signed-off-by: Mingming Cao Signed-off-by: Dave Kleikamp Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 fs/ext4/inode.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/super.c | 6 + 3 files changed, 182 insertions(+), 1 deletion(-) Index: linux-2.6.26-rc4/fs/ext4/inode.c =================================================================== --- linux-2.6.26-rc4.orig/fs/ext4/inode.c 2008-05-29 10:52:37.000000000 -0700 +++ linux-2.6.26-rc4/fs/ext4/inode.c 2008-05-29 10:52:39.000000000 -0700 @@ -39,6 +39,8 @@ #include "xattr.h" #include "acl.h" +static void ext4_invalidatepage(struct page *page, unsigned long offset); + /* * Test whether an inode is a fast symlink. */ @@ -1435,6 +1437,162 @@ static int ext4_journalled_write_end(str } /* + * this is a special callback for ->prepare_write() only + * it's intention is to return mapped block or reserve space + */ +static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + + BUG_ON(create == 0); + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + + /* first, we need to know whether the block is allocated already + * XXX: when the filesystem has a lot of free blocks, we could + * reserve even allocated blocks to save this lookup */ + ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0); + if (ret >= 0) { + if (buffer_mapped(bh_result)) { + bh_result->b_size = (ret << inode->i_blkbits); + } else { + /* the block isn't allocated yet, let's reserve space */ + /* XXX: call reservation here */ + /* + * XXX: __block_prepare_write() unmaps passed block, + * is it OK? + */ + map_bh(bh_result, inode->i_sb, 0); + set_buffer_new(bh_result); + set_buffer_delay(bh_result); + } + ret = 0; + } + + return ret; +} + +static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret, needed_blocks = ext4_writepage_trans_blocks(inode); + unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + loff_t disksize = EXT4_I(inode)->i_disksize; + handle_t *handle = NULL; + + if (create) { + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + } + + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0); + if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + + /* + * Update on-disk size along with block allocation + * we don't use 'extend_disksize' as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) iblock + ret) << inode->i_blkbits; + if (disksize > i_size_read(inode)) + disksize = i_size_read(inode); + if (disksize > EXT4_I(inode)->i_disksize) { + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; + up_write(&EXT4_I(inode)->i_data_sem); + + if (EXT4_I(inode)->i_disksize == disksize) { + if (handle == NULL) + handle = ext4_journal_start(inode, 1); + if (!IS_ERR(handle)) + ext4_mark_inode_dirty(handle, inode); + } + } + + ret = 0; + } + +out: + if (handle && !IS_ERR(handle)) + ext4_journal_stop(handle); + + return ret; +} + +static int ext4_da_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + return mpage_da_writepages(mapping, wbc, ext4_da_get_block_write); +} + +static int ext4_da_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct page *page; + pgoff_t index; + unsigned from, to; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext4_da_get_block_prep); + return ret; +} + +static void ext4_da_invalidatepage(struct page *page, unsigned long offset) +{ + struct buffer_head *head, *bh; + unsigned int curr_off = 0; + + /* + * Drop reserved blocks + */ + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + goto out; + + head = page_buffers(page); + bh = head; + do { + unsigned int next_off = curr_off + bh->b_size; + + /* + * is this block fully invalidated? + */ + if (offset <= curr_off && buffer_delay(bh)) { + clear_buffer_delay(bh); + /* XXX: add real stuff here */ + } + curr_off = next_off; + bh = bh->b_this_page; + } while (bh != head); + +out: + ext4_invalidatepage(page, offset); + + return; +} + + +/* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * @@ -1893,10 +2051,28 @@ static const struct address_space_operat .releasepage = ext4_releasepage, }; +static const struct address_space_operations ext4_da_aops = { + .readpage = ext4_readpage, + .readpages = ext4_readpages, + .writepage = ext4_writeback_writepage, + .writepages = ext4_da_writepages, + .sync_page = block_sync_page, + .write_begin = ext4_da_write_begin, + .write_end = generic_write_end, + .bmap = ext4_bmap, + .invalidatepage = ext4_da_invalidatepage, + .releasepage = ext4_releasepage, + .direct_IO = ext4_direct_IO, + .migratepage = buffer_migrate_page, +}; + void ext4_set_aops(struct inode *inode) { if (ext4_should_order_data(inode)) inode->i_mapping->a_ops = &ext4_ordered_aops; + else if (ext4_should_writeback_data(inode) && + test_opt(inode->i_sb, DELALLOC)) + inode->i_mapping->a_ops = &ext4_da_aops; else if (ext4_should_writeback_data(inode)) inode->i_mapping->a_ops = &ext4_writeback_aops; else Index: linux-2.6.26-rc4/fs/ext4/super.c =================================================================== --- linux-2.6.26-rc4.orig/fs/ext4/super.c 2008-05-29 10:52:38.000000000 -0700 +++ linux-2.6.26-rc4/fs/ext4/super.c 2008-05-29 10:52:39.000000000 -0700 @@ -887,7 +887,7 @@ enum { Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, + Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, }; static match_table_t tokens = { @@ -946,6 +946,7 @@ static match_table_t tokens = { {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, + {Opt_delalloc, "delalloc"}, {Opt_err, NULL}, }; @@ -1324,6 +1325,9 @@ set_qf_format: return 0; sbi->s_stripe = option; break; + case Opt_delalloc: + set_opt(sbi->s_mount_opt, DELALLOC); + break; default: printk (KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " Index: linux-2.6.26-rc4/fs/ext4/ext4.h =================================================================== --- linux-2.6.26-rc4.orig/fs/ext4/ext4.h 2008-05-29 10:52:38.000000000 -0700 +++ linux-2.6.26-rc4/fs/ext4/ext4.h 2008-05-29 10:52:39.000000000 -0700 @@ -536,6 +536,7 @@ do { \ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt