ext4: online defrag-- Move victim files for the target file (-f mode) From: Akira Fujita Move victim files to make sufficient space and reallocate contiguous blocks for the target file. Signed-off-by: Akira Fujita Signed-off-by: Takashi Sato --- fs/ext4/balloc.c | 10 - fs/ext4/defrag.c | 460 +++++++++++++++++++++++++++++++++++++++++++++---- fs/ext4/ext4.h | 29 ++- fs/ext4/ext4_extents.h | 5 fs/ext4/extents.c | 53 ++++- fs/ext4/ioctl.c | 5 fs/ext4/mballoc.c | 5 fs/ext4/mballoc.h | 1 8 files changed, 521 insertions(+), 47 deletions(-) Index: linux-2.6.26-rc9/fs/ext4/balloc.c =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/balloc.c 2008-07-11 16:05:20.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/balloc.c 2008-07-11 16:05:20.000000000 -0700 @@ -428,7 +428,7 @@ restart: * If the goal block is within the reservation window, return 1; * otherwise, return 0; */ -static int +int goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, ext4_group_t group, struct super_block *sb) { @@ -533,7 +533,7 @@ void ext4_rsv_window_add(struct super_bl * from the filesystem reservation window rb tree. Must be called with * rsv_lock hold. */ -static void rsv_window_remove(struct super_block *sb, +void rsv_window_remove(struct super_block *sb, struct ext4_reserve_window_node *rsv) { rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; @@ -548,7 +548,7 @@ static void rsv_window_remove(struct sup * * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED. 
*/ -static inline int rsv_is_empty(struct ext4_reserve_window *rsv) +inline int rsv_is_empty(struct ext4_reserve_window *rsv) { /* a valid reservation end block could not be 0 */ return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED; @@ -1284,7 +1284,7 @@ static int find_next_reservable_window( * @bitmap_bh: the block group block bitmap * */ -static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, +int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, ext4_grpblk_t grp_goal, struct super_block *sb, ext4_group_t group, struct buffer_head *bitmap_bh) { @@ -1428,7 +1428,7 @@ retry: * expand the reservation window size if necessary on a best-effort * basis before ext4_new_blocks() tries to allocate blocks, */ -static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, +void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, struct super_block *sb, int size) { struct ext4_reserve_window_node *next_rsv; Index: linux-2.6.26-rc9/fs/ext4/defrag.c =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/defrag.c 2008-07-11 16:05:20.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/defrag.c 2008-07-11 16:05:20.000000000 -0700 @@ -218,6 +218,267 @@ out: } /** + * ext4_defrag_reserve_blocks - Reserve blocks for defrag + * + * @org_inode: original inode + * @goal: the goal offset of the block reservation + * @len: blocks count we need to reserve + * + * This function returns 0 if succeed, otherwise returns error value. 
+ */ + +static int +ext4_defrag_reserve_blocks(struct inode *org_inode, ext4_fsblk_t goal, int len) +{ + struct super_block *sb = NULL; + handle_t *handle; + struct buffer_head *bitmap_bh = NULL; + struct ext4_block_alloc_info *block_i; + struct ext4_reserve_window_node *my_rsv = NULL; + unsigned short windowsz = 0; + ext4_group_t group_no; + ext4_grpblk_t grp_target_blk; + int err = 0; + + down_write(&EXT4_I(org_inode)->i_data_sem); + + handle = ext4_journal_start(org_inode, EXT4_RESERVE_TRANS_BLOCKS); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + handle = NULL; + goto out; + } + + if (S_ISREG(org_inode->i_mode) && + !EXT4_I(org_inode)->i_block_alloc_info) { + ext4_init_block_alloc_info(org_inode); + } else if (!S_ISREG(org_inode->i_mode)) { + printk(KERN_ERR "ext4 defrag: Invalid file type\n"); + err = -EINVAL; + goto out; + } + + sb = org_inode->i_sb; + if (!sb) { + printk(KERN_ERR "ext4 defrag: Non-existent device\n"); + err = -ENXIO; + goto out; + } + ext4_get_group_no_and_offset(sb, goal, &group_no, + &grp_target_blk); + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + windowsz = block_i->rsv_window_node.rsv_goal_size; + /* Goal size should be set */ + BUG_ON(!windowsz); + + my_rsv = &block_i->rsv_window_node; + + bitmap_bh = ext4_read_block_bitmap(sb, group_no); + if (!bitmap_bh) { + err = -ENOSPC; + goto out; + } + + BUFFER_TRACE(bitmap_bh, "get undo access for new block"); + err = ext4_journal_get_undo_access(handle, bitmap_bh); + if (err) + goto out; + + err = alloc_new_reservation(my_rsv, grp_target_blk, sb, + group_no, bitmap_bh); + if (err < 0) { + printk(KERN_ERR "ext4 defrag: Block reservation failed." 
+ "offset [%d], bg[%lu]\n", grp_target_blk, group_no); + ext4_discard_reservation(org_inode); + goto out; + } else if (len > EXT4_DEFAULT_RESERVE_BLOCKS) { + try_to_extend_reservation(my_rsv, sb, + len - EXT4_DEFAULT_RESERVE_BLOCKS); + } + +out: + up_write(&EXT4_I(org_inode)->i_data_sem); + ext4_journal_release_buffer(handle, bitmap_bh); + brelse(bitmap_bh); + + if (handle) + ext4_journal_stop(handle); + + return err; +} + +/** + * ext4_defrag_block_within_rsv - Is target extent reserved ? + * + * @org_inode: original inode + * @ex_start: physical block offset of the extent which already moved + * @ex_len: block length of the extent + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_block_within_rsv(struct inode *org_inode, ext4_fsblk_t ex_start, + int ex_len) +{ + struct super_block *sb = org_inode->i_sb; + struct ext4_block_alloc_info *block_i; + ext4_group_t group_no; + ext4_grpblk_t grp_blk; + struct ext4_reserve_window_node *rsv; + + block_i = EXT4_I(org_inode)->i_block_alloc_info; + /* Block reservation should be enabled */ + BUG_ON(!block_i); + + /* Goal size should be set */ + BUG_ON(!block_i->rsv_window_node.rsv_goal_size); + + rsv = &block_i->rsv_window_node; + if (rsv_is_empty(&rsv->rsv_window)) { + printk(KERN_ERR "ext4 defrag: Reservation window is empty\n"); + return -ENOSPC; + } + + ext4_get_group_no_and_offset(sb, ex_start, &group_no, &grp_blk); + + if (!goal_in_my_reservation(&rsv->rsv_window, grp_blk, group_no, sb) + || !goal_in_my_reservation(&rsv->rsv_window, + grp_blk + ex_len - 1, group_no, sb)){ + /* Goal blocks are not in the reservation window */ + printk(KERN_ERR "ext4 defrag: %d or %d in bg %lu is " + "not in rsv_window\n", grp_blk, + grp_blk + ex_len - 1, group_no); + return -ENOSPC; + } + return 0; +} + +/* + * ext4_defrag_reserve_fblocks - + * Reserve free blocks with ext4_defrag_reserve_blocks + * + * @org_inode: original inode to get a block group number + * @ext_info: freeblocks 
distribution which stored extent-like style + * @ext_info->ext[]: an array of struct ext4_extents_data + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_reserve_fblocks(struct inode *org_inode, + struct ext4_extents_info *ext_info) +{ + ext4_fsblk_t ex_start = 0; + int i, len, ret; + + for (i = 0; i < ext_info->entries; i++) { + ex_start = ext_info->ext[i].start; + len = ext_info->ext[i].len; + + ret = ext4_defrag_reserve_blocks(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Block reservation failed. offset [%llu], " + "length [%d]\n", ex_start, len); + goto err; + } + + /* Confirm that blocks are in the reservation window */ + ret = ext4_defrag_block_within_rsv(org_inode, ex_start, len); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Reservation window is not set. " + "offset [%llu], length [%d]\n", ex_start, len); + goto err; + } + } + return ret; + +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + return ret; +} + +/** + * ext4_defrag_move_victim - Create free space for defrag + * + * @target_filp: target file + * @ext_info: target extents array to move + * + * This function returns 0 if succeed, otherwise + * returns error value. 
+ */ +static int +ext4_defrag_move_victim(struct file *target_filp, + struct ext4_extents_info *ext_info) +{ + struct inode *org_inode = target_filp->f_dentry->d_inode; + struct super_block *sb = org_inode->i_sb; + struct file victim_file; + struct dentry victim_dent; + struct inode *victim_inode; + struct ext4_extent_data ext; + ext4_fsblk_t goal = ext_info->goal; + ext4_group_t group; + ext4_grpblk_t grp_off; + int ret, i; + + /* Setup dummy extent data */ + ext.len = 0; + + /* Get the inode of the victim file */ + victim_inode = ext4_iget(sb, ext_info->ino); + if (IS_ERR(victim_inode)) + return PTR_ERR(victim_inode); + + /* Setup file for the victim file */ + victim_dent.d_inode = victim_inode; + victim_file.f_dentry = &victim_dent; + victim_file.f_mapping = victim_inode->i_mapping; + + /* Set the goal appropriate offset */ + if (goal == -1) { + ext4_get_group_no_and_offset(victim_inode->i_sb, + ext_info->ext[0].start, &group, &grp_off); + goal = ext4_group_first_block_no(sb, group + 1); + } + + for (i = 0; i < ext_info->entries; i++) { + /* Move original blocks to another block group */ + ret = ext4_defrag(&victim_file, ext_info->ext[i].block, + ext_info->ext[i].len, goal, DEFRAG_FORCE_VICTIM, &ext); + if (ret < 0) { + printk(KERN_ERR "ext4 defrag: " + "Moving victim file failed. 
ino [%llu]\n", + ext_info->ino); + goto err; + } + + /* Sync journal blocks before reservation */ + ret = ext4_force_commit(sb); + if (ret) { + printk(KERN_ERR "ext4 defrag: " + "ext4_force_commit failed(%d)\n", ret); + goto err; + } + } + + iput(victim_inode); + return 0; +err: + down_write(&EXT4_I(org_inode)->i_data_sem); + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); + iput(victim_inode); + return ret; +} + +/** * ext4_defrag_fblocks_distribution - Search free blocks distribution * * @org_inode: original inode @@ -383,6 +644,29 @@ int ext4_defrag_ioctl(struct inode *inod &ext_info, sizeof(ext_info))) return -EFAULT; } + } else if (cmd == EXT4_IOC_RESERVE_BLOCK) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_reserve_fblocks(inode, &ext_info); + } else if (cmd == EXT4_IOC_MOVE_VICTIM) { + struct ext4_extents_info ext_info; + + if (copy_from_user(&ext_info, + (struct ext4_extents_info __user *)arg, + sizeof(ext_info))) + return -EFAULT; + + err = ext4_defrag_move_victim(filp, &ext_info); + + } else if (cmd == EXT4_IOC_BLOCK_RELEASE) { + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_reservation(inode); + up_write(&EXT4_I(inode)->i_data_sem); } else if (cmd == EXT4_IOC_DEFRAG) { struct ext4_ext_defrag_data defrag; struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; @@ -409,7 +693,8 @@ int ext4_defrag_ioctl(struct inode *inod } err = ext4_defrag(filp, defrag.start_offset, - defrag.defrag_size, defrag.goal); + defrag.defrag_size, defrag.goal, defrag.flag, + &defrag.ext); } return err; @@ -425,6 +710,7 @@ int ext4_defrag_ioctl(struct inode *inod * @start_ext: first new extent to be merged * @new_ext: middle of new extent to be merged * @end_ext: last new extent to be merged + * @phase: phase of the force defrag mode * * This function returns 0 if succeed, otherwise returns error value. 
*/ @@ -432,14 +718,20 @@ static int ext4_defrag_merge_across_blocks(handle_t *handle, struct inode *org_inode, struct ext4_extent *o_start, struct ext4_extent *o_end, struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext) + struct ext4_extent *end_ext, int phase) { struct ext4_ext_path *org_path = NULL; ext4_lblk_t eblock = 0; int new_flag = 0; int end_flag = 0; + int defrag_flag; int err; + if (phase == DEFRAG_FORCE_VICTIM) + defrag_flag = 1; + else + defrag_flag = 0; + if (le16_to_cpu(start_ext->ee_len) && le16_to_cpu(new_ext->ee_len) && le16_to_cpu(end_ext->ee_len)) { @@ -516,8 +808,8 @@ ext4_defrag_merge_across_blocks(handle_t org_path = NULL; goto out; } - err = ext4_ext_insert_extent(handle, org_inode, - org_path, new_ext); + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, new_ext, defrag_flag); if (err) goto out; } @@ -530,8 +822,8 @@ ext4_defrag_merge_across_blocks(handle_t org_path = NULL; goto out; } - err = ext4_ext_insert_extent(handle, org_inode, - org_path, end_ext); + err = ext4_ext_insert_extent_defrag(handle, org_inode, + org_path, end_ext, defrag_flag); if (err) goto out; } @@ -609,6 +901,7 @@ ext4_defrag_merge_inside_block(struct ex * @new_ext: middle of new extent to be merged * @end_ext: last new extent to be merged * @replaced: the number of blocks which will be replaced with new_ext + * @phase: phase of the force defrag mode * * This function returns 0 if succeed, otherwise returns error value. 
*/ @@ -617,7 +910,7 @@ ext4_defrag_merge_extents(handle_t *hand struct ext4_ext_path *org_path, struct ext4_extent *o_start, struct ext4_extent *o_end, struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext, ext4_fsblk_t replaced) + struct ext4_extent *end_ext, ext4_fsblk_t replaced, int phase) { struct ext4_extent_header *eh; unsigned need_slots, slots_range; @@ -655,7 +948,7 @@ ext4_defrag_merge_extents(handle_t *hand ret = ext4_defrag_merge_across_blocks(handle, org_inode, o_start, o_end, start_ext, new_ext, - end_ext); + end_ext, phase); if (ret < 0) return ret; } else { @@ -688,13 +981,14 @@ ext4_defrag_merge_extents(handle_t *hand * @org_path: path indicates first extent to be defraged * @dext: destination extent * @from: start offset on the target file + * @phase: phase of the force defrag mode * * This function returns 0 if succeed, otherwise returns error value. */ static int ext4_defrag_leaf_block(handle_t *handle, struct inode *org_inode, struct ext4_ext_path *org_path, struct ext4_extent *dext, - ext4_lblk_t *from) + ext4_lblk_t *from, int phase) { struct ext4_extent *oext, *o_start = NULL, *o_end = NULL, *prev_ext; struct ext4_extent new_ext, start_ext, end_ext; @@ -795,7 +1089,7 @@ ext4_defrag_leaf_block(handle_t *handle, + le16_to_cpu(oext->ee_len) - 1) { ret = ext4_defrag_merge_extents(handle, org_inode, org_path, o_start, o_end, &start_ext, - &new_ext, &end_ext, replaced); + &new_ext, &end_ext, replaced, phase); if (ret < 0) return ret; @@ -847,6 +1141,7 @@ ext4_defrag_leaf_block(handle_t *handle, * @from_page: page offset of org_inode * @dest_from_page: page offset of dest_inode * @count_page: page count to be replaced + * @phase: phase of the force defrag mode * * This function returns 0 if succeed, otherwise returns error value. * Replace extents for blocks from "from" to "from + count - 1". 
@@ -854,7 +1149,7 @@ ext4_defrag_leaf_block(handle_t *handle, static int ext4_defrag_replace_branches(handle_t *handle, struct inode *org_inode, struct inode *dest_inode, pgoff_t from_page, - pgoff_t dest_from_page, pgoff_t count_page) + pgoff_t dest_from_page, pgoff_t count_page, int phase) { struct ext4_ext_path *org_path = NULL; struct ext4_ext_path *dest_path = NULL; @@ -922,7 +1217,7 @@ ext4_defrag_replace_branches(handle_t *h /* Loop for the original extent blocks */ err = ext4_defrag_leaf_block(handle, org_inode, - org_path, dext, &from); + org_path, dext, &from, phase); if (err < 0) goto out; @@ -932,7 +1227,7 @@ ext4_defrag_replace_branches(handle_t *h * e.g. ext4_defrag_merge_extents() */ err = ext4_defrag_leaf_block(handle, dest_inode, - dest_path, swap_ext, &dest_off); + dest_path, swap_ext, &dest_off, -1); if (err < 0) goto out; @@ -1028,6 +1323,7 @@ out: * @req_blocks: contiguous blocks count we need * @iblock: target file offset * @goal: goal offset + * @phase: phase of the force defrag mode * */ static void @@ -1036,8 +1332,22 @@ ext4_defrag_fill_ar(struct inode *org_in struct ext4_ext_path *org_path, struct ext4_ext_path *dest_path, ext4_fsblk_t req_blocks, ext4_lblk_t iblock, - ext4_fsblk_t goal) + ext4_fsblk_t goal, int phase) { + ext4_group_t org_grp_no; + ext4_grpblk_t org_blk_off; + int org_depth = ext_depth(org_inode); + + if (phase == DEFRAG_FORCE_VICTIM) { + ext4_get_group_no_and_offset(org_inode->i_sb, + ext_pblock(org_path[org_depth].p_ext), + &org_grp_no, &org_blk_off); + ar->excepted_group = org_grp_no; + } else { + /* Allocate contiguous blocks to any block group */ + ar->excepted_group = -1; + } + ar->inode = dest_inode; ar->len = req_blocks; ar->logical = iblock; @@ -1101,19 +1411,70 @@ ext4_defrag_alloc_blocks(handle_t *handl } /** + * ext4_defrag_check_phase + * - Check condition of the allocated blocks (only force defrag mode) + * + * @ar: allocation request for multiple block allocation + * @dest_grp_no: block group num of the 
allocated blocks + * @goal_grp_no: block group num of the destination of block allocation + * @alloc_total: sum total of the allocated blocks + * @req_blocks: contiguous blocks count we need + * @phase: phase of the force defrag mode + * + * This function returns 0 if succeed, otherwise returns error value. + */ +static int +ext4_defrag_check_phase(struct ext4_allocation_request *ar, + ext4_group_t dest_grp_no, ext4_group_t goal_grp_no, + ext4_fsblk_t alloc_total, ext4_lblk_t req_blocks, + int phase) +{ + int err = 0; + + switch (phase) { + case DEFRAG_FORCE_TRY: + /* If there is not enough space, return -ENOSPC. */ + if (ar->len != req_blocks) + /* -ENOSPC triggers DEFRAG_FORCE_VICTIM phase. */ + err = -ENOSPC; + break; + case DEFRAG_FORCE_VICTIM: + /* We can't allocate new blocks in the same block group. */ + if (dest_grp_no == ar->excepted_group) { + printk(KERN_ERR "ext4 defrag: Failed to allocate" + " victim file to other block group\n"); + err = -ENOSPC; + } + break; + case DEFRAG_FORCE_GATHER: + /* Maybe reserved blocks are already used by other process. */ + if (dest_grp_no != goal_grp_no + || alloc_total != req_blocks) { + printk(KERN_ERR "ext4 defrag: Reserved blocks are" + " already used by other process\n"); + err = -EIO; + } + break; + } + + return err; +} + +/** * ext4_defrag_partial - Defrag a file per page * * @tmp_inode: temporary inode * @filp: pointer to file * @org_offset: page index on original file * @dest_offset: page index on temporary file + * @phase: phase of the force defrag mode * * * This function returns 0 if succeed, otherwise returns error value. 
*/ static int ext4_defrag_partial(struct inode *tmp_inode, struct file *filp, - pgoff_t org_offset, pgoff_t dest_offset) + pgoff_t org_offset, pgoff_t dest_offset, int phase) { struct inode *org_inode = filp->f_dentry->d_inode; struct address_space *mapping = org_inode->i_mapping; @@ -1180,7 +1541,7 @@ ext4_defrag_partial(struct inode *tmp_in /* Release old bh and drop refs */ try_to_release_page(page, 0); ret = ext4_defrag_replace_branches(handle, org_inode, tmp_inode, - org_offset, dest_offset, 1); + org_offset, dest_offset, 1, phase); if (ret < 0) goto out; @@ -1227,6 +1588,7 @@ out: * @tar_end: the last block number of the allocated blocks * @sum_tmp: the extents count in the allocated blocks * @goal: block offset for allocaton + * @phase: phase of the force defrag mode * * * This function returns the values as below. @@ -1237,7 +1599,7 @@ out: static int ext4_defrag_comp_ext_count(struct inode *org_inode, struct ext4_ext_path *org_path, ext4_lblk_t tar_end, - int sum_tmp, ext4_fsblk_t goal) + int sum_tmp, ext4_fsblk_t goal, int phase) { struct ext4_extent *ext = NULL; int depth = ext_depth(org_inode); @@ -1264,7 +1626,8 @@ ext4_defrag_comp_ext_count(struct inode if (sum_org == sum_tmp && !goal) { /* Not improved */ ret = 1; - } else if (sum_org < sum_tmp) { + } else if (sum_org < sum_tmp && + phase != DEFRAG_FORCE_VICTIM) { /* Fragment increased */ ret = -ENOSPC; printk(KERN_ERR "ext4 defrag: " @@ -1293,6 +1656,7 @@ ext4_defrag_comp_ext_count(struct inode * @tar_blocks: the number of blocks to allocate * @iblock: file related offset * @goal: block offset for allocaton + * @phase: phase of the force defrag mode * * * This function returns the value as below: @@ -1304,7 +1668,7 @@ static int ext4_defrag_new_extent_tree(struct inode *org_inode, struct inode *tmp_inode, struct ext4_ext_path *org_path, ext4_lblk_t tar_start, ext4_lblk_t tar_blocks, ext4_lblk_t iblock, - ext4_fsblk_t goal) + ext4_fsblk_t goal, int phase) { handle_t *handle; struct ext4_extent_header 
*eh = NULL; @@ -1314,6 +1678,8 @@ ext4_defrag_new_extent_tree(struct inode ext4_fsblk_t alloc_total = 0; ext4_fsblk_t newblock = 0; ext4_lblk_t tar_end = tar_start + tar_blocks - 1; + ext4_group_t dest_group_no, goal_group_no; + ext4_grpblk_t dest_blk_off, goal_blk_off; int sum_tmp = 0; int metadata = 1; int ret, ret2; @@ -1330,7 +1696,7 @@ ext4_defrag_new_extent_tree(struct inode /* Fill struct ext4_allocation_request with necessary info */ ext4_defrag_fill_ar(org_inode, tmp_inode, &ar, org_path, - dest_path, tar_blocks, iblock, goal); + dest_path, tar_blocks, iblock, goal, phase); handle = ext4_journal_start(tmp_inode, 0); if (IS_ERR(handle)) { @@ -1338,6 +1704,9 @@ ext4_defrag_new_extent_tree(struct inode goto out2; } + ext4_get_group_no_and_offset(tmp_inode->i_sb, goal, + &goal_group_no, &goal_blk_off); + while (alloc_total != tar_blocks) { /* Allocate blocks */ ret = ext4_defrag_alloc_blocks(handle, org_inode, tmp_inode, @@ -1345,8 +1714,20 @@ ext4_defrag_new_extent_tree(struct inode if (ret < 0) goto out; + ext4_get_group_no_and_offset(tmp_inode->i_sb, newblock, + &dest_group_no, &dest_blk_off); + alloc_total += ar.len; + /* the checks that done in force mode */ + if (phase) { + ret = ext4_defrag_check_phase(&ar, dest_group_no, + goal_group_no, alloc_total, + tar_blocks, phase); + if (ret < 0) + goto out; + } + newex.ee_block = cpu_to_le32(alloc_total - ar.len); ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); @@ -1356,13 +1737,14 @@ ext4_defrag_new_extent_tree(struct inode if (ret < 0) goto out; - ar.goal = newblock + ar.len; + if (!phase) + ar.goal = newblock + ar.len; ar.len = tar_blocks - alloc_total; sum_tmp++; } ret = ext4_defrag_comp_ext_count(org_inode, org_path, tar_end, - sum_tmp, goal); + sum_tmp, goal, phase); out: if (ret < 0 || ret == 1) { @@ -1393,14 +1775,16 @@ out2: * ext4_defrag_check - Check the enviroment whether a defrag can be done * * @org_inode: original inode + * @ext: extent to be moved (only defrag force 
mode) * @defrag_size: size of defrag in blocks * @goal: poiter to block offset for allocation + * @phase: phase of the force defrag mode * * This function returns 0 if succeed, otherwise returns error value. */ static int -ext4_defrag_check(struct inode *org_inode, ext4_lblk_t defrag_size, - ext4_fsblk_t *goal) +ext4_defrag_check(struct inode *org_inode, struct ext4_extent_data *ext, + ext4_lblk_t defrag_size, ext4_fsblk_t *goal, int *phase) { /* ext4 online defrag supports only 4KB block size */ @@ -1417,6 +1801,17 @@ ext4_defrag_check(struct inode *org_inod return -EOPNOTSUPP; } + if (ext->len) { + /* Setup for the force defrag mode */ + if (ext->len < defrag_size) { + printk(KERN_ERR "ext4 defrag: " + "Invalid length of extent\n"); + return -EINVAL; + } + *phase = DEFRAG_FORCE_GATHER; + *goal = ext->start; + } + return 0; } @@ -1495,13 +1890,16 @@ out: * @block_start: starting offset to defrag in blocks * @defrag_size: size of defrag in blocks * @goal: block offset for allocation + * @phase: phase of the force defrag mode + * @ext: extent to be moved (only defrag force mode) * * This function returns the number of blocks if succeed, otherwise * returns error value. 
*/ int ext4_defrag(struct file *filp, ext4_lblk_t block_start, - ext4_lblk_t defrag_size, ext4_fsblk_t goal) + ext4_lblk_t defrag_size, ext4_fsblk_t goal, int phase, + struct ext4_extent_data *ext) { struct inode *org_inode = filp->f_dentry->d_inode, *tmp_inode = NULL; struct ext4_ext_path *org_path = NULL, *holecheck_path = NULL; @@ -1511,7 +1909,7 @@ ext4_defrag(struct file *filp, ext4_lblk int ret, depth, seq_extents, last_extent = 0; /* Check the filesystem enviroment whether defrag can be done */ - ret = ext4_defrag_check(org_inode, defrag_size, &goal); + ret = ext4_defrag_check(org_inode, ext, defrag_size, &goal, &phase); if (ret < 0) return ret; @@ -1627,11 +2025,11 @@ ext4_defrag(struct file *filp, ext4_lblk ret = ext4_defrag_new_extent_tree(org_inode, tmp_inode, org_path, seq_start, seq_blocks, - block_start, goal); + block_start, goal, phase); if (ret < 0) { break; - } else if (ret == 1) { + } else if (ret == 1 && (!goal || (goal && !phase))) { ret = 0; seq_start = le32_to_cpu(ext_cur->ee_block); goto CLEANUP; @@ -1655,7 +2053,7 @@ ext4_defrag(struct file *filp, ext4_lblk while (page_offset <= seq_end_page) { /* Swap original branches with new branches */ ret = ext4_defrag_partial(tmp_inode, filp, - page_offset, dest_offset); + page_offset, dest_offset, phase); if (ret < 0) goto out; @@ -1708,6 +2106,10 @@ out: kfree(holecheck_path); } + if (phase == DEFRAG_FORCE_GATHER) + /* Release reserved block in force mode */ + ext4_discard_reservation(org_inode); + up_write(&EXT4_I(org_inode)->i_data_sem); mutex_unlock(&org_inode->i_mutex); Index: linux-2.6.26-rc9/fs/ext4/ext4.h =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/ext4.h 2008-07-11 16:05:20.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/ext4.h 2008-07-11 16:05:20.000000000 -0700 @@ -97,6 +97,11 @@ struct ext4_allocation_request { unsigned long len; /* flags. 
see above EXT4_MB_HINT_* */ unsigned long flags; + /* + * for ext4 online defrag: + * the block group which is excepted from allocation target + */ + long long excepted_group; }; /* @@ -306,6 +311,9 @@ struct ext4_new_group_data { #define EXT4_IOC_GROUP_INFO _IOW('f', 11, struct ext4_group_data_info) #define EXT4_IOC_FREE_BLOCKS_INFO _IOW('f', 12, struct ext4_extents_info) #define EXT4_IOC_EXTENTS_INFO _IOW('f', 13, struct ext4_extents_info) +#define EXT4_IOC_RESERVE_BLOCK _IOW('f', 14, struct ext4_extents_info) +#define EXT4_IOC_MOVE_VICTIM _IOW('f', 15, struct ext4_extents_info) +#define EXT4_IOC_BLOCK_RELEASE _IO('f', 8) /* * ioctl commands in 32 bit emulation @@ -334,8 +342,15 @@ struct ext4_new_group_data { * * DEFRAG_MAX_ENT: the maximum number of extents for exchanging between * kernel-space and user-space per an ioctl + * DEFRAG_FORCE_TRY: check whether we have free space fragmentation or not + * DEFRAG_FORCE_VICTIM: move victim extents to make sufficient space + * DEFRAG_FORCE_GATHER: move the target file into the free space made in the + * DEFRAG_FORCE_VICTIM phase */ #define DEFRAG_MAX_ENT 32 +#define DEFRAG_FORCE_TRY 1 +#define DEFRAG_FORCE_VICTIM 2 +#define DEFRAG_FORCE_GATHER 3 struct ext4_extent_data { ext4_lblk_t block; /* start logical block number */ @@ -347,6 +362,8 @@ struct ext4_ext_defrag_data { ext4_lblk_t start_offset; /* start offset to defrag in blocks */ ext4_lblk_t defrag_size; /* size of defrag in blocks */ ext4_fsblk_t goal; /* block offset for allocation */ + int flag; /* free space mode flag */ + struct ext4_extent_data ext; }; struct ext4_group_data_info { @@ -1045,8 +1062,17 @@ extern struct ext4_group_desc * ext4_get extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern void ext4_init_block_alloc_info(struct inode *); extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); +extern void try_to_extend_reservation(struct ext4_reserve_window_node *, + struct super_block 
*, int); +extern int alloc_new_reservation(struct ext4_reserve_window_node *, + ext4_grpblk_t, struct super_block *, + ext4_group_t, struct buffer_head *); extern ext4_grpblk_t bitmap_search_next_usable_block(ext4_grpblk_t, struct buffer_head *, ext4_grpblk_t); +extern int rsv_is_empty(struct ext4_reserve_window *rsv); +extern int goal_in_my_reservation(struct ext4_reserve_window *rsv, + ext4_grpblk_t grp_goal, ext4_group_t group, + struct super_block *sb); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1180,7 +1206,8 @@ extern void ext4_inode_table_set(struct extern int ext4_ext_journal_restart(handle_t *handle, int needed); /* defrag.c */ extern int ext4_defrag(struct file *filp, ext4_lblk_t block_start, - ext4_lblk_t defrag_size, ext4_fsblk_t goal); + ext4_lblk_t defrag_size, ext4_fsblk_t goal, + int flag, struct ext4_extent_data *ext); extern int ext4_defrag_ioctl(struct inode *, struct file *, unsigned int, unsigned long); Index: linux-2.6.26-rc9/fs/ext4/ext4_extents.h =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/ext4_extents.h 2008-07-11 16:05:18.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/ext4_extents.h 2008-07-11 16:05:20.000000000 -0700 @@ -234,5 +234,10 @@ extern void ext4_ext_drop_refs(struct ex extern ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block); +extern int ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); + #endif /* _EXT4_EXTENTS */ Index: linux-2.6.26-rc9/fs/ext4/extents.c =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/extents.c 2008-07-11 16:05:18.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/extents.c 2008-07-11 16:05:20.000000000 -0700 @@ -185,11 +185,17 @@ ext4_fsblk_t 
ext4_ext_find_goal(struct i static ext4_fsblk_t ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *ex, int *err) + struct ext4_extent *ex, int *err, + ext4_fsblk_t defrag_goal) { ext4_fsblk_t goal, newblock; - goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); + if (defrag_goal) + goal = defrag_goal; + else + goal = ext4_ext_find_goal(inode, path, + le32_to_cpu(ex->ee_block)); + newblock = ext4_new_meta_block(handle, inode, goal, err); return newblock; } @@ -673,7 +679,8 @@ static int ext4_ext_insert_index(handle_ */ static int ext4_ext_split(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext, int at) + struct ext4_extent *newext, int at, + ext4_fsblk_t defrag_goal) { struct buffer_head *bh = NULL; int depth = ext_depth(inode); @@ -724,7 +731,7 @@ static int ext4_ext_split(handle_t *hand ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, - newext, &err); + newext, &err, defrag_goal); if (newblock == 0) goto cleanup; ablocks[a] = newblock; @@ -911,7 +918,8 @@ cleanup: */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, + ext4_fsblk_t defrag_goal) { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; @@ -920,7 +928,8 @@ static int ext4_ext_grow_indepth(handle_ ext4_fsblk_t newblock; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); + newblock = ext4_ext_new_meta_block(handle, inode, path, + newext, &err, defrag_goal); if (newblock == 0) return err; @@ -996,7 +1005,8 @@ out: */ static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent *newext) + struct ext4_extent *newext, + ext4_fsblk_t defrag_goal) { struct 
ext4_ext_path *curp; int depth, i, err = 0; @@ -1016,7 +1026,8 @@ repeat: if (EXT_HAS_FREE_INDEX(curp)) { /* if we found index with free entry, then use that * entry: create all needed subtree and add new leaf */ - err = ext4_ext_split(handle, inode, path, newext, i); + err = ext4_ext_split(handle, inode, path, newext, i, + defrag_goal); if (err) goto out; @@ -1029,7 +1040,8 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, path, newext); + err = ext4_ext_grow_indepth(handle, inode, path, + newext, defrag_goal); if (err) goto out; @@ -1209,7 +1221,7 @@ ext4_ext_search_right(struct inode *inod * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1475,6 +1487,19 @@ int ext4_ext_insert_extent(handle_t *han struct ext4_ext_path *path, struct ext4_extent *newext) { + return ext4_ext_insert_extent_defrag(handle, inode, path, newext, 0); +} + +/* + * ext4_ext_insert_extent_defrag: + * The difference from ext4_ext_insert_extent is to use the first block + * in newext as the goal of the new index block. + */ +int +ext4_ext_insert_extent_defrag(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *newext, int defrag) +{ struct ext4_extent_header * eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -1482,6 +1507,7 @@ int ext4_ext_insert_extent(handle_t *han int depth, len, err; ext4_lblk_t next; unsigned uninitialized = 0; + ext4_fsblk_t defrag_goal; BUG_ON(ext4_ext_get_actual_len(newext) == 0); depth = ext_depth(inode); @@ -1542,11 +1568,16 @@ repeat: le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } + if (defrag) + defrag_goal = ext_pblock(newext); + else + defrag_goal = 0; /* * There is no free space in the found leaf. * We're gonna add a new leaf in the tree. 
*/ - err = ext4_ext_create_new_leaf(handle, inode, path, newext); + err = ext4_ext_create_new_leaf(handle, inode, path, + newext, defrag_goal); if (err) goto cleanup; depth = ext_depth(inode); Index: linux-2.6.26-rc9/fs/ext4/ioctl.c =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/ioctl.c 2008-07-11 16:05:20.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/ioctl.c 2008-07-11 16:05:20.000000000 -0700 @@ -245,7 +245,10 @@ setversion_out: case EXT4_IOC_DEFRAG: case EXT4_IOC_GROUP_INFO: case EXT4_IOC_FREE_BLOCKS_INFO: - case EXT4_IOC_EXTENTS_INFO: { + case EXT4_IOC_EXTENTS_INFO: + case EXT4_IOC_RESERVE_BLOCK: + case EXT4_IOC_MOVE_VICTIM: + case EXT4_IOC_BLOCK_RELEASE: { return ext4_defrag_ioctl(inode, filp, cmd, arg); } case EXT4_IOC_GROUP_ADD: { Index: linux-2.6.26-rc9/fs/ext4/mballoc.c =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/mballoc.c 2008-07-11 16:05:13.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/mballoc.c 2008-07-11 16:05:20.000000000 -0700 @@ -1772,6 +1772,10 @@ repeat: if (group == EXT4_SB(sb)->s_groups_count) group = 0; + if (ac->ac_excepted_group != -1 && + group == ac->ac_excepted_group) + continue; + /* quick check to skip empty groups */ grp = ext4_get_group_info(ac->ac_sb, group); if (grp->bb_free == 0) @@ -4094,6 +4098,7 @@ ext4_mb_initialize_context(struct ext4_a ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; ac->ac_lg = NULL; + ac->ac_excepted_group = ar->excepted_group; /* we have to define context: we'll we work with a file or * locality group. 
this is a policy, actually */ Index: linux-2.6.26-rc9/fs/ext4/mballoc.h =================================================================== --- linux-2.6.26-rc9.orig/fs/ext4/mballoc.h 2008-07-11 16:04:25.000000000 -0700 +++ linux-2.6.26-rc9/fs/ext4/mballoc.h 2008-07-11 16:05:20.000000000 -0700 @@ -205,6 +205,7 @@ struct ext4_allocation_context { struct page *ac_buddy_page; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; + long long ac_excepted_group; }; #define AC_STATUS_CONTINUE 1