diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile index 7c5ab63..6a9e30c 100644 --- a/fs/reiserfs/Makefile +++ b/fs/reiserfs/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ hashes.o tail_conversion.o journal.o resize.o \ - item_ops.o ioctl.o procfs.o xattr.o + item_ops.o ioctl.o procfs.o xattr.o lock.o ifeq ($(CONFIG_REISERFS_FS_XATTR),y) reiserfs-objs += xattr_user.o xattr_trusted.o diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index e716161..6854957 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, else if (bitmap == 0) block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; + reiserfs_write_unlock(sb); bh = sb_bread(sb, block); + reiserfs_write_lock(sb); if (bh == NULL) reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " "reading failed", __func__, block); else { if (buffer_locked(bh)) { PROC_INFO_INC(sb, scan_bitmap.wait); + reiserfs_write_unlock(sb); __wait_on_buffer(bh); + reiserfs_write_lock(sb); } BUG_ON(!buffer_uptodate(bh)); BUG_ON(atomic_read(&bh->b_count) == 0); diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45ee3d3..e8b79d2 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -176,14 +176,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, // user space buffer is swapped out. At that time // entry can move to somewhere else memcpy(local_buf, d_name, d_reclen); + + /* + * Since filldir might sleep, we can release + * the write lock here for other waiters + */ + reiserfs_write_unlock(inode->i_sb); if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, DT_UNKNOWN) < 0) { + reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); } goto end; } + reiserfs_write_lock(inode->i_sb); if (local_buf != small_buf) { kfree(local_buf); } diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 4beb964..2787349 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -21,14 +21,6 @@ #include #include -#ifdef CONFIG_REISERFS_CHECK - -struct tree_balance *cur_tb = NULL; /* detects whether more than one - copy of tb exists as a means - of checking whether schedule - is interrupting do_balance */ -#endif - static inline void buffer_info_init_left(struct tree_balance *tb, struct buffer_info *bi) { @@ -1841,11 +1833,12 @@ static int check_before_balancing(struct tree_balance *tb) { int retval = 0; - if (cur_tb) { + if (REISERFS_SB(tb->tb_sb)->cur_tb) { reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " "occurred based on cur_tb not being null at " "this point in code. do_balance cannot properly " - "handle schedule occurring while it runs."); + "handle concurrent tree accesses on a same " + "mount point."); } /* double check that buffers that we will modify are unlocked. (fix_nodes should already have @@ -1987,7 +1980,7 @@ static inline void do_balance_starts(struct tree_balance *tb) "check");*/ RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); #ifdef CONFIG_REISERFS_CHECK - cur_tb = tb; + REISERFS_SB(tb->tb_sb)->cur_tb = tb; #endif } @@ -1997,7 +1990,7 @@ static inline void do_balance_completed(struct tree_balance *tb) #ifdef CONFIG_REISERFS_CHECK check_leaf_level(tb); check_internal_levels(tb); - cur_tb = NULL; + REISERFS_SB(tb->tb_sb)->cur_tb = NULL; #endif /* reiserfs_free_block is no longer schedule safe. So, we need to diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 5e5a4e6..d2f3133 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, return needed_nodes; } -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif /* Set parameters for balancing. * Performs write of results of analysis of balancing into structure tb, @@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb, /* Check whether the common parent is locked. */ if (buffer_locked(*pcom_father)) { + + /* Release the write lock while the buffer is busy */ + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(*pcom_father); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) { brelse(*pcom_father); return REPEAT_SEARCH; @@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h) return REPEAT_SEARCH; if (buffer_locked(bh)) { + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(bh); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h) tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> FL[h]); son_number = B_N_CHILD_NUM(tb->FL[h], child_position); + reiserfs_write_unlock(sb); bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h) child_position = (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; son_number = B_N_CHILD_NUM(tb->FR[h], child_position); + reiserfs_write_unlock(sb); bh = sb_bread(sb, son_number); + reiserfs_write_lock(sb); if (!bh) return IO_ERROR; if (FILESYSTEM_CHANGED_TB(tb)) { @@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb) REPEAT_SEARCH : CARRY_ON; } #endif + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(locked); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } @@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb, /* if it possible in indirect_to_direct conversion */ if (buffer_locked(tbS0)) { + reiserfs_write_unlock(tb->tb_sb); __wait_on_buffer(tbS0); + reiserfs_write_lock(tb->tb_sb); if (FILESYSTEM_CHANGED_TB(tb)) return REPEAT_SEARCH; } #ifdef CONFIG_REISERFS_CHECK - if (cur_tb) { + if (REISERFS_SB(tb->tb_sb)->cur_tb) { print_cur_tb("fix_nodes"); reiserfs_panic(tb->tb_sb, "PAP-8305", "there is pending do_balance"); diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 6fd0f47..cd65176 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -489,10 +489,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode, disappeared */ if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { int err; - lock_kernel(); + + reiserfs_write_lock(inode->i_sb); + err = reiserfs_commit_for_inode(inode); REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; - unlock_kernel(); + + reiserfs_write_unlock(inode->i_sb); + if (err < 0) ret = err; } @@ -601,6 +605,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, __le32 *item; int done; int fs_gen; + int lock_depth; struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion @@ -616,12 +621,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block, loff_t new_offset = (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; - /* bad.... */ - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); version = get_inode_item_key_version(inode); if (!file_capable(inode, block)) { - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); return -EFBIG; } @@ -633,7 +637,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, /* find number of block-th logical block of the file */ ret = _get_block_create_0(inode, block, bh_result, create | GET_BLOCK_READ_DIRECT); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); return ret; } /* @@ -751,7 +755,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (!dangle && th) retval = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); /* the item was found, so new blocks were not added to the file ** there is no need to make sure the inode is updated with this @@ -997,10 +1001,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block, if (retval) goto failure; } - /* inserting indirect pointers for a hole can take a - ** long time. reschedule if needed + /* + * inserting indirect pointers for a hole can take a + * long time. reschedule if needed and also release the write + * lock for others. */ - cond_resched(); + if (need_resched()) { + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + schedule(); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + } retval = search_for_position_by_key(inode->i_sb, &key, &path); if (retval == IO_ERROR) { @@ -1035,7 +1045,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block, retval = err; } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); reiserfs_check_path(&path); return retval; } @@ -2076,8 +2086,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) int error; struct buffer_head *bh = NULL; int err2; + int lock_depth; - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); if (inode->i_size > 0) { error = grab_tail_page(inode, &page, &bh); @@ -2146,14 +2157,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps) page_cache_release(page); } - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return 0; out: if (page) { unlock_page(page); page_cache_release(page); } - reiserfs_write_unlock(inode->i_sb); + + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + return error; } @@ -2612,7 +2626,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page, int ret; int old_ref = 0; + reiserfs_write_unlock(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + fix_tail_page_for_writing(page); if (reiserfs_transaction_running(inode->i_sb)) { struct reiserfs_transaction_handle *th; @@ -2668,6 +2685,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, int update_sd = 0; struct reiserfs_transaction_handle *th; unsigned start; + int lock_depth = 0; + bool locked = false; if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) pos ++; @@ -2694,9 +2713,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** to do the i_size updates here. */ pos += copied; + if (pos > inode->i_size) { struct reiserfs_transaction_handle myth; - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; /* If the file have grown beyond the border where it can have a tail, unmark it as needing a tail packing */ @@ -2707,10 +2728,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; ret = journal_begin(&myth, inode->i_sb, 1); - if (ret) { - reiserfs_write_unlock(inode->i_sb); + if (ret) goto journal_error; - } + reiserfs_update_inode_transaction(inode); inode->i_size = pos; /* @@ -2722,34 +2742,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_update_sd(&myth, inode); update_sd = 1; ret = journal_end(&myth, inode->i_sb, 1); - reiserfs_write_unlock(inode->i_sb); if (ret) goto journal_error; } if (th) { - reiserfs_write_lock(inode->i_sb); + if (!locked) { + lock_depth = reiserfs_write_lock_once(inode->i_sb); + locked = true; + } if (!update_sd) mark_inode_dirty(inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); if (ret) goto out; } out: + if (locked) + reiserfs_write_unlock_once(inode->i_sb, lock_depth); unlock_page(page); page_cache_release(page); return ret == 0 ? copied : ret; journal_error: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); + locked = false; if (th) { - reiserfs_write_lock(inode->i_sb); if (!update_sd) reiserfs_update_sd(th, inode); ret = reiserfs_end_persistent_transaction(th); - reiserfs_write_unlock(inode->i_sb); } - goto out; } @@ -2762,7 +2784,10 @@ int reiserfs_commit_write(struct file *f, struct page *page, int update_sd = 0; struct reiserfs_transaction_handle *th = NULL; + reiserfs_write_unlock(inode->i_sb); reiserfs_wait_on_write_block(inode->i_sb); + reiserfs_write_lock(inode->i_sb); + if (reiserfs_transaction_running(inode->i_sb)) { th = current->journal_info; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 0ccc3fd..5e40b0c 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -141,9 +141,11 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, default: return -ENOIOCTLCMD; } - lock_kernel(); + + reiserfs_write_lock(inode->i_sb); ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); + reiserfs_write_unlock(inode->i_sb); + return ret; } #endif diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 77f5bb7..86c1ff4 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh) clear_buffer_journal_restore_dirty(bh); } -/* utility function to force a BUG if it is called without the big -** kernel lock held. caller is the string printed just before calling BUG() -*/ -void reiserfs_check_lock_depth(struct super_block *sb, char *caller) -{ -#ifdef CONFIG_SMP - if (current->lock_depth < 0) { - reiserfs_panic(sb, "journal-1", "%s called without kernel " - "lock held", caller); - } -#else - ; -#endif -} - /* return a cnode with same dev, block number and size in table, or null if not found */ static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct super_block @@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table, static inline void lock_journal(struct super_block *sb) { PROC_INFO_INC(sb, journal.lock_journal); - mutex_lock(&SB_JOURNAL(sb)->j_mutex); + + reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb); } /* unlock the current transaction */ @@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s, disable_barrier(s); set_buffer_uptodate(bh); set_buffer_dirty(bh); + reiserfs_write_unlock(s); sync_dirty_buffer(bh); + reiserfs_write_lock(s); } } @@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s) { DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); - if (atomic_read(&j->j_async_throttle)) + + if (atomic_read(&j->j_async_throttle)) { + reiserfs_write_unlock(s); congestion_wait(WRITE, HZ / 10); + reiserfs_write_lock(s); + } + return 0; } @@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s, } /* make sure nobody is trying to flush this one at the same time */ - mutex_lock(&jl->j_commit_mutex); + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s); + if (!journal_list_still_alive(s, trans_id)) { mutex_unlock(&jl->j_commit_mutex); goto put_jl; @@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s, if (!list_empty(&jl->j_bh_list)) { int ret; - unlock_kernel(); + + /* + * We might sleep in numerous places inside + * write_ordered_buffers. Relax the write lock. + */ + reiserfs_write_unlock(s); ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_bh_list); if (ret < 0 && retval == 0) retval = ret; - lock_kernel(); + reiserfs_write_lock(s); } BUG_ON(!list_empty(&jl->j_bh_list)); /* @@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s, SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); if (tbh) { - if (buffer_dirty(tbh)) - ll_rw_block(WRITE, 1, &tbh) ; + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); + ll_rw_block(WRITE, 1, &tbh); + reiserfs_write_lock(s); + } put_bh(tbh) ; } } @@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s, bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn); + + reiserfs_write_unlock(s); wait_on_buffer(tbh); + reiserfs_write_lock(s); // since we're using ll_rw_blk above, it might have skipped over // a locked buffer. Double check here // - if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ + /* redundant, sync_dirty_buffer() checks */ + if (buffer_dirty(tbh)) { + reiserfs_write_unlock(s); sync_dirty_buffer(tbh); + reiserfs_write_lock(s); + } if (unlikely(!buffer_uptodate(tbh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(s, "journal-601", @@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s, if (buffer_dirty(jl->j_commit_bh)) BUG(); mark_buffer_dirty(jl->j_commit_bh) ; + reiserfs_write_unlock(s); sync_dirty_buffer(jl->j_commit_bh) ; + reiserfs_write_lock(s); } - } else + } else { + reiserfs_write_unlock(s); wait_on_buffer(jl->j_commit_bh); + reiserfs_write_lock(s); + } check_barrier_completion(s, jl->j_commit_bh); @@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb, if (trans_id >= journal->j_last_flush_trans_id) { if (buffer_locked((journal->j_header_bh))) { + reiserfs_write_unlock(sb); wait_on_buffer((journal->j_header_bh)); + reiserfs_write_lock(sb); if (unlikely(!buffer_uptodate(journal->j_header_bh))) { #ifdef CONFIG_REISERFS_CHECK reiserfs_warning(sb, "journal-699", @@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb, disable_barrier(sb); goto sync; } + reiserfs_write_unlock(sb); wait_on_buffer(journal->j_header_bh); + reiserfs_write_lock(sb); check_barrier_completion(sb, journal->j_header_bh); } else { sync: set_buffer_dirty(journal->j_header_bh); + reiserfs_write_unlock(sb); sync_dirty_buffer(journal->j_header_bh); + reiserfs_write_lock(sb); } if (!buffer_uptodate(journal->j_header_bh)) { reiserfs_warning(sb, "journal-837", @@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s, /* if flushall == 0, the lock is already held */ if (flushall) { - mutex_lock(&journal->j_flush_mutex); + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); } else if (mutex_trylock(&journal->j_flush_mutex)) { BUG(); } @@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s, reiserfs_panic(s, "journal-1011", "cn->bh is NULL"); } + + reiserfs_write_unlock(s); wait_on_buffer(cn->bh); + reiserfs_write_lock(s); + if (!cn->bh) { reiserfs_panic(s, "journal-1012", "cn->bh is NULL"); @@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s, struct reiserfs_journal *journal = SB_JOURNAL(s); chunk.nr = 0; - mutex_lock(&journal->j_flush_mutex); + reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s); if (!journal_list_still_alive(s, orig_trans_id)) { goto done; } @@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, reiserfs_mounted_fs_count--; /* wait for all commits to finish */ cancel_delayed_work(&SB_JOURNAL(sb)->j_work); + + /* + * We must release the write lock here because + * the workqueue job (flush_async_commit) needs this lock + */ + reiserfs_write_unlock(sb); flush_workqueue(commit_wq); + if (!reiserfs_mounted_fs_count) { destroy_workqueue(commit_wq); commit_wq = NULL; } + reiserfs_write_lock(sb); free_journal_ram(sb); @@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb, /* read in the log blocks, memcpy to the corresponding real block */ ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); for (i = 0; i < get_desc_trans_len(desc); i++) { + + reiserfs_write_unlock(sb); wait_on_buffer(log_blocks[i]); + reiserfs_write_lock(sb); + if (!buffer_uptodate(log_blocks[i])) { reiserfs_warning(sb, "journal-1212", "REPLAY FAILURE fsck required! " @@ -2964,8 +3000,11 @@ static void queue_log_writer(struct super_block *s) init_waitqueue_entry(&wait, current); add_wait_queue(&journal->j_join_wait, &wait); set_current_state(TASK_UNINTERRUPTIBLE); - if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) + if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) { + reiserfs_write_unlock(s); schedule(); + reiserfs_write_lock(s); + } __set_current_state(TASK_RUNNING); remove_wait_queue(&journal->j_join_wait, &wait); } @@ -2982,7 +3021,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id) struct reiserfs_journal *journal = SB_JOURNAL(sb); unsigned long bcount = journal->j_bcount; while (1) { + reiserfs_write_unlock(sb); schedule_timeout_uninterruptible(1); + reiserfs_write_lock(sb); journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; while ((atomic_read(&journal->j_wcount) > 0 || atomic_read(&journal->j_jlock)) && @@ -3033,7 +3074,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(sb); + reiserfs_write_unlock(sb); reiserfs_wait_on_write_block(sb); + reiserfs_write_lock(sb); PROC_INFO_INC(sb, journal.journal_relock_writers); goto relock; } @@ -3506,14 +3549,14 @@ static void flush_async_commits(struct work_struct *work) struct reiserfs_journal_list *jl; struct list_head *entry; - lock_kernel(); + reiserfs_write_lock(sb); if (!list_empty(&journal->j_journal_list)) { /* last entry is the youngest, commit it and you get everything */ entry = journal->j_journal_list.prev; jl = JOURNAL_LIST_ENTRY(entry); flush_commit_list(sb, jl, 1); } - unlock_kernel(); + reiserfs_write_unlock(sb); } /* @@ -4041,7 +4084,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * the new transaction is fully setup, and we've already flushed the * ordered bh list */ - mutex_lock(&jl->j_commit_mutex); + reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb); /* save the transaction id in case we need to commit it later */ commit_trans_id = jl->j_trans_id; @@ -4156,7 +4199,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, next = cn->next; free_cnode(sb, cn); cn = next; + reiserfs_write_unlock(sb); cond_resched(); + reiserfs_write_lock(sb); } /* we are done with both the c_bh and d_bh, but @@ -4203,10 +4248,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, * is lost. */ if (!list_empty(&jl->j_tail_bh_list)) { - unlock_kernel(); + reiserfs_write_unlock(sb); write_ordered_buffers(&journal->j_dirty_buffers_lock, journal, jl, &jl->j_tail_bh_list); - lock_kernel(); + reiserfs_write_lock(sb); } BUG_ON(!list_empty(&jl->j_tail_bh_list)); mutex_unlock(&jl->j_commit_mutex); diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c new file mode 100644 index 0000000..cb1bba3 --- /dev/null +++ b/fs/reiserfs/lock.c @@ -0,0 +1,89 @@ +#include +#include + +/* + * The previous reiserfs locking scheme was heavily based on + * the tricky properties of the Bkl: + * + * - it was acquired recursively by a same task + * - the performances relied on the release-while-schedule() property + * + * Now that we replace it by a mutex, we still want to keep the same + * recursive property to avoid big changes in the code structure. + * We use our own lock_owner here because the owner field on a mutex + * is only available in SMP or mutex debugging, also we only need this field + * for this mutex, no need for a system wide mutex facility. + * + * Also this lock is often released before a call that could block because + * reiserfs performances were partialy based on the release while schedule() + * property of the Bkl. + */ +void reiserfs_write_lock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + } + + /* No need to protect it, only the current task touches it */ + sb_i->lock_depth++; +} + +void reiserfs_write_unlock(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + /* + * Are we unlocking without even holding the lock? + * Such a situation could even raise a BUG() if we don't + * want the data become corrupted + */ + WARN_ONCE(sb_i->lock_owner != current, + "Superblock write lock imbalance"); + + if (--sb_i->lock_depth == -1) { + sb_i->lock_owner = NULL; + mutex_unlock(&sb_i->lock); + } +} + +/* + * If we already own the lock, just exit and don't increase the depth. + * Useful when we don't want to lock more than once. + * + * We always return the lock_depth we had before calling + * this function. + */ +int reiserfs_write_lock_once(struct super_block *s) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(s); + + if (sb_i->lock_owner != current) { + mutex_lock(&sb_i->lock); + sb_i->lock_owner = current; + return sb_i->lock_depth++; + } + + return sb_i->lock_depth; +} + +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth) +{ + if (lock_depth == -1) + reiserfs_write_unlock(s); +} + +/* + * Utility function to force a BUG if it is called without the superblock + * write lock held. caller is the string printed just before calling BUG() + */ +void reiserfs_check_lock_depth(struct super_block *sb, char *caller) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + if (sb_i->lock_depth < 0) + reiserfs_panic(sb, "%s called without kernel lock held %d", + caller); +} diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 2715791..b3973c9 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { int retval; + int lock_depth; struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); @@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) return ERR_PTR(-ENAMETOOLONG); - reiserfs_write_lock(dir->i_sb); + /* + * Might be called with or without the write lock, must be careful + * to not recursively hold it in case we want to release the lock + * before rescheduling. + */ + lock_depth = reiserfs_write_lock_once(dir->i_sb); + de.de_gen_number_bit_string = NULL; retval = reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, @@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, inode = reiserfs_iget(dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, lock_depth); return ERR_PTR(-EACCES); } @@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry, if (IS_PRIVATE(dir)) inode->i_flags |= S_PRIVATE; } - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, lock_depth); if (retval == IO_ERROR) { return ERR_PTR(-EIO); } diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 536eaca..adbc6f5 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...) . */ -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif - void __reiserfs_panic(struct super_block *sb, const char *id, const char *function, const char *fmt, ...) { diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c index 238e9d9..6a7bfb3 100644 --- a/fs/reiserfs/resize.c +++ b/fs/reiserfs/resize.c @@ -142,7 +142,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new) set_buffer_uptodate(bh); mark_buffer_dirty(bh); + reiserfs_write_unlock(s); sync_dirty_buffer(bh); + reiserfs_write_lock(s); // update bitmap_info stuff bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; brelse(bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index d036ee5..5fa7118 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -222,9 +222,6 @@ static inline int bin_search(const void *key, /* Key to search for. */ return ITEM_NOT_FOUND; } -#ifdef CONFIG_REISERFS_CHECK -extern struct tree_balance *cur_tb; -#endif /* Minimal possible key. It is never in the tree. */ const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; @@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level) #define SEARCH_BY_KEY_READA 16 -/* The function is NOT SCHEDULE-SAFE! */ -static void search_by_key_reada(struct super_block *s, +/* + * The function is NOT SCHEDULE-SAFE! + * It might unlock the write lock if we needed to wait for a block + * to be read. Note that in this case it won't recover the lock to avoid + * high contention resulting from too much lock requests, especially + * the caller (search_by_key) will perform other schedule-unsafe + * operations just after calling this function. + * + * @return true if we have unlocked + */ +static bool search_by_key_reada(struct super_block *s, struct buffer_head **bh, b_blocknr_t *b, int num) { int i, j; + bool unlocked = false; for (i = 0; i < num; i++) { bh[i] = sb_getblk(s, b[i]); } + /* + * We are going to read some blocks on which we + * have a reference. It's safe, though we might be + * reading blocks concurrently changed if we release + * the lock. But it's still fine because we check later + * if the tree changed + */ for (j = 0; j < i; j++) { /* * note, this needs attention if we are getting rid of the BKL * you have to make sure the prepared bit isn't set on this buffer */ - if (!buffer_uptodate(bh[j])) + if (!buffer_uptodate(bh[j])) { + if (!unlocked) { + reiserfs_write_unlock(s); + unlocked = true; + } ll_rw_block(READA, 1, bh + j); + } brelse(bh[j]); } + return unlocked; } /************************************************************************** @@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s have a pointer to it. */ if ((bh = last_element->pe_buffer = sb_getblk(sb, block_number))) { + bool unlocked = false; + if (!buffer_uptodate(bh) && reada_count > 1) - search_by_key_reada(sb, reada_bh, + /* may unlock the write lock */ + unlocked = search_by_key_reada(sb, reada_bh, reada_blocks, reada_count); + /* + * If we haven't already unlocked the write lock, + * then we need to do that here before reading + * the current block + */ + if (!buffer_uptodate(bh) && !unlocked) { + reiserfs_write_unlock(sb); + unlocked = true; + } ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); + + if (unlocked) + reiserfs_write_lock(sb); if (!buffer_uptodate(bh)) goto io_error; } else { @@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s !key_in_buffer(search_path, key, sb), "PAP-5130: key is not in the buffer"); #ifdef CONFIG_REISERFS_CHECK - if (cur_tb) { + if (REISERFS_SB(sb)->cur_tb) { print_cur_tb("5140"); reiserfs_panic(sb, "PAP-5140", "schedule occurred in do_balance!"); @@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st reiserfs_free_block(th, inode, block, 1); } + reiserfs_write_unlock(sb); cond_resched(); + reiserfs_write_lock(sb); if (item_moved (&s_ih, path)) { need_re_search = 1; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 3567fb9..b301f7d 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -468,6 +468,13 @@ static void reiserfs_put_super(struct super_block *s) struct reiserfs_transaction_handle th; th.t_trans_id = 0; + /* + * We didn't need to explicitly lock here before, because put_super + * is called with the bkl held. + * Now that we have our own lock, we must explicitly lock. + */ + reiserfs_write_lock(s); + /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { if (!journal_begin(&th, s, 10)) { @@ -497,6 +504,8 @@ static void reiserfs_put_super(struct super_block *s) reiserfs_proc_info_done(s); + reiserfs_write_unlock(s); + mutex_destroy(&REISERFS_SB(s)->lock); kfree(s->s_fs_info); s->s_fs_info = NULL; @@ -556,25 +565,28 @@ static void reiserfs_dirty_inode(struct inode *inode) struct reiserfs_transaction_handle th; int err = 0; + int lock_depth; + if (inode->i_sb->s_flags & MS_RDONLY) { reiserfs_warning(inode->i_sb, "clm-6006", "writing inode %lu on readonly FS", inode->i_ino); return; } - reiserfs_write_lock(inode->i_sb); + lock_depth = reiserfs_write_lock_once(inode->i_sb); /* this is really only used for atime updates, so they don't have ** to be included in O_SYNC or fsync */ err = journal_begin(&th, inode->i_sb, 1); - if (err) { - reiserfs_write_unlock(inode->i_sb); - return; - } + if (err) + goto out; + reiserfs_update_sd(&th, inode); journal_end(&th, inode->i_sb, 1); - reiserfs_write_unlock(inode->i_sb); + +out: + reiserfs_write_unlock_once(inode->i_sb, lock_depth); } #ifdef CONFIG_REISERFS_FS_POSIX_ACL @@ -1189,7 +1201,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; +#endif + + /* + * We used to protect using the implicitly acquired bkl here. + * Now we must explictly acquire our own lock + */ + reiserfs_write_lock(s); +#ifdef CONFIG_QUOTA memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); #endif @@ -1315,9 +1335,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) out_ok: replace_mount_options(s, new_opts); + reiserfs_write_unlock(s); return 0; out_err: + reiserfs_write_unlock(s); kfree(new_opts); return err; } @@ -1422,7 +1444,9 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); + reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); + reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1631,7 +1655,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); if (!sbi) { errval = -ENOMEM; - goto error; + goto error_alloc; } s->s_fs_info = sbi; /* Set default values for options: non-aggressive tails, RO on errors */ @@ -1645,6 +1669,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) /* setup default block allocator options */ reiserfs_init_alloc_options(s); + mutex_init(&REISERFS_SB(s)->lock); + REISERFS_SB(s)->lock_depth = -1; + + /* + * This function is called with the bkl, which also was the old + * locking used here. + * do_journal_begin() will soon check if we hold the lock (ie: was the + * bkl). This is likely because do_journal_begin() has several another + * callers because at this time, it doesn't seem to be necessary to + * protect against anything. + * Anyway, let's be conservative and lock for now. + */ + reiserfs_write_lock(s); + jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, @@ -1870,9 +1908,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) init_waitqueue_head(&(sbi->s_wait)); spin_lock_init(&sbi->bitmap_lock); + reiserfs_write_unlock(s); + return (0); error: + reiserfs_write_unlock(s); +error_alloc: if (jinit_done) { /* kill the commit thread, free journal ram */ journal_release_error(NULL, s); } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8e7deb0..ee3ddd9 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -976,7 +976,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&s->s_root->d_inode->i_mutex); + reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1011,7 +1011,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (privroot->d_inode) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&privroot->d_inode->i_mutex); + reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; dentry = lookup_one_len(XAROOT_NAME, privroot, diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 2245c78..7665f41 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -52,11 +52,63 @@ #define REISERFS_IOC32_GETVERSION FS_IOC32_GETVERSION #define REISERFS_IOC32_SETVERSION FS_IOC32_SETVERSION -/* Locking primitives */ -/* Right now we are still falling back to (un)lock_kernel, but eventually that - would evolve into real per-fs locks */ -#define reiserfs_write_lock( sb ) lock_kernel() -#define reiserfs_write_unlock( sb ) unlock_kernel() +/* + * Locking primitives. The write lock is a per superblock + * special mutex that has properties close to the Big Kernel Lock + * which was used in the previous locking scheme. + */ +void reiserfs_write_lock(struct super_block *s); +void reiserfs_write_unlock(struct super_block *s); +int reiserfs_write_lock_once(struct super_block *s); +void reiserfs_write_unlock_once(struct super_block *s, int lock_depth); + +/* + * Several mutexes depend on the write lock. + * However sometimes we want to relax the write lock while we hold + * these mutexes, according to the release/reacquire on schedule() + * properties of the Bkl that were used. + * Reiserfs performances and locking were based on this scheme. + * Now that the write lock is a mutex and not the bkl anymore, doing so + * may result in a deadlock: + * + * A acquire write_lock + * A acquire j_commit_mutex + * A release write_lock and wait for something + * B acquire write_lock + * B can't acquire j_commit_mutex and sleep + * A can't acquire write lock anymore + * deadlock + * + * What we do here is avoiding such deadlock by playing the same game + * than the Bkl: if we can't acquire a mutex that depends on the write lock, + * we release the write lock, wait a bit and then retry. + * + * The mutexes concerned by this hack are: + * - The commit mutex of a journal list + * - The flush mutex + * - The journal lock + * - The inode mutex + */ +static inline void reiserfs_mutex_lock_safe(struct mutex *m, + struct super_block *s) +{ + reiserfs_write_unlock(s); + mutex_lock(m); + reiserfs_write_lock(s); +} + +/* + * When we schedule, we usually want to also release the write lock, + * according to the previous bkl based locking scheme of reiserfs. + */ +static inline void reiserfs_cond_resched(struct super_block *s) +{ + if (need_resched()) { + reiserfs_write_unlock(s); + schedule(); + reiserfs_write_lock(s); + } +} struct fid; @@ -1298,7 +1350,11 @@ static inline loff_t max_reiserfs_offset(struct inode *inode) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) #define __fs_changed(gen,s) (gen != get_generation (s)) -#define fs_changed(gen,s) ({cond_resched(); __fs_changed(gen, s);}) +#define fs_changed(gen,s) \ +({ \ + reiserfs_cond_resched(s); \ + __fs_changed(gen, s); \ +}) /***************************************************************************/ /* FIXATE NODES */ diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 6473650..2f8bb04 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -7,6 +7,8 @@ #ifdef __KERNEL__ #include #include +#include +#include #endif typedef enum { @@ -355,6 +357,13 @@ struct reiserfs_sb_info { struct reiserfs_journal *s_journal; /* pointer to journal information */ unsigned short s_mount_state; /* reiserfs state (valid, invalid) */ + /* Serialize writers access, replace the old bkl */ + struct mutex lock; + /* Owner of the lock (can be recursive) */ + struct task_struct *lock_owner; + /* Depth of the lock, start from -1 like the bkl */ + int lock_depth; + /* Comment? -Hans */ void (*end_io_handler) (struct buffer_head *, int); hashf_t s_hash_function; /* pointer to function which is used @@ -408,6 +417,17 @@ struct reiserfs_sb_info { char *s_qf_names[MAXQUOTAS]; int s_jquota_fmt; #endif +#ifdef CONFIG_REISERFS_CHECK + + struct tree_balance *cur_tb; /* + * Detects whether more than one + * copy of tb exists per superblock + * as a means of checking whether + * do_balance is executing concurrently + * against another tree reader/writer + * on a same mount point. + */ +#endif }; /* Definitions of reiserfs on-disk properties: */