GIT aa87e4647c32366037c21b54af79f0fd8cae951b git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git#ALL commit 59dbc55b39a58491b9e1959b31e32a13f3f66c4a Author: Mark Fasheh Date: Mon Nov 19 18:40:16 2007 -0800 ocfs2: log valid inode # on bad inode If the inode block isn't valid then we don't want to print the value from that, instead print the block number which was passed in (which should always be correct). Also, turn this into a debug print for now - folks who hit an actual problem always have other logs indicating what the source is. Signed-off-by: Mark Fasheh commit 69530f34e2c7a54d34e3ca96c808373a25fa8abb Author: Mark Fasheh Date: Mon Nov 19 18:31:17 2007 -0800 ocfs2: Filter -ENOSPC in mlog_errno() It's almost never worth printing in that situation and we keep forgetting to manually filter it out. Signed-off-by: Mark Fasheh commit 91520c10cb6b4edf6353259b5f996325e00b991a Author: Joe Perches Date: Mon Nov 19 17:53:34 2007 -0800 [PATCH] fs/ocfs2: Add missing "space" Signed-off-by: Joe Perches Signed-off-by: Mark Fasheh commit 9cf30df5b9d99f8f72bb3266573808233187bed7 Author: Jan Kara Date: Tue Nov 13 19:59:33 2007 +0100 [PATCH] ocfs2: Remove expensive bitmap scanning Enable expensive bitmap scanning only if DEBUG option is enabled. The bitmap scanning quite loads the CPU and on my machine the write throughput of dd if=/dev/zero of=/ocfs2/file bs=1M count=500 conv=sync improves from 37 MB/s to 45.4 MB/s in local mode... Signed-off-by: Jan Kara Signed-off-by: Mark Fasheh commit bbf517587bea4fbf611adfaa0238537883a3deaa Author: Mark Fasheh Date: Wed Nov 7 16:35:14 2007 -0800 ocfs2: Remove bug statement in ocfs2_dentry_iput() The existing bug statement didn't take into account unhashed dentries which might not have a cluster lock on them. This could happen if a node exporting the file system via NFS is rebooted, re-exported to nfs clients and then unmounted. It's fine in this case to not have a dentry cluster lock. 
Just remove the bug statement and replace it with an error print, which does the proper checks. Though we want to know if something has happened which might have prevented a cluster lock from being created, it's definitely not necessary to panic the machine for this. Signed-off-by: Mark Fasheh commit b552b59da3079bdfac43a8dc44b9c7cbca76472c Author: Mark Fasheh Date: Wed Nov 7 14:40:36 2007 -0800 ocfs2: Support commit= mount option Mostly taken from ext3. This allows the user to set the jbd commit interval, in seconds. The default of 5 seconds stays the same, but now users can easily increase the commit interval. Typically, this would be increased in order to benefit performance at the expense of data-safety. Signed-off-by: Mark Fasheh commit 71108114df5d5c5f1ccb2fc3cade1161ed72c326 Author: Mark Fasheh Date: Wed Nov 7 14:21:45 2007 -0800 ocfs2: Reset journal parameters after s_mount_opt update Right now we're just setting them from the existing parameters, not the new ones that a remount specified. Signed-off-by: Mark Fasheh commit a4372b59334d2e7d0ef89eafb8af8d0eedfcf984 Author: Sunil Mushran Date: Tue Nov 6 16:10:23 2007 -0800 ocfs2: Update default cluster timeouts Lots of people are having trouble with the default timeouts, which are too low. These new values are derived from an informal survey taken on ocfs2-users, as well as data from bug reports. This should reduce the amount of cluster disconnects and subsequent fencing seen during normal workloads. Signed-off-by: Sunil Mushran Signed-off-by: Mark Fasheh commit 05b719633c4ea75cd0d14b94f47ece80ea403110 Author: Mark Fasheh Date: Tue Nov 6 15:52:58 2007 -0800 ocfs2: bump version number Bump the printed version to 1.5.0. This helps us quickly identify which version of Ocfs2 a bug filer is running. Signed-off-by: Mark Fasheh commit 77ad0b1e0e5643d394edf648882dbd10afc607a0 Author: Mark Fasheh Date: Tue Oct 30 12:09:03 2007 -0700 ocfs2: Documentation update Remove 'readpages' from the list in ocfs2.txt. 
Instead of having two identical lists, I just removed the list in the OCFS2 section of fs/Kconfig and added a pointer to Documentation/filesystems/ocfs2.txt. Signed-off-by: Mark Fasheh commit 93ca2d7f587773ad465f74e6cee02bf17ecfcfe6 Author: Mark Fasheh Date: Tue Oct 30 12:08:32 2007 -0700 ocfs2: Readpages support Add ->readpages support to Ocfs2. This is rather trivial - all it required is a small update to ocfs2_get_block (for mapping full extents via b_size) and an ocfs2_readpages() function which partially mirrors ocfs2_readpage(). Signed-off-by: Mark Fasheh commit 908e37ab48962f5f8f2ca5291e606c40c866b73c Author: Mark Fasheh Date: Tue Oct 16 18:38:24 2007 -0700 ocfs2: Deferred cleanup of orphaned inodes Unlink in Ocfs2 is an expensive operation, mostly due to locking overhead. To improve performance, we defer final iput of orphaned inodes onto the ocfs2_wq thread, thus allowing sys_unlink to return without incurring all the locking overhead of ocfs2_delete_inode(). This also has the side-effect that the unlinking node will have a better chance of being the one to actually remove the inode from its orphan dir, thus reducing lock contention. The queue is globally rate-limited so as to prevent any process from pinning an unbounded number of inodes in memory. 
Signed-off-by: Mark Fasheh Documentation/filesystems/ocfs2.txt | 12 +++- fs/Kconfig | 19 +++--- fs/ocfs2/aops.c | 75 ++++++++++++++++++++++- fs/ocfs2/cluster/heartbeat.h | 2 +- fs/ocfs2/cluster/masklog.h | 2 +- fs/ocfs2/cluster/tcp.h | 4 +- fs/ocfs2/cluster/ver.c | 2 +- fs/ocfs2/dcache.c | 20 +++++- fs/ocfs2/dlm/dlmfsver.c | 2 +- fs/ocfs2/dlm/dlmmaster.c | 4 +- fs/ocfs2/dlm/dlmver.c | 2 +- fs/ocfs2/inode.c | 6 +- fs/ocfs2/inode.h | 2 + fs/ocfs2/journal.c | 8 ++- fs/ocfs2/localalloc.c | 5 +- fs/ocfs2/namei.c | 115 +++++++++++++++++++++++++++++++++++ fs/ocfs2/namei.h | 3 + fs/ocfs2/ocfs2.h | 5 ++ fs/ocfs2/super.c | 35 ++++++++++- fs/ocfs2/ver.c | 2 +- 20 files changed, 290 insertions(+), 35 deletions(-) diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt index ed55238..b63bd2d 100644 --- a/Documentation/filesystems/ocfs2.txt +++ b/Documentation/filesystems/ocfs2.txt @@ -35,7 +35,6 @@ Features which OCFS2 does not support yet: - Directory change notification (F_NOTIFY) - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) - POSIX ACLs - - readpages / writepages (not user visible) Mount options ============= @@ -62,3 +61,14 @@ data=writeback Data ordering is not preserved, data may be written preferred_slot=0(*) During mount, try to use this filesystem slot first. If it is in use by another node, the first empty one found will be chosen. Invalid values will be ignored. +commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata + every 'nrsec' seconds. The default value is 5 seconds. + This means that if you lose your power, you will lose + as much as the latest 5 seconds of work (your + filesystem will not be damaged though, thanks to the + journaling). This default value (or any low value) + will hurt performance, but it's good for data-safety. + Setting it to 0 will have the same effect as leaving + it at the default (5 seconds). + Setting it to very large values will improve + performance. 
diff --git a/fs/Kconfig b/fs/Kconfig index 429a002..8729bff 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -440,14 +440,8 @@ config OCFS2_FS Tools web page: http://oss.oracle.com/projects/ocfs2-tools OCFS2 mailing lists: http://oss.oracle.com/projects/ocfs2/mailman/ - Note: Features which OCFS2 does not support yet: - - extended attributes - - quotas - - cluster aware flock - - Directory change notification (F_NOTIFY) - - Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease) - - POSIX ACLs - - readpages / writepages (not user visible) + For more information on OCFS2, see the file + <file:Documentation/filesystems/ocfs2.txt>. config OCFS2_DEBUG_MASKLOG bool "OCFS2 logging support" @@ -459,6 +453,15 @@ config OCFS2_DEBUG_MASKLOG This option will enlarge your kernel, but it allows debugging of ocfs2 filesystem issues. +config OCFS2_DEBUG_FS + bool "OCFS2 expensive checks" + depends on OCFS2_FS + default n + help + This option will enable expensive consistency checks. Enable + this option for debugging only as it is likely to decrease + performance of the filesystem. 
+ config MINIX_FS tristate "Minix fs support" help diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 556e34c..ed1016c 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -26,6 +26,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_FILE_IO #include @@ -139,7 +140,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, { int err = 0; unsigned int ext_flags; - u64 p_blkno, past_eof; + u64 max_blocks = bh_result->b_size >> inode->i_blkbits; + u64 p_blkno, count, past_eof; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, @@ -155,7 +157,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } - err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, + err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, &count, &ext_flags); if (err) { mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " @@ -164,6 +166,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, goto bail; } + if (max_blocks < count) + count = max_blocks; + /* * ocfs2 never allocates in this function - the only time we * need to use BH_New is when we're extending i_size on a file @@ -178,6 +183,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) map_bh(bh_result, inode->i_sb, p_blkno); + bh_result->b_size = count << inode->i_blkbits; + if (!ocfs2_sparse_alloc(osb)) { if (p_blkno == 0) { err = -EIO; @@ -331,6 +338,69 @@ out: return ret; } +/* + * This is used only for read-ahead. Failures or difficult to handle + * situations are safe to ignore. + * + * Right now, we don't bother with BH_Boundary - in-inode extent lists + * are quite large (243 extents on 4k blocks), so most inodes don't + * grow out to a tree. If need be, detecting boundary extents could + * trivially be added in a future version of ocfs2_get_block(). 
+ */ +static int ocfs2_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + int ret, err = -EIO; + struct inode *inode = mapping->host; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + loff_t start; + struct page *last; + + /* + * Use the nonblocking flag for the dlm code to avoid page + * lock inversion, but don't bother with retrying. + */ + ret = ocfs2_meta_lock_full(inode, NULL, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + return err; + + if (down_read_trylock(&oi->ip_alloc_sem) == 0) { + ocfs2_meta_unlock(inode, 0); + return err; + } + + ret = ocfs2_data_lock_full(inode, 0, OCFS2_LOCK_NONBLOCK); + if (ret) + goto out_unlock; + + /* + * Don't bother with inline-data. There isn't anything + * to read-ahead in that case anyway... + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) + goto out_unlock_data; + + /* + * Check whether a remote node truncated this file - we just + * drop out in that case as it's not worth handling here. + */ + last = list_entry(pages->prev, struct page, lru); + start = (loff_t)last->index << PAGE_CACHE_SHIFT; + if (start >= i_size_read(inode)) + goto out_unlock_data; + + err = mpage_readpages(mapping, pages, nr_pages, ocfs2_get_block); + +out_unlock_data: + ocfs2_data_unlock(inode, 0); + +out_unlock: + up_read(&oi->ip_alloc_sem); + ocfs2_meta_unlock(inode, 0); + + return err; +} + /* Note: Because we don't support holes, our allocation has * already happened (allocation writes zeros to the file data) * so we don't have to worry about ordered writes in @@ -1917,6 +1987,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, + .readpages = ocfs2_readpages, .writepage = ocfs2_writepage, .write_begin = ocfs2_write_begin, .write_end = ocfs2_write_end, diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 35397dd..e511339 100644 --- 
a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -35,7 +35,7 @@ #define O2HB_LIVE_THRESHOLD 2 /* number of equal samples to be seen as dead */ extern unsigned int o2hb_dead_threshold; -#define O2HB_DEFAULT_DEAD_THRESHOLD 7 +#define O2HB_DEFAULT_DEAD_THRESHOLD 31 /* Otherwise MAX_WRITE_TIMEOUT will be zero... */ #define O2HB_MIN_DEAD_THRESHOLD 2 #define O2HB_MAX_WRITE_TIMEOUT_MS (O2HB_REGION_TIMEOUT_MS * (o2hb_dead_threshold - 1)) diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index cd04606..597e064 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -212,7 +212,7 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #define mlog_errno(st) do { \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ - _st != AOP_TRUNCATED_PAGE) \ + _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index da880fc..f36f66a 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -60,8 +60,8 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, /* same as hb delay, we're waiting for another node to recognize our hb */ #define O2NET_RECONNECT_DELAY_MS_DEFAULT 2000 -#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 5000 -#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 10000 +#define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 +#define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 /* TODO: figure this out.... 
*/ diff --git a/fs/ocfs2/cluster/ver.c b/fs/ocfs2/cluster/ver.c index 7286c48..a56eee6 100644 --- a/fs/ocfs2/cluster/ver.c +++ b/fs/ocfs2/cluster/ver.c @@ -28,7 +28,7 @@ #include "ver.h" -#define CLUSTER_BUILD_VERSION "1.3.3" +#define CLUSTER_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 Node Manager " CLUSTER_BUILD_VERSION diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 1957a5e..9923278 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -344,12 +344,24 @@ static void ocfs2_dentry_iput(struct dentry *dentry, struct inode *inode) { struct ocfs2_dentry_lock *dl = dentry->d_fsdata; - mlog_bug_on_msg(!dl && !(dentry->d_flags & DCACHE_DISCONNECTED), - "dentry: %.*s\n", dentry->d_name.len, - dentry->d_name.name); + if (!dl) { + /* + * No dentry lock is ok if we're disconnected or + * unhashed. + */ + if (!(dentry->d_flags & DCACHE_DISCONNECTED) && + !d_unhashed(dentry)) { + unsigned long long ino = 0ULL; + if (inode) + ino = (unsigned long long)OCFS2_I(inode)->ip_blkno; + mlog(ML_ERROR, "Dentry is missing cluster lock. " + "inode: %llu, d_flags: 0x%x, d_name: %.*s\n", + ino, dentry->d_flags, dentry->d_name.len, + dentry->d_name.name); + } - if (!dl) goto out; + } mlog_bug_on_msg(dl->dl_count == 0, "dentry: %.*s, count: %u\n", dentry->d_name.len, dentry->d_name.name, diff --git a/fs/ocfs2/dlm/dlmfsver.c b/fs/ocfs2/dlm/dlmfsver.c index d2be3ad..a733b33 100644 --- a/fs/ocfs2/dlm/dlmfsver.c +++ b/fs/ocfs2/dlm/dlmfsver.c @@ -28,7 +28,7 @@ #include "dlmfsver.h" -#define DLM_BUILD_VERSION "1.3.3" +#define DLM_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 DLMFS " DLM_BUILD_VERSION diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 62e4a7d..a54d33d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -908,7 +908,7 @@ lookup: * but they might own this lockres. wait on them. 
*/ bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " "recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; @@ -962,7 +962,7 @@ redo_request: spin_lock(&dlm->spinlock); bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); if (bit < O2NM_MAX_NODES) { - mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" + mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " "recover before lock mastery can begin\n", dlm->name, namelen, (char *)lockid, bit); wait_on_recovery = 1; diff --git a/fs/ocfs2/dlm/dlmver.c b/fs/ocfs2/dlm/dlmver.c index 7ef2653..dfc0da4 100644 --- a/fs/ocfs2/dlm/dlmver.c +++ b/fs/ocfs2/dlm/dlmver.c @@ -28,7 +28,7 @@ #include "dlmver.h" -#define DLM_BUILD_VERSION "1.3.3" +#define DLM_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 DLM " DLM_BUILD_VERSION diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 1d5e0cb..ebb2bbe 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -455,8 +455,8 @@ static int ocfs2_read_locked_inode(struct inode *inode, status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, + mlog(0, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)args->fi_blkno, 7, fe->i_signature); goto bail; } @@ -863,7 +863,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode, status = ocfs2_try_open_lock(inode, 1); if (status == -EAGAIN) { status = 0; - mlog(0, "Skipping delete of %llu because it is in use on" + mlog(0, "Skipping delete of %llu because it is in use on " "other nodes\n", (unsigned long long)oi->ip_blkno); goto bail; } diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 70e881c..b972749 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -56,6 +56,8 @@ struct 
ocfs2_inode_info /* protected by recovery_lock. */ struct inode *ip_next_orphan; + struct inode *ip_next_unlinked; + u32 ip_dir_start_lookup; /* next two are protected by trans_inc_lock */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f9d01e2..d9212ac 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -304,14 +304,18 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * 5) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { journal_t *journal = osb->journal->j_journal; + unsigned long commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + + if (osb->osb_commit_interval) + commit_interval = osb->osb_commit_interval; spin_lock(&journal->j_state_lock); - journal->j_commit_interval = OCFS2_DEFAULT_COMMIT_INTERVAL; + journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JFS_BARRIER; else diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index d272847..58ea88b 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -484,6 +484,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; +#ifdef OCFS2_DEBUG_FS if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " @@ -494,6 +495,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = -EIO; goto bail; } +#endif free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - le32_to_cpu(alloc->id1.bitmap1.i_used); @@ -712,9 +714,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, void *bitmap; struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc); - mlog_entry("total = %u, COUNT = %u, used = %u\n", + mlog_entry("total = %u, used = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total), - ocfs2_local_alloc_count_bits(alloc), 
le32_to_cpu(alloc->id1.bitmap1.i_used)); if (!alloc->id1.bitmap1.i_total) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 989ac27..4700ed2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -40,6 +40,7 @@ #include #include #include +#include #define MLOG_MASK_PREFIX ML_NAMEI #include @@ -96,6 +97,9 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb, /* An orphan dir name is an 8 byte value, printed as a hex string */ #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) +static atomic_t ocfs2_num_deferred_orphans = ATOMIC_INIT(0); +#define OCFS2_MAX_DEFERRED_ORPHANS 10000 + static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { @@ -661,6 +665,115 @@ out: } /* + * Deferred cleanup of orphaned inodes + * + * Unlink in Ocfs2 is rather expensive due to cluster locking + * (network) overhead. We can speed up the 2nd half of unlink by + * deferring cleanup of orphaned inodes to the ocfs2 work queue. This + * gains us about a 25% speedup during large unlink storms. + * + * We're careful to only do this on inodes that have been orphaned and + * successfully put in the orphan dir. This leaves a clear trail for + * recovery to follow should the machine crash. 
+ */ + +static void __ocfs2_cleanup_orphans(struct ocfs2_super *osb) +{ + struct inode *inode; + + spin_lock(&osb->osb_lock); + inode = osb->osb_first_unlinked_inode; + while (inode) { + osb->osb_first_unlinked_inode = OCFS2_I(inode)->ip_next_unlinked; + if (osb->osb_last_unlinked_inode == inode) + osb->osb_last_unlinked_inode = NULL; + OCFS2_I(inode)->ip_next_unlinked = NULL; + + spin_unlock(&osb->osb_lock); + + iput(inode); + atomic_dec(&ocfs2_num_deferred_orphans); + + cond_resched(); + + spin_lock(&osb->osb_lock); + inode = osb->osb_first_unlinked_inode; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_cleanup_orphans(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + osb_orphan_cleanup_wq.work); + + __ocfs2_cleanup_orphans(osb); +} + +void ocfs2_flush_orphans(struct ocfs2_super *osb) +{ + if (ocfs2_mount_local(osb)) + return; + + cancel_delayed_work(&osb->osb_orphan_cleanup_wq); + flush_workqueue(ocfs2_wq); + + __ocfs2_cleanup_orphans(osb); +} + +static void ocfs2_defer_inode_delete(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *last; + + if (ocfs2_mount_local(osb)) + return; + + if (inode->i_nlink > 0) + return; + + BUG_ON(OCFS2_I(inode)->ip_next_unlinked); + + /* + * Rate limit the machine-wide total number of deferred inode + * deletes. If we hit the limit, we can schedule an immediate + * flush of this mount point to free up some space. + */ + if (!atomic_add_unless(&ocfs2_num_deferred_orphans, 1, + OCFS2_MAX_DEFERRED_ORPHANS)) { + if (osb->osb_first_unlinked_inode) { + cancel_delayed_work(&osb->osb_orphan_cleanup_wq); + queue_delayed_work(ocfs2_wq, + &osb->osb_orphan_cleanup_wq, 0); + } + return; + } + + igrab(inode); + + /* + * Insert at the tail of the list so that we maintain a FIFO. 
+ */ + spin_lock(&osb->osb_lock); + if (osb->osb_first_unlinked_inode == NULL) { + BUG_ON(osb->osb_last_unlinked_inode != NULL); + osb->osb_first_unlinked_inode = inode; + osb->osb_last_unlinked_inode = inode; + } else { + BUG_ON(osb->osb_last_unlinked_inode == NULL); + last = osb->osb_last_unlinked_inode; + OCFS2_I(last)->ip_next_unlinked = inode; + osb->osb_last_unlinked_inode = inode; + } + + spin_unlock(&osb->osb_lock); + + cancel_delayed_work(&osb->osb_orphan_cleanup_wq); + queue_delayed_work(ocfs2_wq, &osb->osb_orphan_cleanup_wq, + 2 * HZ); +} + +/* * Takes and drops an exclusive lock on the given dentry. This will * force other nodes to drop it. */ @@ -825,6 +938,8 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } + ocfs2_defer_inode_delete(inode); + dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (S_ISDIR(inode->i_mode)) drop_nlink(dir); diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 688aef6..f9aab81 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -36,4 +36,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct inode *inode, struct buffer_head *orphan_dir_bh); +void ocfs2_cleanup_orphans(struct work_struct *work); +void ocfs2_flush_orphans(struct ocfs2_super *osb); + #endif /* OCFS2_NAMEI_H */ diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 60a23e1..64ddd1e 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -231,6 +231,7 @@ struct ocfs2_super wait_queue_head_t checkpoint_event; atomic_t needs_checkpoint; struct ocfs2_journal *journal; + unsigned long osb_commit_interval; enum ocfs2_local_alloc_state local_alloc_state; struct buffer_head *local_alloc_bh; @@ -286,6 +287,10 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + + struct delayed_work osb_orphan_cleanup_wq; + struct inode *osb_first_unlinked_inode; + struct inode *osb_last_unlinked_inode; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git 
a/fs/ocfs2/super.c b/fs/ocfs2/super.c index be562ac..6884b93 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -84,6 +84,7 @@ MODULE_LICENSE("GPL"); struct mount_options { + unsigned long commit_interval; unsigned long mount_opt; unsigned int atime_quantum; signed short slot; @@ -150,6 +151,7 @@ enum { Opt_data_writeback, Opt_atime_quantum, Opt_slot, + Opt_commit, Opt_err, }; @@ -165,6 +167,7 @@ static match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_atime_quantum, "atime_quantum=%u"}, {Opt_slot, "preferred_slot=%u"}, + {Opt_commit, "commit=%u"}, {Opt_err, NULL} }; @@ -438,14 +441,16 @@ unlock_osb: } if (!ret) { - if (!ocfs2_is_hard_readonly(osb)) - ocfs2_set_journal_params(osb); - /* Only save off the new mount options in case of a successful * remount. */ osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + if (parsed_options.commit_interval) + osb->osb_commit_interval = parsed_options.commit_interval; + + if (!ocfs2_is_hard_readonly(osb)) + ocfs2_set_journal_params(osb); } out: return ret; @@ -597,6 +602,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; + osb->osb_commit_interval = parsed_options.commit_interval; sb->s_magic = OCFS2_SUPER_MAGIC; @@ -747,6 +753,7 @@ static int ocfs2_parse_options(struct super_block *sb, mlog_entry("remount: %d, options: \"%s\"\n", is_remount, options ? 
options : "(none)"); + mopt->commit_interval = 0; mopt->mount_opt = 0; mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; mopt->slot = OCFS2_INVALID_SLOT; @@ -816,6 +823,18 @@ static int ocfs2_parse_options(struct super_block *sb, if (option) mopt->slot = (s16)option; break; + case Opt_commit: + option = 0; + if (match_int(&args[0], &option)) { + status = 0; + goto bail; + } + if (option < 0) + return 0; + if (option == 0) + option = JBD_DEFAULT_MAX_COMMIT_AGE; + mopt->commit_interval = HZ * option; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -864,6 +883,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->s_atime_quantum != OCFS2_DEFAULT_ATIME_QUANTUM) seq_printf(s, ",atime_quantum=%u", osb->s_atime_quantum); + if (osb->osb_commit_interval) + seq_printf(s, ",commit=%u", + (unsigned) (osb->osb_commit_interval / HZ)); + return 0; } @@ -1019,6 +1042,8 @@ static void ocfs2_inode_init_once(struct kmem_cache *cachep, void *data) oi->ip_blkno = 0ULL; oi->ip_clusters = 0; + oi->ip_next_unlinked = NULL; + ocfs2_lock_res_init_once(&oi->ip_rw_lockres); ocfs2_lock_res_init_once(&oi->ip_meta_lockres); ocfs2_lock_res_init_once(&oi->ip_data_lockres); @@ -1212,6 +1237,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + ocfs2_flush_orphans(osb); + ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); @@ -1466,6 +1493,8 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; + INIT_DELAYED_WORK(&osb->osb_orphan_cleanup_wq, ocfs2_cleanup_orphans); + /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); diff --git a/fs/ocfs2/ver.c b/fs/ocfs2/ver.c index 5405ce1..e2488f4 100644 --- a/fs/ocfs2/ver.c +++ b/fs/ocfs2/ver.c @@ -29,7 +29,7 @@ #include "ver.h" -#define 
OCFS2_BUILD_VERSION "1.3.3" +#define OCFS2_BUILD_VERSION "1.5.0" #define VERSION_STR "OCFS2 " OCFS2_BUILD_VERSION