GIT f8b4c6027275d9b2d5004726a6d1bb818a13ddef git://oss.oracle.com/home/sourcebo/git/ocfs2.git/#ALL

commit a37793a7e9b6db51d8a2b8a992b528c84c3c4aa8
Author: Adrian Bunk
Date: Tue May 16 17:26:41 2006 +0200

[PATCH] fs/ocfs2/dlm/: cleanups

This patch #if 0's the no longer used dlm_dump_lock_resources(). Since this
makes dlmdebug.h empty, this patch also removes this header. Additionally,
the needlessly global dlm_is_node_recovered() is made static.

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

commit fbecc5ac445e07ad8a73964cd09067c186482d60
Author: Joel Becker
Date: Wed May 10 18:28:59 2006 -0700

ocfs2: Compile-time disabling of ocfs2 debugging output.

Give gcc the chance to compile out the debug logging code in ocfs2. This
saves some size at the expense of being able to debug the code.

Signed-off-by: Joel Becker

commit 0187ddbe19a309c3d0c06beb9f82a6f57681ccbe
Author: Mark Fasheh
Date: Tue May 9 15:09:35 2006 -0700

ocfs2: warn the user on a dead timeout mismatch

Print a warning to the user when a node with a different dead count joins
the region.

Signed-off-by: Mark Fasheh

commit f5ab4d01e84da8895a5a6eca94a7b954ded2db4f
Author: Mark Fasheh
Date: Thu May 4 12:03:26 2006 -0700

ocfs2: clean up some osb fields

Get rid of osb->uuid, osb->proc_sub_dir, and osb->osb_id. Those fields were
unused, or could easily be removed. As a result, we also no longer need
MAX_OSB_ID or ocfs2_globals_lock.

Signed-off-by: Mark Fasheh

commit f336f8a8de268dd94324a9b26f9526012179b961
Author: Mark Fasheh
Date: Thu May 4 11:49:22 2006 -0700

ocfs2: fix init of uuid_net_key

ocfs2_initialize_super() should be copying from the beginning of the uuid.

Signed-off-by: Mark Fasheh

commit 798913f7d5838cc4c217062d307f82008e9d24aa
Author: Mark Fasheh
Date: Mon May 1 14:56:57 2006 -0700

ocfs2: fix compiler warnings in dlm_convert_lock_handler()

We need to cast to unsigned long long.

Signed-off-by: Mark Fasheh

commit 025159a68823d66951337aa7aad962ab9fa5a414
Author: Mark Fasheh
Date: Mon May 1 14:55:10 2006 -0700

ocfs2: dlm_print_one_mle() needs to be defined

Fixes compile breakage.

Signed-off-by: Mark Fasheh

commit ca3c734cf6452c58a65c6ec1bf39f7f0c2e5f2c9
Author: Kurt Hackel
Date: Mon May 1 14:39:57 2006 -0700

ocfs2: remove whitespace in dlmunlock.c

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit da18811a3009d5070c7cd06f57a16222d87fde66
Author: Kurt Hackel
Date: Mon May 1 14:39:29 2006 -0700

ocfs2: move dlm work to a private work queue

The work that is done can block for long periods of time and so is not
appropriate for keventd.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 594d7a4ada4594d645d8d6327a9e05cf7613f88f
Author: Kurt Hackel
Date: Mon May 1 14:34:08 2006 -0700

ocfs2: fix incorrect error returns

Use DLM_REJECTED instead of DLM_RECOVERING.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit f1f1bea93ed5eebcefb1ae03178fc4aff23dfe17
Author: Kurt Hackel
Date: Mon May 1 14:31:37 2006 -0700

ocfs2: tune down some noisy messages during dlm recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 79b4a98f94bca9b6725eb2b3f08d23dbb06a5cdc
Author: Kurt Hackel
Date: Mon May 1 14:30:39 2006 -0700

ocfs2: display message before waiting for recovery to complete

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 85a964a59e3259c2ec7d612d1401811b60b4bc5b
Author: Kurt Hackel
Date: Mon May 1 14:29:59 2006 -0700

ocfs2: mlog in dlm_convert_lock_handler() should be ML_ERROR

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 8c5ce0ac514ec2f42cb86f9476b4cf3b1f240c83
Author: Kurt Hackel
Date: Mon May 1 14:29:28 2006 -0700

ocfs2: retry operations when a lock is marked in recovery

Before checking for a nonexistent lock, make sure the lockres is not marked
RECOVERING. The caller will just retry and the state should be fixed up when
recovery completes.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit b64c8e3709f0c6e4f8bfaf4837b8270706e58ffe
Author: Kurt Hackel
Date: Mon May 1 14:27:41 2006 -0700

ocfs2: use cond_resched() in dlm_thread()

yield() does not yield. cond_resched() does.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e05483fef620ef7e4f361c5098783b79be6ae8f2
Author: Kurt Hackel
Date: Mon May 1 14:25:21 2006 -0700

ocfs2: use GFP_NOFS in some dlm operations

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 11d8bc6d89eb863d1631b7798980b9a3b9c1c817
Author: Kurt Hackel
Date: Mon May 1 13:54:07 2006 -0700

ocfs2: wait for recovery when starting lock mastery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 718fa6c87597cb9fd62821f2caa7d7553c902591
Author: Kurt Hackel
Date: Mon May 1 13:51:49 2006 -0700

ocfs2: continue recovery when a dead node is encountered

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ed3ddd0005dcd22abd3410d7ca8ef46f11eedc22
Author: Kurt Hackel
Date: Mon May 1 13:50:12 2006 -0700

ocfs2: remove unnecessary spin_unlock() in dlm_remaster_locks()

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 84b16b6dd232372448d7031df006ca1b141844a4
Author: Kurt Hackel
Date: Mon May 1 13:49:20 2006 -0700

ocfs2: dlm_remaster_locks() should never exit without completing

We cannot restart recovery. Once we begin to recover a node, keep the state
of the recovery intact and follow through, regardless of any other node
deaths that may occur.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 2424164bc2b30aaf4355058ff50fceb642977cd0
Author: Kurt Hackel
Date: Mon May 1 13:47:50 2006 -0700

ocfs2: special case recovery lock in dlmlock_remote()

If the previous master of the recovery lock dies, let calc_usage take it
down completely and let the caller completely redo the dlmlock() call.
Otherwise, there will never be an opportunity to re-master the lockres and
recovery won't be able to progress.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ab7812fd79614ce583ef9128ede0c0ef216f85b7
Author: Kurt Hackel
Date: Mon May 1 13:32:27 2006 -0700

ocfs2: pending mastery asserts and migrations should block each other

Use the existing structure for blocking migrations when ASTs are pending to
achieve the same result. If we can catch the assert before it goes on the
wire, just cancel it and let the migration continue.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 085dc43d403177bbfc3770f8430b76a74770febe
Author: Kurt Hackel
Date: Mon May 1 13:30:49 2006 -0700

ocfs2: temporarily disable automatic lock migration

Now we never change the owner of a lock resource until unmount or node
death. This will be re-enabled once some issues in the algorithm used have
been resolved.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 10900d704658e7a425d5d146268d646d647c78a5
Author: Kurt Hackel
Date: Mon May 1 13:27:10 2006 -0700

ocfs2: do not unconditionally purge the lockres in dlmlock_remote()

In dlmlock_remote(), do not call purge_lockres until the lock resource
actually changes. Otherwise, the mastery info on the lockres will go away
underneath the caller.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ae78f65ac10496bca64e74356a252e37debd3521
Author: Kurt Hackel
Date: Mon May 1 12:02:07 2006 -0700

ocfs2: increase backoff before waiting for recovery

When mastering non-recovery lock resources, additional time was frequently
needed to allow the disk heartbeat to catch up with the network timeout.
The recovery lock resource is time critical and avoids this path.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit bccdf7724ad3779cc80f4f24e2d12a45802d0635
Author: Kurt Hackel
Date: Mon May 1 11:53:33 2006 -0700

ocfs2: have dlm_pre_master_reco_lockres() ignore dead nodes

Recovery will spin in dlm_pre_master_reco_lockres if we do not ignore
timed-out network responses from dead nodes.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 053c7137bd408916b876dc40262aa9873380c972
Author: Kurt Hackel
Date: Mon May 1 11:51:45 2006 -0700

ocfs2: give the dlm dirty list a reference on the lockres

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 2b4af78629ec59791a70a1e17b8d52f1c6912359
Author: Kurt Hackel
Date: Mon May 1 11:49:52 2006 -0700

ocfs2: teach dlm_restart_lock_mastery() to wait on recovery

Change behavior of dlm_restart_lock_mastery() when a node goes down. Dump
all responses that have been collected and start over.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit a983e81a1a58177b965787da7b0548a9b529a354
Author: Kurt Hackel
Date: Mon May 1 11:46:59 2006 -0700

ocfs2: gracefully handle stale create_lock messages.

This is an error on the sending side, so gracefully error out on the
receiving end.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 12734481b548b670cec2352a79d27304b3dc2714
Author: Kurt Hackel
Date: Mon May 1 11:32:14 2006 -0700

ocfs2: update lvb immediately during recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e684426d7de65442a80f0f1b4b86237d90a15bea
Author: Kurt Hackel
Date: Mon May 1 11:22:06 2006 -0700

ocfs2: do not send master requests to localhost

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit fc57a48b645374d1b34986f665a7a0e1047a0ba0
Author: Kurt Hackel
Date: Mon May 1 11:16:45 2006 -0700

ocfs2: purge lockres' sooner

Immediately purge a lockres that the local node is not the master of.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 6fa15be8b2724953217dafa74e7b0af289288abc
Author: Kurt Hackel
Date: Mon May 1 11:15:04 2006 -0700

ocfs2: dump mismatching migrated lvbs before BUG()

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 7f65283718d0efe398b6b56c149873099b8d589d
Author: Kurt Hackel
Date: Mon May 1 11:11:13 2006 -0700

ocfs2: make dlm recovery finalization 2 stage

Makes it easier for the recovery process to deal with node death.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 32849e41d59ab3426d11d036472676e9676f58a9
Author: Kurt Hackel
Date: Mon May 1 10:57:51 2006 -0700

ocfs2: dlm recovery / lockres reference count fix

Take a reference on lockres structures while they are on the recovery list.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit d3421c18b9bfe64b323b91ff7faf440dcce5170b
Author: Kurt Hackel
Date: Thu Apr 27 19:26:15 2006 -0700

ocfs2: better error handling during assert master message

handle errors during lock assert master by either killing self or other node

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 5161fc73aec43043a60909f5c146e09653b6b019
Author: Kurt Hackel
Date: Thu Apr 27 19:24:21 2006 -0700

ocfs2: dump lockres info before we BUG() on a bad reference

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ce7e851f46b61f6ff7780e47aa70ec94d04518dd
Author: Mark Fasheh
Date: Thu Apr 27 19:07:45 2006 -0700

ocfs2: do LVB puts in place

Don't wait until the AST will be fired to do the LVB copy into the lock
resource.

Signed-off-by: Mark Fasheh

commit 12c11f5ba8fc4ddab35fc433cb970e99d5ab13dc
Author: Kurt Hackel
Date: Thu Apr 27 19:04:49 2006 -0700

ocfs2: mle ref count debugging

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 268cff398ed4f78bdd8a1942eec689243cbcc347
Author: Kurt Hackel
Date: Thu Apr 27 19:03:18 2006 -0700

ocfs2: allow for an assert message during lock mastery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 5c82a1ec37c87a96b755992af8a456100a687079
Author: Kurt Hackel
Date: Thu Apr 27 19:01:35 2006 -0700

ocfs2: take mle reference during migration

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9c6cfc037f723df1640a83028edc741945059257
Author: Kurt Hackel
Date: Thu Apr 27 19:00:26 2006 -0700

ocfs2: properly initialize the mle structure

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit da9f7da4a7cc0a3566af2b06e36b9a173a182c33
Author: Kurt Hackel
Date: Thu Apr 27 18:53:04 2006 -0700

ocfs2: detach mle from heartbeat events

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ca117fc061b47a6eef8b5d24fdbf56f274fd28cd
Author: Kurt Hackel
Date: Thu Apr 27 18:51:26 2006 -0700

ocfs2: mle ref counting fixes

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e57ae48b0f9e2caa0767f5ae02903ef4ff756aa7
Author: Kurt Hackel
Date: Thu Apr 27 18:47:41 2006 -0700

ocfs2: better mle debugging

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 783a7bcc01729fec54ca0d2c3739e0d7e66024c6
Author: Kurt Hackel
Date: Thu Apr 27 18:08:51 2006 -0700

ocfs2: clean up recovery related messages

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 290973d09aaa2df9d75d98f9907dafba37a4d485
Author: Kurt Hackel
Date: Thu Apr 27 18:06:58 2006 -0700

ocfs2: handle network errors during recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9ff759e38bb8cf86d1dbcb88251b6c10a01ec46d
Author: Kurt Hackel
Date: Thu Apr 27 18:05:41 2006 -0700

ocfs2: only recover one dead node at a time

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit b3957291d6731fd21e4d19ee18e659a84fcc1f25
Author: Kurt Hackel
Date: Thu Apr 27 18:03:49 2006 -0700

ocfs2: Better tracking for recovery state changes

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 409ffa328ba577dd2e5c25809e1c1c4a36172b5d
Author: Kurt Hackel
Date: Thu Apr 27 18:02:10 2006 -0700

ocfs2: Fix empty lvb check

The check for an empty lvb should check the entire buffer not just the
first byte.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit c2fe8996fec1c6d127f687b44ffe63cfbe68ee4e
Author: Kurt Hackel
Date: Thu Apr 27 18:00:21 2006 -0700

ocfs2: fix inverted logic in dlm_is_node_dead

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 8915f9363b2b059094ea524aab437ace03bb38bc
Author: Kurt Hackel
Date: Thu Apr 27 17:59:46 2006 -0700

ocfs2: recheck lockres master before sending an unlock request.

Recovery may have happened and it may now be mastered locally.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9f190101bba0242714fb1e21bc301cc19eec0a28
Author: Kurt Hackel
Date: Thu Apr 27 17:58:23 2006 -0700

ocfs2: add a small delay after a failed migration

Otherwise we risk starving other threads.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit f9aa7e3eed61202e872ea7b95110868eb92106c2
Author: Mark Fasheh
Date: Thu Apr 27 17:53:22 2006 -0700

ocfs2: silence a debug print

Signed-off-by: Mark Fasheh

commit a419d77b9420632003c98d57cc903958b1e816c1
Author: Mark Fasheh
Date: Thu Mar 23 11:23:29 2006 -0800

ocfs2: silence a compile warning in dlm_alloc_pagevec()

Reported by Andrew Morton.

Signed-off-by: Mark Fasheh

commit f879afc0496443c9feb791164a7e89358916b34c
Author: Joel Becker
Date: Thu Mar 16 17:40:37 2006 -0800

[PATCH] ocfs2: Alloc at least a page for the DLM hash

The OCFS2 DLM allocates a number of pages for a hash to lookup locks. There
was a bug where a PAGE_SIZE bigger than the hash size (eg, 64K pages) would
result in zero pages allocated.

Signed-off-by: Joel Becker
Signed-off-by: Mark Fasheh

commit 1af8e8ce644db841c587e61c80c928b20bac0946
Author: Daniel Phillips
Date: Fri Mar 10 18:08:16 2006 -0800

ocfs2: allocate lockres hash pages in an array

This allows us to have a hash table greater than a single page which
greatly improves dlm performance on some tests.

Signed-off-by: Daniel Phillips
Signed-off-by: Mark Fasheh

commit 22c3f6e4f595cee6146b33b3d244f3e13af14937
Author: Mark Fasheh
Date: Fri Mar 10 13:44:00 2006 -0800

ocfs2: inline dlm_lockres_get()

It's called on every lookup so this might help performance a bit.

Signed-off-by: Mark Fasheh

commit cb6adc1431f7e883f4191bed0a6c5861ada41147
Author: Daniel Phillips
Date: Fri Mar 10 13:31:47 2006 -0800

[PATCH] Clean up ocfs2 hash probe and make it faster

Signed-Off-By: Daniel Phillips
Signed-off-by: Mark Fasheh

commit be82847bb31cf9f82d82dc909358d6fa67004b07
Author: Mark Fasheh
Date: Thu Mar 9 17:55:56 2006 -0800

ocfs2: calculate lockid hash values outside of the spinlock

Fixes a performance bug - pointed out by Andrew.

Signed-off-by: Mark Fasheh

commit 7b8cb1bbd31d413f977b6c14e55c197d2893712b
Author: Mark Fasheh
Date: Mon Mar 6 15:36:17 2006 -0800

ocfs2: move lockres qstr next to hlist_node structure

Gains us a bit of performance on loads which heavily hit the lockres hash.
Patch suggested by Daniel Phillips.

Signed-off-by: Mark Fasheh

commit f64493eb2ecfc65853552b07d268aed33d95d7b8
Author: Zach Brown
Date: Thu Apr 27 17:25:32 2006 -0700

ocfs2: replace idr with u32 in tcp

Signed-off-by: Zach Brown
Signed-off-by: Mark Fasheh

commit 10cf7554f4e90aead497e70f460de8ce9024f9c1
Author: Sunil Mushran
Date: Thu Apr 27 16:44:13 2006 -0700

ocfs2: silence ENOENT during lookup of broken links

Signed-off-by: Sunil Mushran
Signed-off-by: Mark Fasheh

commit 23413eef39731e619a9184e731d5d66b5d3199a8
Author: Sunil Mushran
Date: Thu Apr 27 16:41:31 2006 -0700

ocfs2: Cleanup message prints

Signed-off-by: Sunil Mushran
Signed-off-by: Mark Fasheh

commit e8613372ab22cb8f1fbcbfe2d61e4bb8bbac28a5
Author: Joel Becker
Date: Thu Apr 27 16:36:14 2006 -0700

ocfs2: silence -EEXIST from ocfs2_extent_map_insert/lookup

Signed-off-by: Joel Becker
Signed-off-by: Mark Fasheh

commit 0e6dd31c00fe0ab96feeb9c0cbae95660d545dc3
Author: Joel Becker
Date: Wed Apr 12 18:34:43 2006 -0700

configfs: Release memory in configfs_example.

The configfs_example module was missing a ->release().

Signed-off-by: Joel Becker

commit 5898445c0ecf79b25de2da4e41cd1ebaa7b1bcab
Author: Joel Becker
Date: Fri Mar 10 11:42:30 2006 -0800

configfs: Clear up a few extra spaces where there should be TABs.

Signed-off-by: Joel Becker

commit 10e5be93b9410d044de64850085f2d72e4d2ee85
Author: Adrian Bunk
Date: Fri Mar 31 16:53:55 2006 +0200

[PATCH] fs/ocfs2/dlm/dlmrecovery.c: make dlm_lockres_master_requery() static

dlm_lockres_master_requery() became global without any external usage.

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

commit 765f81792db9f8a9542478337e54abfc9df1e685
Author: Adrian Bunk
Date: Sun Mar 26 14:25:52 2006 +0200

ocfs2: OCFS2_FS must depend on SYSFS

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

---

diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 3d4713a..2d6a14a 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -264,6 +264,15 @@ static struct config_item_type simple_ch }; +struct simple_children { + struct config_group group; +}; + +static inline struct simple_children *to_simple_children(struct config_item *item) +{ + return item ? container_of(to_config_group(item), struct simple_children, group) : NULL; +} + static struct config_item *simple_children_make_item(struct config_group *group, const char *name) { struct simple_child *simple_child; @@ -304,7 +313,13 @@ static ssize_t simple_children_attr_show "items have only one attribute that is readable and writeable.\n"); } +static void simple_children_release(struct config_item *item) +{ + kfree(to_simple_children(item)); +} + static struct configfs_item_operations simple_children_item_ops = { + .release = simple_children_release, .show_attribute = simple_children_attr_show, }; @@ -345,10 +360,6 @@ static struct configfs_subsystem simple_ * children of its own. 
*/ -struct simple_children { - struct config_group group; -}; - static struct config_group *group_children_make_group(struct config_group *group, const char *name) { struct simple_children *simple_children; diff --git a/fs/Kconfig b/fs/Kconfig index f9b5842..9b777b8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -326,7 +326,7 @@ source "fs/xfs/Kconfig" config OCFS2_FS tristate "OCFS2 file system support (EXPERIMENTAL)" - depends on NET && EXPERIMENTAL + depends on NET && SYSFS && EXPERIMENTAL select CONFIGFS_FS select JBD select CRC32 @@ -356,6 +356,16 @@ config OCFS2_FS - POSIX ACLs - readpages / writepages (not user visible) +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. + This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + config MINIX_FS tristate "Minix fs support" help diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f95218..880c9ca 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -211,7 +211,7 @@ static void remove_dir(struct dentry * d struct configfs_dirent * sd; sd = d->d_fsdata; - list_del_init(&sd->s_sibling); + list_del_init(&sd->s_sibling); configfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -330,7 +330,7 @@ static int configfs_detach_prep(struct d ret = configfs_detach_prep(sd->s_dentry); if (!ret) - continue; + continue; } else ret = -ENOTEMPTY; @@ -931,7 +931,7 @@ int configfs_rename_dir(struct config_it new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + if (!new_dentry->d_inode) { error = config_item_set_name(item, "%s", new_name); if (!error) { d_add(new_dentry, NULL); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index e5512e2..fb65e08 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -66,7 +66,7 @@ static void fill_item_path(struct config } static int create_link(struct config_item *parent_item, - struct config_item *item, + struct config_item *item, struct dentry *dentry) { struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21f38ac..d508a60 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -517,6 +517,7 @@ static inline void o2hb_prepare_block(st hb_block->hb_seq = cpu_to_le64(cputime); hb_block->hb_node = node_num; hb_block->hb_generation = cpu_to_le64(generation); + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); /* This step must always happen last! */ hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, @@ -645,6 +646,8 @@ static int o2hb_check_slot(struct o2hb_r struct o2nm_node *node; struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; u64 cputime; + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; + unsigned int slot_dead_ms; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -733,6 +736,23 @@ fire_callbacks: &o2hb_live_slots[slot->ds_node_num]); slot->ds_equal_samples = 0; + + /* We want to be sure that all nodes agree on the + * number of milliseconds before a node will be + * considered dead. The self-fencing timeout is + * computed from this value, and a discrepancy might + * result in heartbeat calling a node dead when it + * hasn't self-fenced yet. 
*/ + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); + if (slot_dead_ms && slot_dead_ms != dead_ms) { + /* TODO: Perhaps we can fail the region here. */ + mlog(ML_ERROR, "Node %d on device %s has a dead count " + "of %u ms, but our count is %u ms.\n" + "Please double check your configuration values " + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + dead_ms); + } goto out; } diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 73edad7..a42628b 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -123,6 +123,17 @@ #ifndef MLOG_MASK_PREFIX #define MLOG_MASK_PREFIX 0 #endif +/* + * When logging is disabled, force the bit test to 0 for anything other + * than errors and notices, allowing gcc to remove the code completely. + * When enabled, allow all masks. + */ +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) +#define ML_ALLOWED_BITS ~0 +#else +#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) +#endif + #define MLOG_MAX_BITS 64 struct mlog_bits { @@ -187,7 +198,8 @@ #define __mlog_printk(level, fmt, args.. #define mlog(mask, fmt, args...) do { \ u64 __m = MLOG_MASK_PREFIX | (mask); \ - if (__mlog_test_u64(__m, mlog_and_bits) && \ + if ((__m & ML_ALLOWED_BITS) && \ + __mlog_test_u64(__m, mlog_and_bits) && \ !__mlog_test_u64(__m, mlog_not_bits)) { \ if (__m & ML_ERROR) \ __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ @@ -204,6 +216,7 @@ #define mlog_errno(st) do { \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) #define mlog_entry(fmt, args...) do { \ mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ } while (0) @@ -247,6 +260,13 @@ #define mlog_exit_ptr(ptr) do { \ #define mlog_exit_void() do { \ mlog(ML_EXIT, "EXIT\n"); \ } while (0) +#else +#define mlog_entry(...) do { } while (0) +#define mlog_entry_void(...) do { } while (0) +#define mlog_exit(...) do { } while (0) +#define mlog_exit_ptr(...) do { } while (0) +#define mlog_exit_void(...) do { } while (0) +#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */ #define mlog_bug_on_msg(cond, fmt, args...) 
do { \ if (cond) { \ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 9409606..3f4151d 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -32,6 +32,7 @@ struct o2hb_disk_heartbeat_block { __u8 hb_pad1[3]; __le32 hb_cksum; __le64 hb_generation; + __le32 hb_dead_ms; }; #endif /* _OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f60cc0..c6185be 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include @@ -172,31 +171,18 @@ static u8 o2net_num_from_nn(struct o2net } /* ------------------------------------------------------------ */ - -static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +static void o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) { - int ret = 0; + spin_lock(&nn->nn_lock); - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); + nsw->ns_id = nn->nn_status_next_id++; + list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = O2NET_ERR_NONE; - nsw->ns_status = 0; - } + spin_unlock(&nn->nn_lock); - return ret; + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; } static void o2net_complete_nsw_locked(struct o2net_node *nn, @@ -210,31 +196,43 @@ static void o2net_complete_nsw_locked(st list_del_init(&nsw->ns_node_item); nsw->ns_sys_status = sys_status; nsw->ns_status = status; - idr_remove(&nn->nn_status_idr, nsw->ns_id); wake_up(&nsw->ns_wq); } } -static void o2net_complete_nsw(struct o2net_node *nn, - struct o2net_status_wait *nsw, - u64 id, enum o2net_system_error sys_status, - s32 status) +static void o2net_complete_nsw_id(struct o2net_node *nn, u32 id, + enum o2net_system_error sys_status, + s32 status) { + struct list_head *iter, *tmp; + struct o2net_status_wait *nsw; + int killed = 0; + spin_lock(&nn->nn_lock); - if (nsw == NULL) { - if (id > INT_MAX) - goto out; - nsw = idr_find(&nn->nn_status_idr, id); - if (nsw == NULL) - goto out; + list_for_each_safe(iter, tmp, &nn->nn_status_list) { + nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + if (id == nsw->ns_id) { + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + killed = 1; + break; + } } - o2net_complete_nsw_locked(nn, nsw, sys_status, status); + spin_unlock(&nn->nn_lock); -out: + if (!killed) + mlog(ML_ERROR, "didn't find nsw for id %u\n", id); +} + +static void o2net_complete_nsw(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + o2net_complete_nsw_locked(nn, nsw, sys_status, status); spin_unlock(&nn->nn_lock); - return; } static void o2net_complete_nodes_nsw(struct o2net_node *nn) @@ -396,8 +394,8 @@ static void o2net_set_nn_state(struct o2 } if (was_valid && !valid) { - mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n", - SC_NODEF_ARGS(old_sc)); + printk(KERN_INFO "o2net: no longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -409,10 +407,10 @@ static void o2net_set_nn_state(struct o2 * the only way to start connecting again is to 
down * heartbeat and bring it back up. */ cancel_delayed_work(&nn->nn_connect_expired); - mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", - o2nm_this_node() > sc->sc_node->nd_num ? - "connected to" : "accepted connection from", - SC_NODEF_ARGS(sc)); + printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); } /* trigger the connecting worker func as long as we're not valid, @@ -928,9 +926,7 @@ int o2net_send_message_vec(u32 msg_type, vec[0].iov_base = msg; memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); - ret = o2net_prep_nsw(nn, &nsw); - if (ret) - goto out; + o2net_prep_nsw(nn, &nsw); msg->msg_num = cpu_to_be32(nsw.ns_id); @@ -963,7 +959,7 @@ out: kfree(vec); if (msg) kfree(msg); - o2net_complete_nsw(nn, &nsw, 0, 0, 0); + o2net_complete_nsw(nn, &nsw, 0, 0); return ret; } EXPORT_SYMBOL_GPL(o2net_send_message_vec); @@ -1019,10 +1015,9 @@ static int o2net_process_message(struct switch(be16_to_cpu(hdr->magic)) { case O2NET_MSG_STATUS_MAGIC: /* special type for returning message status */ - o2net_complete_nsw(nn, NULL, - be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), - be32_to_cpu(hdr->status)); + o2net_complete_nsw_id(nn, be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); goto out; case O2NET_MSG_KEEP_REQ_MAGIC: o2net_sendpage(sc, o2net_keep_resp, @@ -1280,7 +1275,7 @@ static void o2net_idle_timer(unsigned lo do_gettimeofday(&now); - mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 " + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " @@ -1825,7 +1820,6 @@ int o2net_init(void) /* until we see hb from a node we'll return einval */ nn->nn_persistent_error = -ENOTCONN; init_waitqueue_head(&nn->nn_sc_wq); - idr_init(&nn->nn_status_idr); INIT_LIST_HEAD(&nn->nn_status_list); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index ff9e2e2..fa2c705 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -70,7 +70,7 @@ struct o2net_node { * or fails or when an accepted socket is attached. 
*/ wait_queue_head_t nn_sc_wq; - struct idr nn_status_idr; + u32 nn_status_next_id; struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb @@ -166,7 +166,7 @@ enum o2net_system_error { struct o2net_status_wait { enum o2net_system_error ns_sys_status; s32 ns_status; - int ns_id; + u32 ns_id; wait_queue_head_t ns_wq; struct list_head ns_node_item; }; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ae47f45..3d494d1 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -213,11 +213,9 @@ int ocfs2_find_files_on_disk(const char struct ocfs2_dir_entry **dirent) { int status = -ENOENT; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(osb=%p, parent=%llu, name='%.*s', blkno=%p, inode=%p)\n", - osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, - namelen, name, blkno, inode); + mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", + namelen, name, blkno, inode, dirent_bh, dirent); *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); if (!*dirent_bh || !*dirent) { diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593d..90ce915 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ct lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. 
+ */ spin_unlock(&res->spinlock); } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43d..14530ee 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_LOCK_RES_OWNER_UNKNOWN O #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(c return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. 
*/ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; irefs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -780,8 +822,6 @@ int dlm_begin_reco_handler(struct o2net_ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); -int dlm_lockres_master_requery(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 *real_master); int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, @@ -819,6 +859,7 @@ void dlm_clean_master_list(struct dlm_ct u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228..d555dec 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -464,6 +467,12 @@ int dlm_convert_lock_handler(struct o2ne } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +482,21 @@ int dlm_convert_lock_handler(struct o2ne } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d..3f6c8d8 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ 
spin_lock(&dlm->spinlock); for (i=0; ilockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510..0000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3..3cf5ad8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #include "dlmver.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctx assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctx } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = 
hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lo unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(con static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_c return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. 
Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -374,12 +408,13 @@ static void __dlm_print_nodes(struct dlm assert_spin_locked(&dlm->spinlock); - mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); + printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { - mlog(ML_NOTICE, " node %d\n", node); + printk("%d ", node); } + printk("\n"); } static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) @@ -395,7 +430,7 @@ static int dlm_exit_domain_handler(struc node = exit_msg->node_idx; - mlog(0, "Node %u leaves domain %s\n", node, dlm->name); + printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); @@ -644,6 +679,8 @@ static int dlm_assert_joined_handler(str set_bit(assert->node_idx, dlm->domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", + assert->node_idx, dlm->name); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -1126,6 +1163,13 @@ static int dlm_join_domain(struct dlm_ct goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1210,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1236,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1245,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c goto leave; } - for (i=0; ilockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1276,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7e88e24..bb2c202 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode * doesn't make sense for LVB writes. 
*/ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct fi else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct f else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea283..a9fc9a6 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(st struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(st res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -244,7 +262,8 @@ static enum dlm_status dlmlock_remote(st } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +300,14 @@ static enum dlm_status dlm_send_remote_l if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. 
that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +409,13 @@ struct dlm_lock * dlm_new_lock(int type, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +456,16 @@ int dlm_create_lock_handler(struct o2net if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +701,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c..1b8346d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct d return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; imaybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_eve spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_mas assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. 
*/ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_mast memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct k mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct k kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockre { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_ int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. 
*/ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(stru set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(stru /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. 
waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2 name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2 way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, "this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, 
DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2n struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2n name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2n if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2n * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. 
(%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2n goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. 
*/ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dl int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dl list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(str } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(str nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(s BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. 
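The refcount sanity check added to dlm_assert_master_handler() above reduces to a simple minimum: one base reference, plus one if the caller passed in an extra ref, plus one if the mle is currently pinned via inuse. A compact restatement of the same arithmetic; the function name is mine, only the thresholds come from the hunk.

#include <stdio.h>

/* minimum refcount the handler expects before it drops its references */
static int min_expected_refs(int extra_ref, int inuse)
{
	return 1 + (extra_ref ? 1 : 0) + (inuse > 0 ? 1 : 0);
}

int main(void)
{
	int extra_ref, inuse;

	for (inuse = 0; inuse <= 1; inuse++)
		for (extra_ref = 0; extra_ref <= 1; extra_ref++)
			printf("inuse=%d extra_ref=%d -> refs must be >= %d\n",
			       inuse, extra_ref,
			       min_expected_refs(extra_ref, inuse));
	return 0;
}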
*/ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ct struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. 
*/ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbab..044a964 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -95,6 +95,9 @@ static void dlm_reco_unlock_ast(void *as static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data); static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); static u64 dlm_get_next_mig_cookie(void); @@ -115,12 +118,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +160,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +257,52 @@ void dlm_complete_recovery_thread(struct * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, 
state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +350,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dl { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +385,24 @@ int dlm_wait_for_node_death(struct dlm_c return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. 
* the dlm recovery thread will set this state when it begins @@ -308,6 +421,13 @@ static int dlm_in_recovery(struct dlm_ct void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +461,7 @@ static int dlm_do_recovery(struct dlm_ct mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +470,14 @@ static int dlm_do_recovery(struct dlm_ct bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +486,8 @@ static int dlm_do_recovery(struct dlm_ct /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +510,8 @@ static int dlm_do_recovery(struct dlm_ct } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +523,13 @@ static int dlm_do_recovery(struct dlm_ct return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +556,16 @@ static int dlm_remaster_locks(struct dlm int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. 
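The dlmrecovery.c hunks above fix dlm_is_node_dead() (the negation was missing) and add dlm_is_node_recovered(); both are just "is the bit clear" tests, on the domain map and the recovery map respectively. A toy single-word model of the two predicates, with my own names and a plain unsigned long in place of the kernel bitmaps:

#include <stdio.h>

static int test_bit(int nr, unsigned long map)
{
	return (map >> nr) & 1UL;
}

/* dead once the node has dropped out of the domain map */
static int is_node_dead(unsigned long domain_map, int node)
{
	return !test_bit(node, domain_map);
}

/* recovered once its recovery map bit has been cleared again */
static int is_node_recovered(unsigned long recovery_map, int node)
{
	return !test_bit(node, recovery_map);
}

int main(void)
{
	unsigned long domain_map = 0, recovery_map = 0;

	domain_map |= 1UL << 1;      /* node 1 is up and in the domain */
	recovery_map |= 1UL << 4;    /* node 4 died and still needs recovery */

	printf("node 1 dead?      %d\n", is_node_dead(domain_map, 1));        /* 0 */
	printf("node 4 dead?      %d\n", is_node_dead(domain_map, 4));        /* 1 */
	printf("node 4 recovered? %d\n", is_node_recovered(recovery_map, 4)); /* 0 */
	return 0;
}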
*/ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +582,36 @@ static int dlm_remaster_locks(struct dlm continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +623,9 @@ static int dlm_remaster_locks(struct dlm mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +669,26 @@ static int dlm_remaster_locks(struct dlm BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
+ "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +718,7 @@ static int dlm_remaster_locks(struct dlm jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +731,6 @@ static int dlm_remaster_locks(struct dlm } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +756,7 @@ static int dlm_init_recovery_area(struct } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +830,25 @@ int dlm_request_all_locks_handler(struct if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +863,7 @@ int dlm_request_all_locks_handler(struct spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +878,34 @@ static void dlm_request_all_locks_worker struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. 
*/ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +916,20 @@ static void dlm_request_all_locks_worker dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +937,15 @@ static void dlm_request_all_locks_worker list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +964,14 @@ static int dlm_send_all_done_msg(struct ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +991,11 @@ int dlm_reco_data_done_handler(struct o2 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -1023,8 +1196,9 @@ static int dlm_add_lock_to_array(struct ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1257,25 @@ int dlm_send_one_lockres(struct dlm_ctxt * 
we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1323,8 @@ int dlm_mig_lockres_handler(struct o2net mlog(0, "all done flag. all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1415,7 @@ int dlm_mig_lockres_handler(struct o2net spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1312,8 +1489,9 @@ leave: -int dlm_lockres_master_requery(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 *real_master) +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) { struct dlm_node_iter iter; int nodenum; @@ -1406,6 +1584,7 @@ int dlm_master_requery_handler(struct o2 struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1594,10 @@ int dlm_master_requery_handler(struct o2 return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1664,7 @@ static int dlm_process_recovery_data(str struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1553,28 +1734,48 @@ static int dlm_process_recovery_data(str } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. 
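The recovery path above stops treating lvb[0] != 0 as "LVB present" and uses dlm_lvb_is_empty() instead, so an LVB whose first byte happens to be zero is still compared and copied. The helper presumably looks roughly like the sketch below; the 64-byte DLM_LVB_LEN is my assumption from the dlm headers, and the body is a sketch rather than code copied from the tree.

#include <stdio.h>
#include <string.h>

#define DLM_LVB_LEN 64

/* an LVB is "empty" only if every byte is zero, not just the first one */
static int dlm_lvb_is_empty(const char *lvb)
{
	int i;

	for (i = 0; i < DLM_LVB_LEN; i++)
		if (lvb[i])
			return 0;
	return 1;
}

int main(void)
{
	char lvb[DLM_LVB_LEN];

	memset(lvb, 0, sizeof(lvb));
	printf("empty: %d\n", dlm_lvb_is_empty(lvb));  /* 1 */
	lvb[17] = 0x2a;                                /* lvb[0] is still zero */
	printf("empty: %d\n", dlm_lvb_is_empty(lvb));  /* 0 */
	return 0;
}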
*/ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1793,33 @@ static int dlm_process_recovery_data(str * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1843,14 @@ void dlm_move_lockres_to_recovery_list(s struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1939,11 @@ static void dlm_finish_local_lockres_rec spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1952,7 @@ * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1976,13 @@ static void dlm_finish_local_lockres_rec dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if
(!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2119,7 @@ static void dlm_do_local_recovery_cleanu * need to be fired as a result. */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2159,20 @@ static void __dlm_hb_node_down(struct dl { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2336,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2374,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2409,7 @@ static int dlm_send_begin_reco_message(s mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2467,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2490,20 @@ int dlm_begin_reco_handler(struct o2net_ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2525,8 @@ int dlm_begin_reco_handler(struct o2net_ "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but 
this " "node has not yet. marking %u as dead\n", @@ -2272,10 +2545,16 @@ int dlm_begin_reco_handler(struct o2net_ spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2562,31 @@ static int dlm_send_finalize_reco_messag struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2594,17 @@ static int dlm_send_finalize_reco_messag mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2613,19 @@ int dlm_finalize_reco_handler(struct o2n { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2641,41 @@ int dlm_finalize_reco_handler(struct o2n BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - 
dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14..4641d06 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include #include #include #include +#include #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. 
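The dlmthread.c changes above amount to an interim purge policy: an unused lockres that this node masters is kept and pulled off the purge list, an unused lockres mastered elsewhere is unhashed right away via dlm_purge_lockres_now(), and anything still in use stays put. The decision, restated as a tiny sketch; the enum and function names below are mine, only the policy comes from the hunks.

#include <stdio.h>

enum purge_action { KEEP_IT, PURGE_NOW };

/*
 * Interim policy from the hunks above: keep anything that is still in use
 * or that this node masters; immediately unhash an unused resource that
 * some other node masters, since its owner cannot be tracked reliably.
 */
static enum purge_action calc_usage(int unused, int locally_mastered)
{
	if (!unused || locally_mastered)
		return KEEP_IT;
	return PURGE_NOW;
}

int main(void)
{
	printf("unused, mastered here     -> %s\n",
	       calc_usage(1, 1) == KEEP_IT ? "keep" : "purge now");
	printf("unused, mastered remotely -> %s\n",
	       calc_usage(1, 0) == KEEP_IT ? "keep" : "purge now");
	printf("still in use              -> %s\n",
	       calc_usage(0, 1) == KEEP_IT ? "keep" : "purge now");
	return 0;
}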
*/ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -422,6 +471,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +657,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +695,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +711,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +733,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a275..1fe1112 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -319,6 +319,16 @@ static enum dlm_status dlm_send_remote_u mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a275..1fe1112 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -319,6 +319,16 @@ static enum dlm_status dlm_send_remote_u
         mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);

+        if (owner == dlm->node_num) {
+                /* ended up trying to contact ourself. this means
+                 * that the lockres had been remote but became local
+                 * via a migration. just retry it, now as local */
+                mlog(0, "%s:%.*s: this node became the master due to a "
+                     "migration, re-evaluate now\n", dlm->name,
+                     res->lockname.len, res->lockname.name);
+                return DLM_FORWARD;
+        }
+
         memset(&unlock, 0, sizeof(unlock));
         unlock.node_idx = dlm->node_num;
         unlock.flags = cpu_to_be32(flags);
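
The dlmunlock.c hunk above covers the window where a lockres was looked up as remote but was migrated to this node before the unlock message went out; rather than sending a message to itself, the sender bails out with DLM_FORWARD so the caller re-drives the unlock down the local path. A rough user-space sketch of that retry-as-local idea (illustrative only; the status values and helpers are invented, not the real DLM API):

#include <stdio.h>

enum status { ST_OK, ST_FORWARD };      /* ST_FORWARD plays the role of DLM_FORWARD */

static const int self_node = 0;

/* stand-in for the remote-unlock sender: never message ourselves */
static enum status send_remote_unlock(int owner)
{
        if (owner == self_node)
                return ST_FORWARD;      /* became local via migration; caller retries locally */

        printf("sending unlock request to node %d\n", owner);
        return ST_OK;
}

static void do_unlock(int owner)
{
        if (send_remote_unlock(owner) == ST_FORWARD)
                printf("re-evaluating as a local unlock on node %d\n", self_node);
}

int main(void)
{
        do_unlock(3);   /* still remote: the request goes over the wire */
        do_unlock(0);   /* owner migrated to us between lookup and send */
        return 0;
}
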
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5..e641b08 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_conte
         u32 dlm_key;
         char *domain;

-        domain = kmalloc(name->len + 1, GFP_KERNEL);
+        domain = kmalloc(name->len + 1, GFP_NOFS);
         if (!domain) {
                 mlog_errno(-ENOMEM);
                 return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd528..6b8e9c5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2071,8 +2071,7 @@ int ocfs2_dlm_init(struct ocfs2_super *o
         }

         /* launch vote thread */
-        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
-                                     osb->osb_id);
+        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
         if (IS_ERR(osb->vote_task)) {
                 status = PTR_ERR(osb->vote_task);
                 osb->vote_task = NULL;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1a5c690..fcd4475 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -298,7 +298,7 @@ static int ocfs2_extent_map_find_leaf(st
                 ret = ocfs2_extent_map_insert(inode, rec,
                                               le16_to_cpu(el->l_tree_depth));
-                if (ret) {
+                if (ret && (ret != -EEXIST)) {
                         mlog_errno(ret);
                         goto out_free;
                 }
@@ -427,6 +427,11 @@ static int ocfs2_extent_map_insert_entry
 /*
  * Simple rule: on any return code other than -EAGAIN, anything left
  * in the insert_context will be freed.
+ *
+ * Simple rule #2: A return code of -EEXIST from this function or
+ * its calls to ocfs2_extent_map_insert_entry() signifies that another
+ * thread beat us to the insert. It is not an actual error, but it
+ * tells the caller we have no more work to do.
  */
 static int ocfs2_extent_map_try_insert(struct inode *inode,
                                        struct ocfs2_extent_rec *rec,
@@ -448,22 +453,32 @@ static int ocfs2_extent_map_try_insert(s
                 goto out_unlock;
         }

+        /* Since insert_entry failed, the map MUST have old_ent */
         old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-                                          le32_to_cpu(rec->e_clusters), NULL,
-                                          NULL);
+                                          le32_to_cpu(rec->e_clusters),
+                                          NULL, NULL);

         BUG_ON(!old_ent);

-        ret = -EEXIST;
-        if (old_ent->e_tree_depth < tree_depth)
+        if (old_ent->e_tree_depth < tree_depth) {
+                /* Another thread beat us to the lower tree_depth */
+                ret = -EEXIST;
                 goto out_unlock;
+        }

         if (old_ent->e_tree_depth == tree_depth) {
+                /*
+                 * Another thread beat us to this tree_depth.
+                 * Let's make sure we agree with that thread (the
+                 * extent_rec should be identical).
+                 */
                 if (!memcmp(rec, &old_ent->e_rec,
                             sizeof(struct ocfs2_extent_rec)))
                         ret = 0;
+                else
+                        /* FIXME: Should this be ESRCH/EBADR??? */
+                        ret = -EEXIST;

-                /* FIXME: Should this be ESRCH/EBADR??? */
                 goto out_unlock;
         }

@@ -599,7 +614,7 @@ static int ocfs2_extent_map_insert(struc
                                            tree_depth, &ctxt);
         } while (ret == -EAGAIN);

-        if (ret < 0)
+        if ((ret < 0) && (ret != -EEXIST))
                 mlog_errno(ret);

         if (ctxt.left_ent)
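
The extent_map.c hunks above turn -EEXIST into a benign answer: it now consistently means "another thread already inserted this mapping", so callers such as ocfs2_extent_map_find_leaf() and ocfs2_extent_map_insert() skip the error path instead of logging it. A small user-space analogue of that insert-if-absent convention (illustrative only; the toy map and helpers are invented, not the ocfs2 extent map):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* toy single-slot "map"; the real extent map is a tree under a spinlock */
struct toy_map {
        int  occupied;
        char rec[16];
};

/* Insert-if-absent.  Returns 0 on success and -EEXIST if an identical
 * record is already present: not a failure, just nothing left to do. */
static int toy_insert(struct toy_map *map, const char *rec)
{
        if (map->occupied)
                return strcmp(map->rec, rec) ? -EINVAL : -EEXIST;

        map->occupied = 1;
        snprintf(map->rec, sizeof(map->rec), "%s", rec);
        return 0;
}

int main(void)
{
        struct toy_map map = { 0, "" };
        const char *rec = "0-16";
        int i, ret;

        for (i = 0; i < 2; i++) {       /* second pass simulates the racing thread */
                ret = toy_insert(&map, rec);
                if (ret < 0 && ret != -EEXIST)  /* mirrors the callers' new checks */
                        fprintf(stderr, "insert failed: %d\n", ret);
                else
                        printf("pass %d: ret=%d, nothing to complain about\n", i, ret);
        }
        return 0;
}

The only case still treated as a real problem is a conflicting record at the same tree depth, which the patch flags with the FIXME about ESRCH/EBADR.
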
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cf..23d2617 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -785,8 +785,7 @@ int ocfs2_journal_load(struct ocfs2_jour
         }

         /* Launch the commit thread */
-        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
-                                       osb->osb_id);
+        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
         if (IS_ERR(osb->commit_task)) {
                 status = PTR_ERR(osb->commit_task);
                 osb->commit_task = NULL;
@@ -1119,7 +1118,7 @@ void ocfs2_recovery_thread(struct ocfs2_
                 goto out;

         osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
-                                                "ocfs2rec-%d", osb->osb_id);
+                                                "ocfs2rec");
         if (IS_ERR(osb->recovery_thread_task)) {
                 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
                 osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 843cf9d..83934e3 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -46,12 +46,12 @@ static struct page *ocfs2_nopage(struct
                                  unsigned long address,
                                  int *type)
 {
-        struct inode *inode = area->vm_file->f_dentry->d_inode;
         struct page *page = NOPAGE_SIGBUS;
         sigset_t blocked, oldset;
         int ret;

-        mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+        mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
+                   type);

         /* The best way to deal with signals in this path is
          * to block them upfront, rather than allowing the
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index da10930..cd4a6f2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -184,7 +184,6 @@ struct ocfs2_journal;
 struct ocfs2_journal_handle;
 struct ocfs2_super
 {
-        u32 osb_id;             /* id used by the proc interface */
         struct task_struct *commit_task;
         struct super_block *sb;
         struct inode *root_inode;
@@ -222,13 +221,11 @@ struct ocfs2_super
         unsigned long s_mount_opt;

         u16 max_slots;
-        u16 num_nodes;
         s16 node_num;
         s16 slot_num;
         int s_sectsize_bits;
         int s_clustersize;
         int s_clustersize_bits;
-        struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/ */

         atomic_t vol_state;
         struct mutex recovery_lock;
@@ -294,7 +291,6 @@ struct ocfs2_super
 };

 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
-#define OCFS2_MAX_OSB_ID             65536

 static inline int ocfs2_should_order_data(struct inode *inode)
 {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 8716279..aa6f5aa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -264,7 +264,7 @@ int ocfs2_find_slot(struct ocfs2_super *
         osb->slot_num = slot;
         spin_unlock(&si->si_lock);

-        mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+        mlog(0, "taking node slot %d\n", osb->slot_num);

         status = ocfs2_update_disk_slots(osb, si);
         if (status < 0)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3da..73c0d2b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,13 +68,6 @@
 #include "vote.h"
 #include "buffer_head_io.h"

-/*
- * Globals
- */
-static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
-
-static u32 osb_id;             /* Keeps track of next available OSB Id */
-
 static kmem_cache_t *ocfs2_inode_cachep = NULL;

 kmem_cache_t *ocfs2_lock_cache = NULL;
@@ -642,10 +635,9 @@ static int ocfs2_fill_super(struct super
         ocfs2_complete_mount_recovery(osb);

-        printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
-               "data mode.\n",
-               MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
-               osb->slot_num,
+        printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
+               "with %s data mode.\n",
+               osb->dev_str, osb->node_num, osb->slot_num,
                osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ?
                "writeback" : "ordered");
@@ -798,10 +790,6 @@ static int __init ocfs2_init(void)
                 goto leave;
         }

-        spin_lock(&ocfs2_globals_lock);
-        osb_id = 0;
-        spin_unlock(&ocfs2_globals_lock);
-
         ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
         if (!ocfs2_debugfs_root) {
                 status = -EFAULT;
@@ -1018,7 +1006,7 @@ static int ocfs2_fill_local_node_info(st
                 goto bail;
         }

-        mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+        mlog(0, "I am node %d\n", osb->node_num);

         status = 0;
 bail:
@@ -1189,8 +1177,8 @@ static void ocfs2_dismount_volume(struct
         atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);

-        printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
-               MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+        printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
+               osb->dev_str, osb->node_num);

         ocfs2_delete_osb(osb);
         kfree(osb);
@@ -1210,8 +1198,6 @@ static int ocfs2_setup_osb_uuid(struct o
         if (osb->uuid_str == NULL)
                 return -ENOMEM;

-        memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
-
         for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
                 /* print with null */
                 ret = snprintf(ptr, 3, "%02X", uuid[i]);
@@ -1309,13 +1295,6 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
-        if (!osb->uuid) {
-                mlog(ML_ERROR, "unable to alloc uuid\n");
-                status = -ENOMEM;
-                goto bail;
-        }
-
         di = (struct ocfs2_dinode *)bh->b_data;

         osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
@@ -1325,7 +1304,7 @@ static int ocfs2_initialize_super(struct
                 status = -EINVAL;
                 goto bail;
         }
-        mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+        mlog(0, "max_slots for this device: %u\n", osb->max_slots);

         init_waitqueue_head(&osb->osb_wipe_event);
         osb->osb_orphan_wipes = kcalloc(osb->max_slots,
@@ -1416,7 +1395,7 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
+        memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));

         osb->net_key = le32_to_cpu(uuid_net_key);

         strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
@@ -1482,18 +1461,6 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        /* Link this osb onto the global linked list of all osb structures. */
-        /* The Global Link List is mainted for the whole driver . */
-        spin_lock(&ocfs2_globals_lock);
-        osb->osb_id = osb_id;
-        if (osb_id < OCFS2_MAX_OSB_ID)
-                osb_id++;
-        else {
-                mlog(ML_ERROR, "Too many volumes mounted\n");
-                status = -ENOMEM;
-        }
-        spin_unlock(&ocfs2_globals_lock);
-
 bail:
         mlog_exit(status);
         return status;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd..18dbd26 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -155,7 +155,7 @@ static void *ocfs2_follow_link(struct de
         }

         status = vfs_follow_link(nd, link);
-        if (status)
+        if (status && status != -ENOENT)
                 mlog_errno(status);

 bail:
         if (page) {