GIT f8b4c6027275d9b2d5004726a6d1bb818a13ddef git://oss.oracle.com/home/sourcebo/git/ocfs2.git/#ALL

commit a37793a7e9b6db51d8a2b8a992b528c84c3c4aa8
Author: Adrian Bunk
Date: Tue May 16 17:26:41 2006 +0200

[PATCH] fs/ocfs2/dlm/: cleanups

This patch #if 0's the no longer used dlm_dump_lock_resources(). Since this
makes dlmdebug.h empty, this patch also removes this header. Additionally,
the needlessly global dlm_is_node_recovered() is made static.

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

commit fbecc5ac445e07ad8a73964cd09067c186482d60
Author: Joel Becker
Date: Wed May 10 18:28:59 2006 -0700

ocfs2: Compile-time disabling of ocfs2 debugging output.

Give gcc the chance to compile out the debug logging code in ocfs2. This
saves some size at the expense of being able to debug the code.

Signed-off-by: Joel Becker

commit 0187ddbe19a309c3d0c06beb9f82a6f57681ccbe
Author: Mark Fasheh
Date: Tue May 9 15:09:35 2006 -0700

ocfs2: warn the user on a dead timeout mismatch

Print a warning to the user when a node with a different dead count joins
the region.

Signed-off-by: Mark Fasheh

commit f5ab4d01e84da8895a5a6eca94a7b954ded2db4f
Author: Mark Fasheh
Date: Thu May 4 12:03:26 2006 -0700

ocfs2: clean up some osb fields

Get rid of osb->uuid, osb->proc_sub_dir, and osb->osb_id. Those fields were
unused, or could easily be removed. As a result, we also no longer need
MAX_OSB_ID or ocfs2_globals_lock.

Signed-off-by: Mark Fasheh

commit f336f8a8de268dd94324a9b26f9526012179b961
Author: Mark Fasheh
Date: Thu May 4 11:49:22 2006 -0700

ocfs2: fix init of uuid_net_key

ocfs2_initialize_super() should be copying from the beginning of the uuid.

Signed-off-by: Mark Fasheh

commit 798913f7d5838cc4c217062d307f82008e9d24aa
Author: Mark Fasheh
Date: Mon May 1 14:56:57 2006 -0700

ocfs2: fix compiler warnings in dlm_convert_lock_handler()

We need to cast to unsigned long long.

Signed-off-by: Mark Fasheh

commit 025159a68823d66951337aa7aad962ab9fa5a414
Author: Mark Fasheh
Date: Mon May 1 14:55:10 2006 -0700

ocfs2: dlm_print_one_mle() needs to be defined

Fixes compile breakage.

Signed-off-by: Mark Fasheh

commit ca3c734cf6452c58a65c6ec1bf39f7f0c2e5f2c9
Author: Kurt Hackel
Date: Mon May 1 14:39:57 2006 -0700

ocfs2: remove whitespace in dlmunlock.c

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit da18811a3009d5070c7cd06f57a16222d87fde66
Author: Kurt Hackel
Date: Mon May 1 14:39:29 2006 -0700

ocfs2: move dlm work to a private work queue

The work that is done can block for long periods of time and so is not
appropriate for keventd.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 594d7a4ada4594d645d8d6327a9e05cf7613f88f
Author: Kurt Hackel
Date: Mon May 1 14:34:08 2006 -0700

ocfs2: fix incorrect error returns

Use DLM_REJECTED instead of DLM_RECOVERING.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit f1f1bea93ed5eebcefb1ae03178fc4aff23dfe17
Author: Kurt Hackel
Date: Mon May 1 14:31:37 2006 -0700

ocfs2: tune down some noisy messages during dlm recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 79b4a98f94bca9b6725eb2b3f08d23dbb06a5cdc
Author: Kurt Hackel
Date: Mon May 1 14:30:39 2006 -0700

ocfs2: display message before waiting for recovery to complete

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 85a964a59e3259c2ec7d612d1401811b60b4bc5b
Author: Kurt Hackel
Date: Mon May 1 14:29:59 2006 -0700

ocfs2: mlog in dlm_convert_lock_handler() should be ML_ERROR

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 8c5ce0ac514ec2f42cb86f9476b4cf3b1f240c83
Author: Kurt Hackel
Date: Mon May 1 14:29:28 2006 -0700

ocfs2: retry operations when a lock is marked in recovery

Before checking for a nonexistent lock, make sure the lockres is not marked
RECOVERING. The caller will just retry and the state should be fixed up when
recovery completes.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit b64c8e3709f0c6e4f8bfaf4837b8270706e58ffe
Author: Kurt Hackel
Date: Mon May 1 14:27:41 2006 -0700

ocfs2: use cond_resched() in dlm_thread()

yield() does not yield. cond_resched() does.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e05483fef620ef7e4f361c5098783b79be6ae8f2
Author: Kurt Hackel
Date: Mon May 1 14:25:21 2006 -0700

ocfs2: use GFP_NOFS in some dlm operations

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 11d8bc6d89eb863d1631b7798980b9a3b9c1c817
Author: Kurt Hackel
Date: Mon May 1 13:54:07 2006 -0700

ocfs2: wait for recovery when starting lock mastery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 718fa6c87597cb9fd62821f2caa7d7553c902591
Author: Kurt Hackel
Date: Mon May 1 13:51:49 2006 -0700

ocfs2: continue recovery when a dead node is encountered

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ed3ddd0005dcd22abd3410d7ca8ef46f11eedc22
Author: Kurt Hackel
Date: Mon May 1 13:50:12 2006 -0700

ocfs2: remove unnecessary spin_unlock() in dlm_remaster_locks()

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 84b16b6dd232372448d7031df006ca1b141844a4
Author: Kurt Hackel
Date: Mon May 1 13:49:20 2006 -0700

ocfs2: dlm_remaster_locks() should never exit without completing

We cannot restart recovery. Once we begin to recover a node, keep the state
of the recovery intact and follow through, regardless of any other node
deaths that may occur.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 2424164bc2b30aaf4355058ff50fceb642977cd0
Author: Kurt Hackel
Date: Mon May 1 13:47:50 2006 -0700

ocfs2: special case recovery lock in dlmlock_remote()

If the previous master of the recovery lock dies, let calc_usage take it
down completely and let the caller completely redo the dlmlock() call.
Otherwise, there will never be an opportunity to re-master the lockres and
recovery won't be able to progress.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ab7812fd79614ce583ef9128ede0c0ef216f85b7
Author: Kurt Hackel
Date: Mon May 1 13:32:27 2006 -0700

ocfs2: pending mastery asserts and migrations should block each other

Use the existing structure for blocking migrations when ASTs are pending to
achieve the same result. If we can catch the assert before it goes on the
wire, just cancel it and let the migration continue.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 085dc43d403177bbfc3770f8430b76a74770febe
Author: Kurt Hackel
Date: Mon May 1 13:30:49 2006 -0700

ocfs2: temporarily disable automatic lock migration

Now we never change the owner of a lock resource until unmount or node
death. This will be re-enabled once some issues in the algorithm used have
been resolved.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 10900d704658e7a425d5d146268d646d647c78a5
Author: Kurt Hackel
Date: Mon May 1 13:27:10 2006 -0700

ocfs2: do not unconditionally purge the lockres in dlmlock_remote()

In dlmlock_remote(), do not call purge_lockres until the lock resource
actually changes. Otherwise, the mastery info on the lockres will go away
underneath the caller.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ae78f65ac10496bca64e74356a252e37debd3521
Author: Kurt Hackel
Date: Mon May 1 12:02:07 2006 -0700

ocfs2: increase backoff before waiting for recovery

When mastering non-recovery lock resources, additional time was frequently
needed to allow the disk heartbeat to catch up with the network timeout.
The recovery lock resource is time critical and avoids this path.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit bccdf7724ad3779cc80f4f24e2d12a45802d0635
Author: Kurt Hackel
Date: Mon May 1 11:53:33 2006 -0700

ocfs2: have dlm_pre_master_reco_lockres() ignore dead nodes

Recovery will spin in dlm_pre_master_reco_lockres if we do not ignore
timed-out network responses from dead nodes.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 053c7137bd408916b876dc40262aa9873380c972
Author: Kurt Hackel
Date: Mon May 1 11:51:45 2006 -0700

ocfs2: give the dlm dirty list a reference on the lockres

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 2b4af78629ec59791a70a1e17b8d52f1c6912359
Author: Kurt Hackel
Date: Mon May 1 11:49:52 2006 -0700

ocfs2: teach dlm_restart_lock_mastery() to wait on recovery

Change behavior of dlm_restart_lock_mastery() when a node goes down. Dump
all responses that have been collected and start over.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit a983e81a1a58177b965787da7b0548a9b529a354
Author: Kurt Hackel
Date: Mon May 1 11:46:59 2006 -0700

ocfs2: gracefully handle stale create_lock messages.

This is an error on the sending side, so gracefully error out on the
receiving end.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 12734481b548b670cec2352a79d27304b3dc2714
Author: Kurt Hackel
Date: Mon May 1 11:32:14 2006 -0700

ocfs2: update lvb immediately during recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e684426d7de65442a80f0f1b4b86237d90a15bea
Author: Kurt Hackel
Date: Mon May 1 11:22:06 2006 -0700

ocfs2: do not send master requests to localhost

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit fc57a48b645374d1b34986f665a7a0e1047a0ba0
Author: Kurt Hackel
Date: Mon May 1 11:16:45 2006 -0700

ocfs2: purge lockres' sooner

Immediately purge a lockres that the local node is not the master of.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 6fa15be8b2724953217dafa74e7b0af289288abc
Author: Kurt Hackel
Date: Mon May 1 11:15:04 2006 -0700

ocfs2: dump mismatching migrated lvbs before BUG()

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 7f65283718d0efe398b6b56c149873099b8d589d
Author: Kurt Hackel
Date: Mon May 1 11:11:13 2006 -0700

ocfs2: make dlm recovery finalization 2 stage

Makes it easier for the recovery process to deal with node death.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 32849e41d59ab3426d11d036472676e9676f58a9
Author: Kurt Hackel
Date: Mon May 1 10:57:51 2006 -0700

ocfs2: dlm recovery / lockres reference count fix

Take a reference on lockres structures while they are on the recovery list.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit d3421c18b9bfe64b323b91ff7faf440dcce5170b
Author: Kurt Hackel
Date: Thu Apr 27 19:26:15 2006 -0700

ocfs2: better error handling during assert master message

handle errors during lock assert master by either killing self or other node

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 5161fc73aec43043a60909f5c146e09653b6b019
Author: Kurt Hackel
Date: Thu Apr 27 19:24:21 2006 -0700

ocfs2: dump lockres info before we BUG() on a bad reference

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ce7e851f46b61f6ff7780e47aa70ec94d04518dd
Author: Mark Fasheh
Date: Thu Apr 27 19:07:45 2006 -0700

ocfs2: do LVB puts in place

Don't wait until the AST will be fired to do the LVB copy into the lock
resource.

Signed-off-by: Mark Fasheh

commit 12c11f5ba8fc4ddab35fc433cb970e99d5ab13dc
Author: Kurt Hackel
Date: Thu Apr 27 19:04:49 2006 -0700

ocfs2: mle ref count debugging

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 268cff398ed4f78bdd8a1942eec689243cbcc347
Author: Kurt Hackel
Date: Thu Apr 27 19:03:18 2006 -0700

ocfs2: allow for an assert message during lock mastery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 5c82a1ec37c87a96b755992af8a456100a687079
Author: Kurt Hackel
Date: Thu Apr 27 19:01:35 2006 -0700

ocfs2: take mle reference during migration

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9c6cfc037f723df1640a83028edc741945059257
Author: Kurt Hackel
Date: Thu Apr 27 19:00:26 2006 -0700

ocfs2: properly initialize the mle structure

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit da9f7da4a7cc0a3566af2b06e36b9a173a182c33
Author: Kurt Hackel
Date: Thu Apr 27 18:53:04 2006 -0700

ocfs2: detach mle from heartbeat events

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit ca117fc061b47a6eef8b5d24fdbf56f274fd28cd
Author: Kurt Hackel
Date: Thu Apr 27 18:51:26 2006 -0700

ocfs2: mle ref counting fixes

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit e57ae48b0f9e2caa0767f5ae02903ef4ff756aa7
Author: Kurt Hackel
Date: Thu Apr 27 18:47:41 2006 -0700

ocfs2: better mle debugging

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 783a7bcc01729fec54ca0d2c3739e0d7e66024c6
Author: Kurt Hackel
Date: Thu Apr 27 18:08:51 2006 -0700

ocfs2: clean up recovery related messages

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 290973d09aaa2df9d75d98f9907dafba37a4d485
Author: Kurt Hackel
Date: Thu Apr 27 18:06:58 2006 -0700

ocfs2: handle network errors during recovery

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9ff759e38bb8cf86d1dbcb88251b6c10a01ec46d
Author: Kurt Hackel
Date: Thu Apr 27 18:05:41 2006 -0700

ocfs2: only recover one dead node at a time

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit b3957291d6731fd21e4d19ee18e659a84fcc1f25
Author: Kurt Hackel
Date: Thu Apr 27 18:03:49 2006 -0700

ocfs2: Better tracking for recovery state changes

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 409ffa328ba577dd2e5c25809e1c1c4a36172b5d
Author: Kurt Hackel
Date: Thu Apr 27 18:02:10 2006 -0700

ocfs2: Fix empty lvb check

The check for an empty lvb should check the entire buffer not just the
first byte.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit c2fe8996fec1c6d127f687b44ffe63cfbe68ee4e
Author: Kurt Hackel
Date: Thu Apr 27 18:00:21 2006 -0700

ocfs2: fix inverted logic in dlm_is_node_dead

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 8915f9363b2b059094ea524aab437ace03bb38bc
Author: Kurt Hackel
Date: Thu Apr 27 17:59:46 2006 -0700

ocfs2: recheck lockres master before sending an unlock request.

Recovery may have happened and it may now be mastered locally.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit 9f190101bba0242714fb1e21bc301cc19eec0a28
Author: Kurt Hackel
Date: Thu Apr 27 17:58:23 2006 -0700

ocfs2: add a small delay after a failed migration

Otherwise we risk starving other threads.

Signed-off-by: Kurt Hackel
Signed-off-by: Mark Fasheh

commit f9aa7e3eed61202e872ea7b95110868eb92106c2
Author: Mark Fasheh
Date: Thu Apr 27 17:53:22 2006 -0700

ocfs2: silence a debug print

Signed-off-by: Mark Fasheh

commit a419d77b9420632003c98d57cc903958b1e816c1
Author: Mark Fasheh
Date: Thu Mar 23 11:23:29 2006 -0800

ocfs2: silence a compile warning in dlm_alloc_pagevec()

Reported by Andrew Morton.

Signed-off-by: Mark Fasheh

commit f879afc0496443c9feb791164a7e89358916b34c
Author: Joel Becker
Date: Thu Mar 16 17:40:37 2006 -0800

[PATCH] ocfs2: Alloc at least a page for the DLM hash

The OCFS2 DLM allocates a number of pages for a hash to lookup locks. There
was a bug where a PAGE_SIZE bigger than the hash size (eg, 64K pages) would
result in zero pages allocated.

Signed-off-by: Joel Becker
Signed-off-by: Mark Fasheh

commit 1af8e8ce644db841c587e61c80c928b20bac0946
Author: Daniel Phillips
Date: Fri Mar 10 18:08:16 2006 -0800

ocfs2: allocate lockres hash pages in an array

This allows us to have a hash table greater than a single page which
greatly improves dlm performance on some tests.

Signed-off-by: Daniel Phillips
Signed-off-by: Mark Fasheh

commit 22c3f6e4f595cee6146b33b3d244f3e13af14937
Author: Mark Fasheh
Date: Fri Mar 10 13:44:00 2006 -0800

ocfs2: inline dlm_lockres_get()

It's called on every lookup so this might help performance a bit.

Signed-off-by: Mark Fasheh

commit cb6adc1431f7e883f4191bed0a6c5861ada41147
Author: Daniel Phillips
Date: Fri Mar 10 13:31:47 2006 -0800

[PATCH] Clean up ocfs2 hash probe and make it faster

Signed-Off-By: Daniel Phillips
Signed-off-by: Mark Fasheh

commit be82847bb31cf9f82d82dc909358d6fa67004b07
Author: Mark Fasheh
Date: Thu Mar 9 17:55:56 2006 -0800

ocfs2: calculate lockid hash values outside of the spinlock

Fixes a performance bug - pointed out by Andrew.

Signed-off-by: Mark Fasheh

commit 7b8cb1bbd31d413f977b6c14e55c197d2893712b
Author: Mark Fasheh
Date: Mon Mar 6 15:36:17 2006 -0800

ocfs2: move lockres qstr next to hlist_node structure

Gains us a bit of performance on loads which heavily hit the lockres hash.
Patch suggested by Daniel Phillips.

Signed-off-by: Mark Fasheh

commit f64493eb2ecfc65853552b07d268aed33d95d7b8
Author: Zach Brown
Date: Thu Apr 27 17:25:32 2006 -0700

ocfs2: replace idr with u32 in tcp

Signed-off-by: Zach Brown
Signed-off-by: Mark Fasheh

commit 10cf7554f4e90aead497e70f460de8ce9024f9c1
Author: Sunil Mushran
Date: Thu Apr 27 16:44:13 2006 -0700

ocfs2: silence ENOENT during lookup of broken links

Signed-off-by: Sunil Mushran
Signed-off-by: Mark Fasheh

commit 23413eef39731e619a9184e731d5d66b5d3199a8
Author: Sunil Mushran
Date: Thu Apr 27 16:41:31 2006 -0700

ocfs2: Cleanup message prints

Signed-off-by: Sunil Mushran
Signed-off-by: Mark Fasheh

commit e8613372ab22cb8f1fbcbfe2d61e4bb8bbac28a5
Author: Joel Becker
Date: Thu Apr 27 16:36:14 2006 -0700

ocfs2: silence -EEXIST from ocfs2_extent_map_insert/lookup

Signed-off-by: Joel Becker
Signed-off-by: Mark Fasheh

commit 0e6dd31c00fe0ab96feeb9c0cbae95660d545dc3
Author: Joel Becker
Date: Wed Apr 12 18:34:43 2006 -0700

configfs: Release memory in configfs_example.

The configfs_example module was missing a ->release().

Signed-off-by: Joel Becker

commit 5898445c0ecf79b25de2da4e41cd1ebaa7b1bcab
Author: Joel Becker
Date: Fri Mar 10 11:42:30 2006 -0800

configfs: Clear up a few extra spaces where there should be TABs.

Signed-off-by: Joel Becker

commit 10e5be93b9410d044de64850085f2d72e4d2ee85
Author: Adrian Bunk
Date: Fri Mar 31 16:53:55 2006 +0200

[PATCH] fs/ocfs2/dlm/dlmrecovery.c: make dlm_lockres_master_requery() static

dlm_lockres_master_requery() became global without any external usage.

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

commit 765f81792db9f8a9542478337e54abfc9df1e685
Author: Adrian Bunk
Date: Sun Mar 26 14:25:52 2006 +0200

ocfs2: OCFS2_FS must depend on SYSFS

Signed-off-by: Adrian Bunk
Signed-off-by: Mark Fasheh

---

diff --git a/Documentation/filesystems/configfs/configfs_example.c b/Documentation/filesystems/configfs/configfs_example.c index 3d4713a..2d6a14a 100644 --- a/Documentation/filesystems/configfs/configfs_example.c +++ b/Documentation/filesystems/configfs/configfs_example.c @@ -264,6 +264,15 @@ static struct config_item_type simple_ch }; +struct simple_children { + struct config_group group; +}; + +static inline struct simple_children *to_simple_children(struct config_item *item) +{ + return item ? container_of(to_config_group(item), struct simple_children, group) : NULL; +} + static struct config_item *simple_children_make_item(struct config_group *group, const char *name) { struct simple_child *simple_child; @@ -304,7 +313,13 @@ static ssize_t simple_children_attr_show "items have only one attribute that is readable and writeable.\n"); } +static void simple_children_release(struct config_item *item) +{ + kfree(to_simple_children(item)); +} + static struct configfs_item_operations simple_children_item_ops = { + .release = simple_children_release, .show_attribute = simple_children_attr_show, }; @@ -345,10 +360,6 @@ static struct configfs_subsystem simple_ * children of its own. 
*/ -struct simple_children { - struct config_group group; -}; - static struct config_group *group_children_make_group(struct config_group *group, const char *name) { struct simple_children *simple_children; diff --git a/fs/Kconfig b/fs/Kconfig index f9b5842..9b777b8 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -326,7 +326,7 @@ source "fs/xfs/Kconfig" config OCFS2_FS tristate "OCFS2 file system support (EXPERIMENTAL)" - depends on NET && EXPERIMENTAL + depends on NET && SYSFS && EXPERIMENTAL select CONFIGFS_FS select JBD select CRC32 @@ -356,6 +356,16 @@ config OCFS2_FS - POSIX ACLs - readpages / writepages (not user visible) +config OCFS2_DEBUG_MASKLOG + bool "OCFS2 logging support" + depends on OCFS2_FS + default y + help + The ocfs2 filesystem has an extensive logging system. The system + allows selection of events to log via files in /sys/o2cb/logmask/. + This option will enlarge your kernel, but it allows debugging of + ocfs2 filesystem issues. + config MINIX_FS tristate "Minix fs support" help diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f95218..880c9ca 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -211,7 +211,7 @@ static void remove_dir(struct dentry * d struct configfs_dirent * sd; sd = d->d_fsdata; - list_del_init(&sd->s_sibling); + list_del_init(&sd->s_sibling); configfs_put(sd); if (d->d_inode) simple_rmdir(parent->d_inode,d); @@ -330,7 +330,7 @@ static int configfs_detach_prep(struct d ret = configfs_detach_prep(sd->s_dentry); if (!ret) - continue; + continue; } else ret = -ENOTEMPTY; @@ -931,7 +931,7 @@ int configfs_rename_dir(struct config_it new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + if (!new_dentry->d_inode) { error = config_item_set_name(item, "%s", new_name); if (!error) { d_add(new_dentry, NULL); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index e5512e2..fb65e08 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -66,7 +66,7 @@ static void fill_item_path(struct config } static int create_link(struct config_item *parent_item, - struct config_item *item, + struct config_item *item, struct dentry *dentry) { struct configfs_dirent *target_sd = item->ci_dentry->d_fsdata; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21f38ac..d508a60 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -517,6 +517,7 @@ static inline void o2hb_prepare_block(st hb_block->hb_seq = cpu_to_le64(cputime); hb_block->hb_node = node_num; hb_block->hb_generation = cpu_to_le64(generation); + hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); /* This step must always happen last! */ hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, @@ -645,6 +646,8 @@ static int o2hb_check_slot(struct o2hb_r struct o2nm_node *node; struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; u64 cputime; + unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; + unsigned int slot_dead_ms; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -733,6 +736,23 @@ fire_callbacks: &o2hb_live_slots[slot->ds_node_num]); slot->ds_equal_samples = 0; + + /* We want to be sure that all nodes agree on the + * number of milliseconds before a node will be + * considered dead. The self-fencing timeout is + * computed from this value, and a discrepancy might + * result in heartbeat calling a node dead when it + * hasn't self-fenced yet. 
*/ + slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); + if (slot_dead_ms && slot_dead_ms != dead_ms) { + /* TODO: Perhaps we can fail the region here. */ + mlog(ML_ERROR, "Node %d on device %s has a dead count " + "of %u ms, but our count is %u ms.\n" + "Please double check your configuration values " + "for 'O2CB_HEARTBEAT_THRESHOLD'\n", + slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, + dead_ms); + } goto out; } diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 73edad7..a42628b 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -123,6 +123,17 @@ #ifndef MLOG_MASK_PREFIX #define MLOG_MASK_PREFIX 0 #endif +/* + * When logging is disabled, force the bit test to 0 for anything other + * than errors and notices, allowing gcc to remove the code completely. + * When enabled, allow all masks. + */ +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) +#define ML_ALLOWED_BITS ~0 +#else +#define ML_ALLOWED_BITS (ML_ERROR|ML_NOTICE) +#endif + #define MLOG_MAX_BITS 64 struct mlog_bits { @@ -187,7 +198,8 @@ #define __mlog_printk(level, fmt, args.. #define mlog(mask, fmt, args...) do { \ u64 __m = MLOG_MASK_PREFIX | (mask); \ - if (__mlog_test_u64(__m, mlog_and_bits) && \ + if ((__m & ML_ALLOWED_BITS) && \ + __mlog_test_u64(__m, mlog_and_bits) && \ !__mlog_test_u64(__m, mlog_not_bits)) { \ if (__m & ML_ERROR) \ __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ @@ -204,6 +216,7 @@ #define mlog_errno(st) do { \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ } while (0) +#if defined(CONFIG_OCFS2_DEBUG_MASKLOG) #define mlog_entry(fmt, args...) do { \ mlog(ML_ENTRY, "ENTRY:" fmt , ##args); \ } while (0) @@ -247,6 +260,13 @@ #define mlog_exit_ptr(ptr) do { \ #define mlog_exit_void() do { \ mlog(ML_EXIT, "EXIT\n"); \ } while (0) +#else +#define mlog_entry(...) do { } while (0) +#define mlog_entry_void(...) do { } while (0) +#define mlog_exit(...) do { } while (0) +#define mlog_exit_ptr(...) do { } while (0) +#define mlog_exit_void(...) do { } while (0) +#endif /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */ #define mlog_bug_on_msg(cond, fmt, args...) 
do { \ if (cond) { \ diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 9409606..3f4151d 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h @@ -32,6 +32,7 @@ struct o2hb_disk_heartbeat_block { __u8 hb_pad1[3]; __le32 hb_cksum; __le64 hb_generation; + __le32 hb_dead_ms; }; #endif /* _OCFS2_HEARTBEAT_H */ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f60cc0..c6185be 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include @@ -172,31 +171,18 @@ static u8 o2net_num_from_nn(struct o2net } /* ------------------------------------------------------------ */ - -static int o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) +static void o2net_prep_nsw(struct o2net_node *nn, struct o2net_status_wait *nsw) { - int ret = 0; + spin_lock(&nn->nn_lock); - do { - if (!idr_pre_get(&nn->nn_status_idr, GFP_ATOMIC)) { - ret = -EAGAIN; - break; - } - spin_lock(&nn->nn_lock); - ret = idr_get_new(&nn->nn_status_idr, nsw, &nsw->ns_id); - if (ret == 0) - list_add_tail(&nsw->ns_node_item, - &nn->nn_status_list); - spin_unlock(&nn->nn_lock); - } while (ret == -EAGAIN); + nsw->ns_id = nn->nn_status_next_id++; + list_add_tail(&nsw->ns_node_item, &nn->nn_status_list); - if (ret == 0) { - init_waitqueue_head(&nsw->ns_wq); - nsw->ns_sys_status = O2NET_ERR_NONE; - nsw->ns_status = 0; - } + spin_unlock(&nn->nn_lock); - return ret; + init_waitqueue_head(&nsw->ns_wq); + nsw->ns_sys_status = O2NET_ERR_NONE; + nsw->ns_status = 0; } static void o2net_complete_nsw_locked(struct o2net_node *nn, @@ -210,31 +196,43 @@ static void o2net_complete_nsw_locked(st list_del_init(&nsw->ns_node_item); nsw->ns_sys_status = sys_status; nsw->ns_status = status; - idr_remove(&nn->nn_status_idr, nsw->ns_id); wake_up(&nsw->ns_wq); } } -static void o2net_complete_nsw(struct o2net_node *nn, - struct o2net_status_wait *nsw, - u64 id, enum o2net_system_error sys_status, - s32 status) +static void o2net_complete_nsw_id(struct o2net_node *nn, u32 id, + enum o2net_system_error sys_status, + s32 status) { + struct list_head *iter, *tmp; + struct o2net_status_wait *nsw; + int killed = 0; + spin_lock(&nn->nn_lock); - if (nsw == NULL) { - if (id > INT_MAX) - goto out; - nsw = idr_find(&nn->nn_status_idr, id); - if (nsw == NULL) - goto out; + list_for_each_safe(iter, tmp, &nn->nn_status_list) { + nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + if (id == nsw->ns_id) { + o2net_complete_nsw_locked(nn, nsw, sys_status, status); + killed = 1; + break; + } } - o2net_complete_nsw_locked(nn, nsw, sys_status, status); + spin_unlock(&nn->nn_lock); -out: + if (!killed) + mlog(ML_ERROR, "didn't find nsw for id %u\n", id); +} + +static void o2net_complete_nsw(struct o2net_node *nn, + struct o2net_status_wait *nsw, + enum o2net_system_error sys_status, + s32 status) +{ + spin_lock(&nn->nn_lock); + o2net_complete_nsw_locked(nn, nsw, sys_status, status); spin_unlock(&nn->nn_lock); - return; } static void o2net_complete_nodes_nsw(struct o2net_node *nn) @@ -396,8 +394,8 @@ static void o2net_set_nn_state(struct o2 } if (was_valid && !valid) { - mlog(ML_NOTICE, "no longer connected to " SC_NODEF_FMT "\n", - SC_NODEF_ARGS(old_sc)); + printk(KERN_INFO "o2net: no longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -409,10 +407,10 @@ static void o2net_set_nn_state(struct o2 * the only way to start connecting again is to 
down * heartbeat and bring it back up. */ cancel_delayed_work(&nn->nn_connect_expired); - mlog(ML_NOTICE, "%s " SC_NODEF_FMT "\n", - o2nm_this_node() > sc->sc_node->nd_num ? - "connected to" : "accepted connection from", - SC_NODEF_ARGS(sc)); + printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", + o2nm_this_node() > sc->sc_node->nd_num ? + "connected to" : "accepted connection from", + SC_NODEF_ARGS(sc)); } /* trigger the connecting worker func as long as we're not valid, @@ -928,9 +926,7 @@ int o2net_send_message_vec(u32 msg_type, vec[0].iov_base = msg; memcpy(&vec[1], caller_vec, caller_veclen * sizeof(struct kvec)); - ret = o2net_prep_nsw(nn, &nsw); - if (ret) - goto out; + o2net_prep_nsw(nn, &nsw); msg->msg_num = cpu_to_be32(nsw.ns_id); @@ -963,7 +959,7 @@ out: kfree(vec); if (msg) kfree(msg); - o2net_complete_nsw(nn, &nsw, 0, 0, 0); + o2net_complete_nsw(nn, &nsw, 0, 0); return ret; } EXPORT_SYMBOL_GPL(o2net_send_message_vec); @@ -1019,10 +1015,9 @@ static int o2net_process_message(struct switch(be16_to_cpu(hdr->magic)) { case O2NET_MSG_STATUS_MAGIC: /* special type for returning message status */ - o2net_complete_nsw(nn, NULL, - be32_to_cpu(hdr->msg_num), - be32_to_cpu(hdr->sys_status), - be32_to_cpu(hdr->status)); + o2net_complete_nsw_id(nn, be32_to_cpu(hdr->msg_num), + be32_to_cpu(hdr->sys_status), + be32_to_cpu(hdr->status)); goto out; case O2NET_MSG_KEEP_REQ_MAGIC: o2net_sendpage(sc, o2net_keep_resp, @@ -1280,7 +1275,7 @@ static void o2net_idle_timer(unsigned lo do_gettimeofday(&now); - mlog(ML_NOTICE, "connection to " SC_NODEF_FMT " has been idle for 10 " + printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for 10 " "seconds, shutting it down.\n", SC_NODEF_ARGS(sc)); mlog(ML_NOTICE, "here are some times that might help debug the " "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " @@ -1825,7 +1820,6 @@ int o2net_init(void) /* until we see hb from a node we'll return einval */ nn->nn_persistent_error = -ENOTCONN; init_waitqueue_head(&nn->nn_sc_wq); - idr_init(&nn->nn_status_idr); INIT_LIST_HEAD(&nn->nn_status_list); } diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index ff9e2e2..fa2c705 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -70,7 +70,7 @@ struct o2net_node { * or fails or when an accepted socket is attached. 
*/ wait_queue_head_t nn_sc_wq; - struct idr nn_status_idr; + u32 nn_status_next_id; struct list_head nn_status_list; /* connects are attempted from when heartbeat comes up until either hb @@ -166,7 +166,7 @@ enum o2net_system_error { struct o2net_status_wait { enum o2net_system_error ns_sys_status; s32 ns_status; - int ns_id; + u32 ns_id; wait_queue_head_t ns_wq; struct list_head ns_node_item; }; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ae47f45..3d494d1 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -213,11 +213,9 @@ int ocfs2_find_files_on_disk(const char struct ocfs2_dir_entry **dirent) { int status = -ENOENT; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - mlog_entry("(osb=%p, parent=%llu, name='%.*s', blkno=%p, inode=%p)\n", - osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, - namelen, name, blkno, inode); + mlog_entry("(name=%.*s, blkno=%p, inode=%p, dirent_bh=%p, dirent=%p)\n", + namelen, name, blkno, inode, dirent_bh, dirent); *dirent_bh = ocfs2_find_entry(name, namelen, inode, dirent); if (!*dirent_bh || !*dirent) { diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593d..90ce915 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ct lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. 
+ */ spin_unlock(&res->spinlock); } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43d..14530ee 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_LOCK_RES_OWNER_UNKNOWN O #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(c return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. 
*/ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; irefs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -780,8 +822,6 @@ int dlm_begin_reco_handler(struct o2net_ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data); int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 nodenum, u8 *real_master); -int dlm_lockres_master_requery(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 *real_master); int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, @@ -819,6 +859,7 @@ void dlm_clean_master_list(struct dlm_ct u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228..d555dec 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -464,6 +467,12 @@ int dlm_convert_lock_handler(struct o2ne } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +482,21 @@ int dlm_convert_lock_handler(struct o2ne } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d..3f6c8d8 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ 
spin_lock(&dlm->spinlock); for (i=0; ilockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510..0000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3..3cf5ad8 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #include "dlmver.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctx assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctx } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = 
hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lo unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(con static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_c return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. 
Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -374,12 +408,13 @@ static void __dlm_print_nodes(struct dlm assert_spin_locked(&dlm->spinlock); - mlog(ML_NOTICE, "Nodes in my domain (\"%s\"):\n", dlm->name); + printk(KERN_INFO "ocfs2_dlm: Nodes in domain (\"%s\"): ", dlm->name); while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, node + 1)) < O2NM_MAX_NODES) { - mlog(ML_NOTICE, " node %d\n", node); + printk("%d ", node); } + printk("\n"); } static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data) @@ -395,7 +430,7 @@ static int dlm_exit_domain_handler(struc node = exit_msg->node_idx; - mlog(0, "Node %u leaves domain %s\n", node, dlm->name); + printk(KERN_INFO "ocfs2_dlm: Node %u leaves domain %s\n", node, dlm->name); spin_lock(&dlm->spinlock); clear_bit(node, dlm->domain_map); @@ -644,6 +679,8 @@ static int dlm_assert_joined_handler(str set_bit(assert->node_idx, dlm->domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); + printk(KERN_INFO "ocfs2_dlm: Node %u joins domain %s\n", + assert->node_idx, dlm->name); __dlm_print_nodes(dlm); /* notify anything attached to the heartbeat events */ @@ -1126,6 +1163,13 @@ static int dlm_join_domain(struct dlm_ct goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1210,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1236,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1245,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c goto leave; } - for (i=0; ilockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1276,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(c dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7e88e24..bb2c202 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode * doesn't make sense for LVB writes. 
*/ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct fi else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct f else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea283..a9fc9a6 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(st struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(st res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -244,7 +262,8 @@ static enum dlm_status dlmlock_remote(st } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +300,14 @@ static enum dlm_status dlm_send_remote_l if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. 
that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +409,13 @@ struct dlm_lock * dlm_new_lock(int type, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +456,16 @@ int dlm_create_lock_handler(struct o2net if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +701,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c..1b8346d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "cluster/tcp.h" #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct d return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; imaybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_eve spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_mas assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. 
*/ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_mast memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct k mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct k kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockre { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_ int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. 
*/ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(stru set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(stru /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. 
waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2 struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2 name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2 way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, "this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, 
DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2n struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2n name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2n if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2n * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. 
(%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2n goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. 
*/ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dl int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dl list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(str } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(str nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(s BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. 
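The refcount sanity check added to dlm_assert_master_handler() above reduces to a simple minimum: one base reference, plus one if the caller passed in an extra ref, plus one if the mle is currently pinned via inuse. A compact restatement of the same arithmetic; the function name is mine, only the thresholds come from the hunk.

#include <stdio.h>

/* minimum refcount the handler expects before it drops its references */
static int min_expected_refs(int extra_ref, int inuse)
{
	return 1 + (extra_ref ? 1 : 0) + (inuse > 0 ? 1 : 0);
}

int main(void)
{
	int extra_ref, inuse;

	for (inuse = 0; inuse <= 1; inuse++)
		for (extra_ref = 0; extra_ref <= 1; extra_ref++)
			printf("inuse=%d extra_ref=%d -> refs must be >= %d\n",
			       inuse, extra_ref,
			       min_expected_refs(extra_ref, inuse));
	return 0;
}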
*/ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ct struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. 
*/ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbab..044a964 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -95,6 +95,9 @@ static void dlm_reco_unlock_ast(void *as static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data); static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master); static u64 dlm_get_next_mig_cookie(void); @@ -115,12 +118,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +160,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +257,52 @@ void dlm_complete_recovery_thread(struct * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, 
state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +350,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dl { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +385,24 @@ int dlm_wait_for_node_death(struct dlm_c return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. 
* the dlm recovery thread will set this state when it begins @@ -308,6 +421,13 @@ static int dlm_in_recovery(struct dlm_ct void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +461,7 @@ static int dlm_do_recovery(struct dlm_ct mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +470,14 @@ static int dlm_do_recovery(struct dlm_ct bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +486,8 @@ static int dlm_do_recovery(struct dlm_ct /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +510,8 @@ static int dlm_do_recovery(struct dlm_ct } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +523,13 @@ static int dlm_do_recovery(struct dlm_ct return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +556,16 @@ static int dlm_remaster_locks(struct dlm int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. 
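The dlmrecovery.c hunks above fix dlm_is_node_dead() (the negation was missing) and add dlm_is_node_recovered(); both are just "is the bit clear" tests, on the domain map and the recovery map respectively. A toy single-word model of the two predicates, with my own names and a plain unsigned long in place of the kernel bitmaps:

#include <stdio.h>

static int test_bit(int nr, unsigned long map)
{
	return (map >> nr) & 1UL;
}

/* dead once the node has dropped out of the domain map */
static int is_node_dead(unsigned long domain_map, int node)
{
	return !test_bit(node, domain_map);
}

/* recovered once its recovery map bit has been cleared again */
static int is_node_recovered(unsigned long recovery_map, int node)
{
	return !test_bit(node, recovery_map);
}

int main(void)
{
	unsigned long domain_map = 0, recovery_map = 0;

	domain_map |= 1UL << 1;      /* node 1 is up and in the domain */
	recovery_map |= 1UL << 4;    /* node 4 died and still needs recovery */

	printf("node 1 dead?      %d\n", is_node_dead(domain_map, 1));        /* 0 */
	printf("node 4 dead?      %d\n", is_node_dead(domain_map, 4));        /* 1 */
	printf("node 4 recovered? %d\n", is_node_recovered(recovery_map, 4)); /* 0 */
	return 0;
}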
*/ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +582,36 @@ static int dlm_remaster_locks(struct dlm continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +623,9 @@ static int dlm_remaster_locks(struct dlm mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +669,26 @@ static int dlm_remaster_locks(struct dlm BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
+ "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +718,7 @@ static int dlm_remaster_locks(struct dlm jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +731,6 @@ static int dlm_remaster_locks(struct dlm } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +756,7 @@ static int dlm_init_recovery_area(struct } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +830,25 @@ int dlm_request_all_locks_handler(struct if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +863,7 @@ int dlm_request_all_locks_handler(struct spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +878,34 @@ static void dlm_request_all_locks_worker struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. 
*/ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +916,20 @@ static void dlm_request_all_locks_worker dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +937,15 @@ static void dlm_request_all_locks_worker list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +964,14 @@ static int dlm_send_all_done_msg(struct ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +991,11 @@ int dlm_reco_data_done_handler(struct o2 mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -1023,8 +1196,9 @@ static int dlm_add_lock_to_array(struct ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1257,25 @@ int dlm_send_one_lockres(struct dlm_ctxt * 
we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1323,8 @@ int dlm_mig_lockres_handler(struct o2net mlog(0, "all done flag. all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1415,7 @@ int dlm_mig_lockres_handler(struct o2net spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1312,8 +1489,9 @@ leave: -int dlm_lockres_master_requery(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res, u8 *real_master) +static int dlm_lockres_master_requery(struct dlm_ctxt *dlm, + struct dlm_lock_resource *res, + u8 *real_master) { struct dlm_node_iter iter; int nodenum; @@ -1406,6 +1584,7 @@ int dlm_master_requery_handler(struct o2 struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1594,10 @@ int dlm_master_requery_handler(struct o2 return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1664,7 @@ static int dlm_process_recovery_data(str struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1553,28 +1734,48 @@ static int dlm_process_recovery_data(str } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. 
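The recovery path above stops treating lvb[0] != 0 as "LVB present" and uses dlm_lvb_is_empty() instead, so an LVB whose first byte happens to be zero is still compared and copied. The helper presumably looks roughly like the sketch below; the 64-byte DLM_LVB_LEN is my assumption from the dlm headers, and the body is a sketch rather than code copied from the tree.

#include <stdio.h>
#include <string.h>

#define DLM_LVB_LEN 64

/* an LVB is "empty" only if every byte is zero, not just the first one */
static int dlm_lvb_is_empty(const char *lvb)
{
	int i;

	for (i = 0; i < DLM_LVB_LEN; i++)
		if (lvb[i])
			return 0;
	return 1;
}

int main(void)
{
	char lvb[DLM_LVB_LEN];

	memset(lvb, 0, sizeof(lvb));
	printf("empty: %d\n", dlm_lvb_is_empty(lvb));  /* 1 */
	lvb[17] = 0x2a;                                /* lvb[0] is still zero */
	printf("empty: %d\n", dlm_lvb_is_empty(lvb));  /* 0 */
	return 0;
}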
*/ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1793,33 @@ static int dlm_process_recovery_data(str * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1843,14 @@ void dlm_move_lockres_to_recovery_list(s struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1939,11 @@ static void dlm_finish_local_lockres_rec spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1952,7 @@ * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1976,13 @@ static void dlm_finish_local_lockres_rec dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if
(!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2119,7 @@ static void dlm_do_local_recovery_cleanu * need to be fired as a result. */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2159,20 @@ static void __dlm_hb_node_down(struct dl { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2336,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2374,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2409,7 @@ static int dlm_send_begin_reco_message(s mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2467,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2490,20 @@ int dlm_begin_reco_handler(struct o2net_ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2525,8 @@ int dlm_begin_reco_handler(struct o2net_ "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but 
this " "node has not yet. marking %u as dead\n", @@ -2272,10 +2545,16 @@ int dlm_begin_reco_handler(struct o2net_ spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2562,31 @@ static int dlm_send_finalize_reco_messag struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2594,17 @@ static int dlm_send_finalize_reco_messag mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2613,19 @@ int dlm_finalize_reco_handler(struct o2n { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2641,41 @@ int dlm_finalize_reco_handler(struct o2n BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - 
dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14..4641d06 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include #include #include #include +#include #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. 
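The dlmthread.c changes above amount to an interim purge policy: an unused lockres that this node masters is kept and pulled off the purge list, an unused lockres mastered elsewhere is unhashed right away via dlm_purge_lockres_now(), and anything still in use stays put. The decision, restated as a tiny sketch; the enum and function names below are mine, only the policy comes from the hunks.

#include <stdio.h>

enum purge_action { KEEP_IT, PURGE_NOW };

/*
 * Interim policy from the hunks above: keep anything that is still in use
 * or that this node masters; immediately unhash an unused resource that
 * some other node masters, since its owner cannot be tracked reliably.
 */
static enum purge_action calc_usage(int unused, int locally_mastered)
{
	if (!unused || locally_mastered)
		return KEEP_IT;
	return PURGE_NOW;
}

int main(void)
{
	printf("unused, mastered here     -> %s\n",
	       calc_usage(1, 1) == KEEP_IT ? "keep" : "purge now");
	printf("unused, mastered remotely -> %s\n",
	       calc_usage(1, 0) == KEEP_IT ? "keep" : "purge now");
	printf("still in use              -> %s\n",
	       calc_usage(0, 1) == KEEP_IT ? "keep" : "purge now");
	return 0;
}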
*/ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -422,6 +471,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +657,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +695,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +711,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +733,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a275..1fe1112 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -319,6 +319,16 @@ static enum dlm_status dlm_send_remote_u mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a275..1fe1112 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -319,6 +319,16 @@ static enum dlm_status dlm_send_remote_u
         mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);

+        if (owner == dlm->node_num) {
+                /* ended up trying to contact ourself. this means
+                 * that the lockres had been remote but became local
+                 * via a migration. just retry it, now as local */
+                mlog(0, "%s:%.*s: this node became the master due to a "
+                     "migration, re-evaluate now\n", dlm->name,
+                     res->lockname.len, res->lockname.name);
+                return DLM_FORWARD;
+        }
+
         memset(&unlock, 0, sizeof(unlock));
         unlock.node_idx = dlm->node_num;
         unlock.flags = cpu_to_be32(flags);
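
The dlmunlock.c hunk above covers the window where a lockres was looked up as remote but was migrated to this node before the unlock message went out; rather than sending a message to itself, the sender bails out with DLM_FORWARD so the caller re-drives the unlock down the local path. A rough user-space sketch of that retry-as-local idea (illustrative only; the status values and helpers are invented, not the real DLM API):

#include <stdio.h>

enum status { ST_OK, ST_FORWARD };      /* ST_FORWARD plays the role of DLM_FORWARD */

static const int self_node = 0;

/* stand-in for the remote-unlock sender: never message ourselves */
static enum status send_remote_unlock(int owner)
{
        if (owner == self_node)
                return ST_FORWARD;      /* became local via migration; caller retries locally */

        printf("sending unlock request to node %d\n", owner);
        return ST_OK;
}

static void do_unlock(int owner)
{
        if (send_remote_unlock(owner) == ST_FORWARD)
                printf("re-evaluating as a local unlock on node %d\n", self_node);
}

int main(void)
{
        do_unlock(3);   /* still remote: the request goes over the wire */
        do_unlock(0);   /* owner migrated to us between lookup and send */
        return 0;
}
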
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5..e641b08 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_conte
         u32 dlm_key;
         char *domain;

-        domain = kmalloc(name->len + 1, GFP_KERNEL);
+        domain = kmalloc(name->len + 1, GFP_NOFS);
         if (!domain) {
                 mlog_errno(-ENOMEM);
                 return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64cd528..6b8e9c5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2071,8 +2071,7 @@ int ocfs2_dlm_init(struct ocfs2_super *o
         }

         /* launch vote thread */
-        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote-%d",
-                                     osb->osb_id);
+        osb->vote_task = kthread_run(ocfs2_vote_thread, osb, "ocfs2vote");
         if (IS_ERR(osb->vote_task)) {
                 status = PTR_ERR(osb->vote_task);
                 osb->vote_task = NULL;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 1a5c690..fcd4475 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -298,7 +298,7 @@ static int ocfs2_extent_map_find_leaf(st
                 ret = ocfs2_extent_map_insert(inode, rec,
                                               le16_to_cpu(el->l_tree_depth));
-                if (ret) {
+                if (ret && (ret != -EEXIST)) {
                         mlog_errno(ret);
                         goto out_free;
                 }
@@ -427,6 +427,11 @@ static int ocfs2_extent_map_insert_entry
 /*
  * Simple rule: on any return code other than -EAGAIN, anything left
  * in the insert_context will be freed.
+ *
+ * Simple rule #2: A return code of -EEXIST from this function or
+ * its calls to ocfs2_extent_map_insert_entry() signifies that another
+ * thread beat us to the insert. It is not an actual error, but it
+ * tells the caller we have no more work to do.
  */
 static int ocfs2_extent_map_try_insert(struct inode *inode,
                                        struct ocfs2_extent_rec *rec,
@@ -448,22 +453,32 @@ static int ocfs2_extent_map_try_insert(s
                 goto out_unlock;
         }

+        /* Since insert_entry failed, the map MUST have old_ent */
         old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-                                          le32_to_cpu(rec->e_clusters), NULL,
-                                          NULL);
+                                          le32_to_cpu(rec->e_clusters),
+                                          NULL, NULL);

         BUG_ON(!old_ent);

-        ret = -EEXIST;
-        if (old_ent->e_tree_depth < tree_depth)
+        if (old_ent->e_tree_depth < tree_depth) {
+                /* Another thread beat us to the lower tree_depth */
+                ret = -EEXIST;
                 goto out_unlock;
+        }

         if (old_ent->e_tree_depth == tree_depth) {
+                /*
+                 * Another thread beat us to this tree_depth.
+                 * Let's make sure we agree with that thread (the
+                 * extent_rec should be identical).
+                 */
                 if (!memcmp(rec, &old_ent->e_rec,
                             sizeof(struct ocfs2_extent_rec)))
                         ret = 0;
+                else
+                        /* FIXME: Should this be ESRCH/EBADR??? */
+                        ret = -EEXIST;

-                /* FIXME: Should this be ESRCH/EBADR??? */
                 goto out_unlock;
         }

@@ -599,7 +614,7 @@ static int ocfs2_extent_map_insert(struc
                                            tree_depth, &ctxt);
         } while (ret == -EAGAIN);

-        if (ret < 0)
+        if ((ret < 0) && (ret != -EEXIST))
                 mlog_errno(ret);

         if (ctxt.left_ent)
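
The extent_map.c hunks above turn -EEXIST into a benign answer: it now consistently means "another thread already inserted this mapping", so callers such as ocfs2_extent_map_find_leaf() and ocfs2_extent_map_insert() skip the error path instead of logging it. A small user-space analogue of that insert-if-absent convention (illustrative only; the toy map and helpers are invented, not the ocfs2 extent map):

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* toy single-slot "map"; the real extent map is a tree under a spinlock */
struct toy_map {
        int  occupied;
        char rec[16];
};

/* Insert-if-absent.  Returns 0 on success and -EEXIST if an identical
 * record is already present: not a failure, just nothing left to do. */
static int toy_insert(struct toy_map *map, const char *rec)
{
        if (map->occupied)
                return strcmp(map->rec, rec) ? -EINVAL : -EEXIST;

        map->occupied = 1;
        snprintf(map->rec, sizeof(map->rec), "%s", rec);
        return 0;
}

int main(void)
{
        struct toy_map map = { 0, "" };
        const char *rec = "0-16";
        int i, ret;

        for (i = 0; i < 2; i++) {       /* second pass simulates the racing thread */
                ret = toy_insert(&map, rec);
                if (ret < 0 && ret != -EEXIST)  /* mirrors the callers' new checks */
                        fprintf(stderr, "insert failed: %d\n", ret);
                else
                        printf("pass %d: ret=%d, nothing to complain about\n", i, ret);
        }
        return 0;
}

The only case still treated as a real problem is a conflicting record at the same tree depth, which the patch flags with the FIXME about ESRCH/EBADR.
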
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cf..23d2617 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -785,8 +785,7 @@ int ocfs2_journal_load(struct ocfs2_jour
         }

         /* Launch the commit thread */
-        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt-%d",
-                                       osb->osb_id);
+        osb->commit_task = kthread_run(ocfs2_commit_thread, osb, "ocfs2cmt");
         if (IS_ERR(osb->commit_task)) {
                 status = PTR_ERR(osb->commit_task);
                 osb->commit_task = NULL;
@@ -1119,7 +1118,7 @@ void ocfs2_recovery_thread(struct ocfs2_
                 goto out;

         osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
-                                                "ocfs2rec-%d", osb->osb_id);
+                                                "ocfs2rec");
         if (IS_ERR(osb->recovery_thread_task)) {
                 mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
                 osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 843cf9d..83934e3 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -46,12 +46,12 @@ static struct page *ocfs2_nopage(struct
                                  unsigned long address,
                                  int *type)
 {
-        struct inode *inode = area->vm_file->f_dentry->d_inode;
         struct page *page = NOPAGE_SIGBUS;
         sigset_t blocked, oldset;
         int ret;

-        mlog_entry("(inode %lu, address %lu)\n", inode->i_ino, address);
+        mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
+                   type);

         /* The best way to deal with signals in this path is
          * to block them upfront, rather than allowing the
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index da10930..cd4a6f2 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -184,7 +184,6 @@ struct ocfs2_journal;
 struct ocfs2_journal_handle;
 struct ocfs2_super
 {
-        u32 osb_id;             /* id used by the proc interface */
         struct task_struct *commit_task;
         struct super_block *sb;
         struct inode *root_inode;
@@ -222,13 +221,11 @@ struct ocfs2_super
         unsigned long s_mount_opt;

         u16 max_slots;
-        u16 num_nodes;
         s16 node_num;
         s16 slot_num;
         int s_sectsize_bits;
         int s_clustersize;
         int s_clustersize_bits;
-        struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/ */

         atomic_t vol_state;
         struct mutex recovery_lock;
@@ -294,7 +291,6 @@ struct ocfs2_super
 };

 #define OCFS2_SB(sb)        ((struct ocfs2_super *)(sb)->s_fs_info)
-#define OCFS2_MAX_OSB_ID             65536

 static inline int ocfs2_should_order_data(struct inode *inode)
 {
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 8716279..aa6f5aa 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -264,7 +264,7 @@ int ocfs2_find_slot(struct ocfs2_super *
         osb->slot_num = slot;
         spin_unlock(&si->si_lock);

-        mlog(ML_NOTICE, "taking node slot %d\n", osb->slot_num);
+        mlog(0, "taking node slot %d\n", osb->slot_num);

         status = ocfs2_update_disk_slots(osb, si);
         if (status < 0)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 949b3da..73c0d2b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -68,13 +68,6 @@
 #include "vote.h"
 #include "buffer_head_io.h"

-/*
- * Globals
- */
-static spinlock_t ocfs2_globals_lock = SPIN_LOCK_UNLOCKED;
-
-static u32 osb_id;             /* Keeps track of next available OSB Id */
-
 static kmem_cache_t *ocfs2_inode_cachep = NULL;

 kmem_cache_t *ocfs2_lock_cache = NULL;
@@ -642,10 +635,9 @@ static int ocfs2_fill_super(struct super
         ocfs2_complete_mount_recovery(osb);

-        printk("ocfs2: Mounting device (%u,%u) on (node %d, slot %d) with %s "
-               "data mode.\n",
-               MAJOR(sb->s_dev), MINOR(sb->s_dev), osb->node_num,
-               osb->slot_num,
+        printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %d, slot %d) "
+               "with %s data mode.\n",
+               osb->dev_str, osb->node_num, osb->slot_num,
                osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ?
                "writeback" : "ordered");
@@ -798,10 +790,6 @@ static int __init ocfs2_init(void)
                 goto leave;
         }

-        spin_lock(&ocfs2_globals_lock);
-        osb_id = 0;
-        spin_unlock(&ocfs2_globals_lock);
-
         ocfs2_debugfs_root = debugfs_create_dir("ocfs2", NULL);
         if (!ocfs2_debugfs_root) {
                 status = -EFAULT;
@@ -1018,7 +1006,7 @@ static int ocfs2_fill_local_node_info(st
                 goto bail;
         }

-        mlog(ML_NOTICE, "I am node %d\n", osb->node_num);
+        mlog(0, "I am node %d\n", osb->node_num);

         status = 0;
 bail:
@@ -1189,8 +1177,8 @@ static void ocfs2_dismount_volume(struct
         atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);

-        printk("ocfs2: Unmounting device (%u,%u) on (node %d)\n",
-               MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev), osb->node_num);
+        printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %d)\n",
+               osb->dev_str, osb->node_num);

         ocfs2_delete_osb(osb);
         kfree(osb);
@@ -1210,8 +1198,6 @@ static int ocfs2_setup_osb_uuid(struct o
         if (osb->uuid_str == NULL)
                 return -ENOMEM;

-        memcpy(osb->uuid, uuid, OCFS2_VOL_UUID_LEN);
-
         for (i = 0, ptr = osb->uuid_str; i < OCFS2_VOL_UUID_LEN; i++) {
                 /* print with null */
                 ret = snprintf(ptr, 3, "%02X", uuid[i]);
@@ -1309,13 +1295,6 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        osb->uuid = kmalloc(OCFS2_VOL_UUID_LEN, GFP_KERNEL);
-        if (!osb->uuid) {
-                mlog(ML_ERROR, "unable to alloc uuid\n");
-                status = -ENOMEM;
-                goto bail;
-        }
-
         di = (struct ocfs2_dinode *)bh->b_data;

         osb->max_slots = le16_to_cpu(di->id2.i_super.s_max_slots);
@@ -1325,7 +1304,7 @@ static int ocfs2_initialize_super(struct
                 status = -EINVAL;
                 goto bail;
         }
-        mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
+        mlog(0, "max_slots for this device: %u\n", osb->max_slots);

         init_waitqueue_head(&osb->osb_wipe_event);
         osb->osb_orphan_wipes = kcalloc(osb->max_slots,
@@ -1416,7 +1395,7 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        memcpy(&uuid_net_key, &osb->uuid[i], sizeof(osb->net_key));
+        memcpy(&uuid_net_key, di->id2.i_super.s_uuid, sizeof(uuid_net_key));

         osb->net_key = le32_to_cpu(uuid_net_key);

         strncpy(osb->vol_label, di->id2.i_super.s_label, 63);
@@ -1482,18 +1461,6 @@ static int ocfs2_initialize_super(struct
                 goto bail;
         }

-        /* Link this osb onto the global linked list of all osb structures. */
-        /* The Global Link List is mainted for the whole driver . */
-        spin_lock(&ocfs2_globals_lock);
-        osb->osb_id = osb_id;
-        if (osb_id < OCFS2_MAX_OSB_ID)
-                osb_id++;
-        else {
-                mlog(ML_ERROR, "Too many volumes mounted\n");
-                status = -ENOMEM;
-        }
-        spin_unlock(&ocfs2_globals_lock);
-
 bail:
         mlog_exit(status);
         return status;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index f6986bd..18dbd26 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -155,7 +155,7 @@ static void *ocfs2_follow_link(struct de
         }

         status = vfs_follow_link(nd, link);
-        if (status)
+        if (status && status != -ENOENT)
                 mlog_errno(status);

 bail:
         if (page) {