GIT 3f31214702b048cb55f6838c8332358643d94474 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw.git

commit 
Author: David Teigland <teigland@redhat.com>
Date:   Tue Jan 9 09:46:02 2007 -0600

    [DLM] expose dlm_config_info fields in configfs
    
    Make the dlm_config_info values readable and writeable via configfs
    entries.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 15e056fb23f500e2e4de7a23397113bd4f9d2521
Author: David Teigland <teigland@redhat.com>
Date:   Tue Jan 9 09:44:01 2007 -0600

    [DLM] add config entry to enable log_debug
    
    Add a new dlm_config_info field to enable log_debug output and change
    log_debug() to use it.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit b4135d4f278f0328ad67cf9e40e3cffd28004ba5
Author: David Teigland <teigland@redhat.com>
Date:   Tue Jan 9 09:41:48 2007 -0600

    [DLM] rename dlm_config_info fields
    
    Add a "ci_" prefix to the fields in the dlm_config_info struct so that we
    can use macros to add configfs functions to access them (in a later
    patch).  No functional changes in this patch, just naming changes.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit fb76e09a36619e46b9298e93be564328ea3eeb0e
Author: David Teigland <teigland@redhat.com>
Date:   Tue Jan 9 09:38:39 2007 -0600

    [DLM] change some log_error to log_debug
    
    Some common, non-error messages should use log_debug instead of log_error
    so they can be turned off.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit f8fb3e9d4c0d4e7709803eab2e1b23d76e42009b
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Tue Jan 9 12:00:31 2007 -0500

    [GFS2] Compile file for previous patch
    
    This is a quick fix to the previous patch to fix a typo which
    prevents it compiling.
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Wendy Cheng <wcheng@redhat.com>

commit b988f30c4fddfd56a87652366e4125853e0c6028
Author: Wendy Cheng <wcheng@redhat.com>
Date:   Mon Jan 8 14:19:34 2007 -0500

    [GFS2] Fix gfs2_rename deadlock
    
    Second round of gfs2_rename lock re-ordering to allow Anaconda adding
    root partition on top of gfs2. Previous to this patch the recursive
    lock detector in glock.c can be triggered due to attempting to lock
    the rgrp twice. This fixes it by checking to see whether the rgrp
    is already locked.
    
    This fixes Red Hat bugzilla #221237
    
    Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 6eeee1d10101833334f3dab453e2366ebb765885
Author: Russell Cattelan <cattelan@redhat.com>
Date:   Mon Jan 8 17:47:51 2007 -0600

    [GFS2] BZ 217008 fsfuzzer fix.
    
    Update the quilt header comments to match the
    code changes.
    
    Change gfs2_lookup_simple to return an error in the case
    of a NULL inode.
    The callers of gfs2_lookup_simple do not check for NULL
    in the no entry case and such would end up dereferencing a NULL ptr.
    
    This fixes:
    http://projects.info-pull.com/mokb/MOKB-15-11-2006.html
    
    Signed-off-by: Russell Cattelan <cattelan@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 0a2547441d9f21188ba6dd4bb1603255ef9c2b7d
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Mon Jan 8 14:31:40 2007 +0000

    [GFS2] Fix ordering of page disposal vs. glock_dq
    
    In case of unlinked files with dirty pages GFS2 wasn't clearing
    the pages in quite the right order. This patch clears the pages
    earlier (before the qlock_dq) to avoid the situation that the
    release of the glock results in attempting to write back data that
    has already been deallocated.
    
    This fixes Red Hat bugzilla: #220117
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 3caba8b6cae34dfaa818ee3ed10e934b99d3e22a
Author: Patrick Caulfield <pcaulfie@redhat.com>
Date:   Tue Jan 2 17:08:54 2007 +0000

    [DLM] Fix spin lock already unlocked bug
    
    I just noticed this message when testing some other changes I'd made to
    lowcomms (to use workqueues) but the problem seems to be in the current
    git trees too. I'm amazed no-one has seen it.
    
        BUG: spinlock already unlocked on CPU#1, dlm_recoverd/16868
    
    Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit ca22804caf6c4eb6bc67bf1347f5532d3682ef35
Author: Patrick Caulfield <pcaulfie@redhat.com>
Date:   Tue Jan 2 17:01:05 2007 +0000

    [DLM] Fix schedule() calls
    
    I was a little over-enthusiastic turning schedule() calls int cond_sched() when fixing the DLM for Andrew Morton.
    
    These four should really be calls to schedule() or the dlm can busy-wait.
    
    Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 009c000a7ff3f66b293c16fd821ec0bdeb4a1a13
Author: Wendy Cheng <wcheng@redhat.com>
Date:   Fri Dec 15 14:44:36 2006 -0500

    [GFS2] Fix change nlink deadlock
    
    Bugzilla 215088
    
    Fix deadlock in gfs2_change_nlink() while installing RHEL5 into GFS2
    partition. The gfs2_rename() apparently needs block allocation for the
    new name (into the directory) where it requires rg locks. At the same
    time, while updating the nlink count for the replaced file,
    gfs2_change_nlink() tries to return the inode meta-data back to resource
    group where it needs rg locks too. Our logic doesn't allow process to
    acquire these locks recursively by the same process  (RHEL installer)
    that results a BUG call. This only happens within rename code path and
    only if the destination file exists before the rename operation.
    
    Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 1c077ed91b977f132da6ac2e92a08cf2bf8d80f6
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Fri Dec 15 16:49:51 2006 -0500

    [GFS2] Fail over to readpage for stuffed files
    
    This is partially derrived from a patch written by Russell Cattelan.
    It fixes a bug where there is a race between readpages and truncate
    by ignoring readpages for stuffed files. This is ok because a stuffed
    file will never be more than one block (minus sizeof(struct gfs2_dinode))
    in size and block size is always less than page size, so we do not lose
    anything efficiency-wise by not doing readahead for stuffed files. They
    will have already been "read ahead" by the action of reading the inode
    in, in the first place.
    
    This is the remaining part of the fix for Red Hat bugzilla #218966
    which had not yet made it upstream.
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Russell Cattelan <cattelan@redhat.com>

commit 9e804c01e0e4f66e2c11893f0b3920c84f15ea2d
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Thu Dec 14 18:24:26 2006 +0000

    [GFS2] Fix DIO deadlock
    
    This patch fixes Red Hat bugzilla #212627 in which a deadlock occurs
    due to trying to take the i_mutex while holding a glock. The correct
    locking order is defined as i_mutex -> glock in all cases.
    
    I've left dealing with allocating writes. I know that we need to do
    that, but for now this should do the trick. We don't need to take the
    i_mutex on write, because the VFS has already taken it for us. On read
    we don't need it since the glock is enough protection. The reason that
    I've made some of the checks into a separate function is that we'll need
    to do the checks again in the allocating write case eventually, so this
    is partly in preparation for this. Likewise the return value test of !=
    1 might look a bit odd and thats because we'll need a third return value
    in case of requiring an allocation.
    
    I've made the change to deferred mode on the glock to ensure flushing
    read caches on other nodes. I notice that (using blktrace to look at
    whats going on) we appear to do a better job of large I/Os than ext3
    after this patch (in terms of not splitting up the I/Os).
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Wendy Cheng <wcheng@redhat.com>

commit 4ce2a88c37f8530438b4b8eb5c8065228612020f
Author: Adrian Bunk <bunk@stusta.de>
Date:   Tue Dec 19 13:04:03 2006 -0800

    [DLM] fs/dlm/lowcomms-tcp.c: remove 2 functions
    
    Remove the following unused functions:
    
    - lowcomms_send_message()
    - lowcomms_max_buffer_size()
    
    Signed-off-by: Adrian Bunk <bunk@stusta.de>
    Signed-off-by: Andrew Morton <akpm@osdl.org>
    Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 0bb7d9af1ff55e08f55148855075043d32e449bd
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:40:26 2006 -0600

    [DLM] fix lost flags in stub replies
    
    When the dlm fakes an unlock/cancel reply from a failed node using a stub
    message struct, it wasn't setting the flags in the stub message.  So, in
    the process of receiving the fake message the lkb flags would be updated
    and cleared from the zero flags in the message.  The problem observed in
    tests was the loss of the USER flag which caused the dlm to think a user
    lock was a kernel lock and subsequently fail an assertion checking the
    validity of the ast/callback field.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit a8f511fb7d90f3363ed68cde217c206c6b85bf56
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:39:20 2006 -0600

    [DLM] fix receive_request() lvb copying
    
    LVB's are not sent as part of new requests, but the code receiving the
    request was copying data into the lvb anyway.  The space in the message
    where it mistakenly thought the lvb lived actually contained the resource
    name, so it wound up incorrectly copying this name data into the lvb.  Fix
    is to just create the lvb, not copy junk into it.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit aa2d61254a4d0c3c73e75521836f17c93fc30822
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:38:45 2006 -0600

    [DLM] fix send_args() lvb copying
    
    The send_args() function is used to copy parameters into a message for a
    number different message types.  Only some of those types are set up
    beforehand (in create_message) to include space for sending lvb data.
    send_args was wrongly copying the lvb for all message types as long as the
    lock had an lvb.  This means that the lvb data was being written past the
    end of the message into unknown space.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 2cfbd7072eaa0c430382ce39d4185f30d036cc52
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:37:55 2006 -0600

    [DLM] add version check
    
    Check if we receive a message from another lockspace member running a
    version of the dlm with an incompatible inter-node message protocol.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit eb15e3ab09e7dbdae88b618ff685e284c010d392
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:37:16 2006 -0600

    [DLM] fix old rcom messages
    
    A reply to a recovery message will often be received after the relevant
    recovery sequence has aborted and the next recovery sequence has begun.
    We need to ignore replies to these old messages from the previous
    recovery.  There's already a way to do this for synchronous recovery
    requests using the rc_id number, but not for async.
    
    Each recovery sequence already has a locally unique sequence number
    associated with it.  This patch adds a field to the rcom (recovery
    message) structure where this recovery sequence number can be placed,
    rc_seq.  When a node sends a reply to a recovery request, it copies the
    rc_seq number it received into rc_seq_reply.  When the first node receives
    the reply to its recovery message, it will check whether rc_seq_reply
    matches the current recovery sequence number, ls_recover_seq, and if not
    then it ignores the old reply.
    
    An old, inadequate approach to filtering out old replies (checking if the
    current stage of recovery has moved back to the start) has been removed
    from two spots.
    
    The protocol version number is changed to reflect the different rcom
    structures.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit e5be984a8c58b56788023f4995cd497d96b8f812
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:36:37 2006 -0600

    [DLM] fix resend rcom lock
    
    There's a chance the new master of resource hasn't learned it's the new
    master before another node sends it a lock during recovery.  The node
    sending the lock needs to resend if this happens.
    
    - A sends a master lookup for resource R to C
    - B sends a master lookup for resource R to C
    - C receives A's lookup, assigns A to be master of R and
      sends a reply back to A
    - C receives B's lookup and sends a reply back to B saying
      that A is the master
    - B receives lookup reply from C and sends its lock for R to A
    - A receives lock from B, doesn't think it's the master of R
      and sends an error back to B
    - A receives lookup reply from C and becomes master of R
    - B gets error back from A and resends its lock back to A
      (this resending is what this patch does)
    - A receives lock from B, it now sees it's the master of R
      and takes the lock
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 4b5ae83680d0f280f2cd4541af618e15c3ff0323
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 6 11:46:33 2006 -0600

    [GFS2] don't try to lockfs after shutdown
    
    If an fs has already been shut down, a lockfs callback should do nothing.
    An fs that's been shut down can't acquire locks or do anything with
    respect to the cluster.
    
    Also, remove FIXME comment in withdraw function.  The missing bits of the
    withdraw procedure are now all done by user space.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
 fs/dlm/config.c        |  154 ++++++++++++++++++++++++++++++++++++++++++++----
 fs/dlm/config.h        |   17 +++--
 fs/dlm/dlm_internal.h  |   19 +++---
 fs/dlm/lock.c          |   43 +++++++++++--
 fs/dlm/lockspace.c     |   10 ++-
 fs/dlm/lowcomms-sctp.c |   10 ++-
 fs/dlm/lowcomms-tcp.c  |   39 +++---------
 fs/dlm/midcomms.c      |    4 +
 fs/dlm/rcom.c          |   85 +++++++++++++++++---------
 fs/dlm/recover.c       |    4 +
 fs/dlm/recoverd.c      |   22 +++----
 fs/dlm/util.c          |    4 +
 fs/gfs2/inode.c        |   56 ++++++++++++++---
 fs/gfs2/inode.h        |    1 
 fs/gfs2/lm.c           |    8 --
 fs/gfs2/ops_address.c  |  102 +++++++++++++++-----------------
 fs/gfs2/ops_inode.c    |    4 +
 fs/gfs2/ops_super.c    |    9 +++
 18 files changed, 395 insertions(+), 196 deletions(-)

diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index 8855305..8665c88 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -54,6 +54,11 @@ static struct config_item *make_node(str
 static void drop_node(struct config_group *, struct config_item *);
 static void release_node(struct config_item *);
 
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+			    char *buf);
+static ssize_t store_cluster(struct config_item *i,
+			     struct configfs_attribute *a,
+			     const char *buf, size_t len);
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 			 char *buf);
 static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a,
@@ -73,6 +78,101 @@ static ssize_t node_nodeid_write(struct 
 static ssize_t node_weight_read(struct node *nd, char *buf);
 static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len);
 
+struct cluster {
+	struct config_group group;
+	unsigned int cl_tcp_port;
+	unsigned int cl_buffer_size;
+	unsigned int cl_rsbtbl_size;
+	unsigned int cl_lkbtbl_size;
+	unsigned int cl_dirtbl_size;
+	unsigned int cl_recover_timer;
+	unsigned int cl_toss_secs;
+	unsigned int cl_scan_secs;
+	unsigned int cl_log_debug;
+};
+
+enum {
+	CLUSTER_ATTR_TCP_PORT = 0,
+	CLUSTER_ATTR_BUFFER_SIZE,
+	CLUSTER_ATTR_RSBTBL_SIZE,
+	CLUSTER_ATTR_LKBTBL_SIZE,
+	CLUSTER_ATTR_DIRTBL_SIZE,
+	CLUSTER_ATTR_RECOVER_TIMER,
+	CLUSTER_ATTR_TOSS_SECS,
+	CLUSTER_ATTR_SCAN_SECS,
+	CLUSTER_ATTR_LOG_DEBUG,
+};
+
+struct cluster_attribute {
+	struct configfs_attribute attr;
+	ssize_t (*show)(struct cluster *, char *);
+	ssize_t (*store)(struct cluster *, const char *, size_t);
+};
+
+static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
+			   unsigned int *info_field, int check_zero,
+			   const char *buf, size_t len)
+{
+	unsigned int x;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	x = simple_strtoul(buf, NULL, 0);
+
+	if (check_zero && !x)
+		return -EINVAL;
+
+	*cl_field = x;
+	*info_field = x;
+
+	return len;
+}
+
+#define __CONFIGFS_ATTR(_name,_mode,_read,_write) {                           \
+	.attr   = { .ca_name = __stringify(_name),                            \
+		    .ca_mode = _mode,                                         \
+		    .ca_owner = THIS_MODULE },                                \
+	.show   = _read,                                                      \
+	.store  = _write,                                                     \
+}
+
+#define CLUSTER_ATTR(name, check_zero)                                        \
+static ssize_t name##_write(struct cluster *cl, const char *buf, size_t len)  \
+{                                                                             \
+	return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name,         \
+			   check_zero, buf, len);                             \
+}                                                                             \
+static ssize_t name##_read(struct cluster *cl, char *buf)                     \
+{                                                                             \
+	return snprintf(buf, PAGE_SIZE, "%u\n", cl->cl_##name);               \
+}                                                                             \
+static struct cluster_attribute cluster_attr_##name =                         \
+__CONFIGFS_ATTR(name, 0644, name##_read, name##_write)
+
+CLUSTER_ATTR(tcp_port, 1);
+CLUSTER_ATTR(buffer_size, 1);
+CLUSTER_ATTR(rsbtbl_size, 1);
+CLUSTER_ATTR(lkbtbl_size, 1);
+CLUSTER_ATTR(dirtbl_size, 1);
+CLUSTER_ATTR(recover_timer, 1);
+CLUSTER_ATTR(toss_secs, 1);
+CLUSTER_ATTR(scan_secs, 1);
+CLUSTER_ATTR(log_debug, 0);
+
+static struct configfs_attribute *cluster_attrs[] = {
+	[CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port.attr,
+	[CLUSTER_ATTR_BUFFER_SIZE] = &cluster_attr_buffer_size.attr,
+	[CLUSTER_ATTR_RSBTBL_SIZE] = &cluster_attr_rsbtbl_size.attr,
+	[CLUSTER_ATTR_LKBTBL_SIZE] = &cluster_attr_lkbtbl_size.attr,
+	[CLUSTER_ATTR_DIRTBL_SIZE] = &cluster_attr_dirtbl_size.attr,
+	[CLUSTER_ATTR_RECOVER_TIMER] = &cluster_attr_recover_timer.attr,
+	[CLUSTER_ATTR_TOSS_SECS] = &cluster_attr_toss_secs.attr,
+	[CLUSTER_ATTR_SCAN_SECS] = &cluster_attr_scan_secs.attr,
+	[CLUSTER_ATTR_LOG_DEBUG] = &cluster_attr_log_debug.attr,
+	NULL,
+};
+
 enum {
 	COMM_ATTR_NODEID = 0,
 	COMM_ATTR_LOCAL,
@@ -152,10 +252,6 @@ struct clusters {
 	struct configfs_subsystem subsys;
 };
 
-struct cluster {
-	struct config_group group;
-};
-
 struct spaces {
 	struct config_group ss_group;
 };
@@ -197,6 +293,8 @@ static struct configfs_group_operations 
 
 static struct configfs_item_operations cluster_ops = {
 	.release = release_cluster,
+	.show_attribute = show_cluster,
+	.store_attribute = store_cluster,
 };
 
 static struct configfs_group_operations spaces_ops = {
@@ -237,6 +335,7 @@ static struct config_item_type clusters_
 
 static struct config_item_type cluster_type = {
 	.ct_item_ops = &cluster_ops,
+	.ct_attrs = cluster_attrs,
 	.ct_owner = THIS_MODULE,
 };
 
@@ -317,6 +416,16 @@ static struct config_group *make_cluster
 	cl->group.default_groups[1] = &cms->cs_group;
 	cl->group.default_groups[2] = NULL;
 
+	cl->cl_tcp_port = dlm_config.ci_tcp_port;
+	cl->cl_buffer_size = dlm_config.ci_buffer_size;
+	cl->cl_rsbtbl_size = dlm_config.ci_rsbtbl_size;
+	cl->cl_lkbtbl_size = dlm_config.ci_lkbtbl_size;
+	cl->cl_dirtbl_size = dlm_config.ci_dirtbl_size;
+	cl->cl_recover_timer = dlm_config.ci_recover_timer;
+	cl->cl_toss_secs = dlm_config.ci_toss_secs;
+	cl->cl_scan_secs = dlm_config.ci_scan_secs;
+	cl->cl_log_debug = dlm_config.ci_log_debug;
+
 	space_list = &sps->ss_group;
 	comm_list = &cms->cs_group;
 	return &cl->group;
@@ -509,6 +618,25 @@ void dlm_config_exit(void)
  * Functions for user space to read/write attributes
  */
 
+static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a,
+			    char *buf)
+{
+	struct cluster *cl = to_cluster(i);
+	struct cluster_attribute *cla =
+			container_of(a, struct cluster_attribute, attr);
+	return cla->show ? cla->show(cl, buf) : 0;
+}
+
+static ssize_t store_cluster(struct config_item *i,
+			     struct configfs_attribute *a,
+			     const char *buf, size_t len)
+{
+	struct cluster *cl = to_cluster(i);
+	struct cluster_attribute *cla =
+		container_of(a, struct cluster_attribute, attr);
+	return cla->store ? cla->store(cl, buf, len) : -EINVAL;
+}
+
 static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a,
 			 char *buf)
 {
@@ -775,15 +903,17 @@ #define DEFAULT_DIRTBL_SIZE      512
 #define DEFAULT_RECOVER_TIMER      5
 #define DEFAULT_TOSS_SECS         10
 #define DEFAULT_SCAN_SECS          5
+#define DEFAULT_LOG_DEBUG          0
 
 struct dlm_config_info dlm_config = {
-	.tcp_port = DEFAULT_TCP_PORT,
-	.buffer_size = DEFAULT_BUFFER_SIZE,
-	.rsbtbl_size = DEFAULT_RSBTBL_SIZE,
-	.lkbtbl_size = DEFAULT_LKBTBL_SIZE,
-	.dirtbl_size = DEFAULT_DIRTBL_SIZE,
-	.recover_timer = DEFAULT_RECOVER_TIMER,
-	.toss_secs = DEFAULT_TOSS_SECS,
-	.scan_secs = DEFAULT_SCAN_SECS
+	.ci_tcp_port = DEFAULT_TCP_PORT,
+	.ci_buffer_size = DEFAULT_BUFFER_SIZE,
+	.ci_rsbtbl_size = DEFAULT_RSBTBL_SIZE,
+	.ci_lkbtbl_size = DEFAULT_LKBTBL_SIZE,
+	.ci_dirtbl_size = DEFAULT_DIRTBL_SIZE,
+	.ci_recover_timer = DEFAULT_RECOVER_TIMER,
+	.ci_toss_secs = DEFAULT_TOSS_SECS,
+	.ci_scan_secs = DEFAULT_SCAN_SECS,
+	.ci_log_debug = DEFAULT_LOG_DEBUG
 };
 
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index 9da7839..1e97861 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -17,14 +17,15 @@ #define __CONFIG_DOT_H__
 #define DLM_MAX_ADDR_COUNT 3
 
 struct dlm_config_info {
-	int tcp_port;
-	int buffer_size;
-	int rsbtbl_size;
-	int lkbtbl_size;
-	int dirtbl_size;
-	int recover_timer;
-	int toss_secs;
-	int scan_secs;
+	int ci_tcp_port;
+	int ci_buffer_size;
+	int ci_rsbtbl_size;
+	int ci_lkbtbl_size;
+	int ci_dirtbl_size;
+	int ci_recover_timer;
+	int ci_toss_secs;
+	int ci_scan_secs;
+	int ci_log_debug;
 };
 
 extern struct dlm_config_info dlm_config;
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 1ee8195..ee993c5 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -41,6 +41,7 @@ #include <asm/semaphore.h>
 #include <asm/uaccess.h>
 
 #include <linux/dlm.h>
+#include "config.h"
 
 #define DLM_LOCKSPACE_LEN	64
 
@@ -69,12 +70,12 @@ #define log_print(fmt, args...) \
 #define log_error(ls, fmt, args...) \
 	printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args)
 
-#define DLM_LOG_DEBUG
-#ifdef DLM_LOG_DEBUG
-#define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args)
-#else
-#define log_debug(ls, fmt, args...)
-#endif
+#define log_debug(ls, fmt, args...) \
+do { \
+	if (dlm_config.ci_log_debug) \
+		printk(KERN_DEBUG "dlm: %s: " fmt "\n", \
+		       (ls)->ls_name , ##args); \
+} while (0)
 
 #define DLM_ASSERT(x, do) \
 { \
@@ -309,8 +310,8 @@ static inline int rsb_flag(struct dlm_rs
 
 /* dlm_header is first element of all structs sent between nodes */
 
-#define DLM_HEADER_MAJOR	0x00020000
-#define DLM_HEADER_MINOR	0x00000001
+#define DLM_HEADER_MAJOR	0x00030000
+#define DLM_HEADER_MINOR	0x00000000
 
 #define DLM_MSG			1
 #define DLM_RCOM		2
@@ -386,6 +387,8 @@ struct dlm_rcom {
 	uint32_t		rc_type;	/* DLM_RCOM_ */
 	int			rc_result;	/* multi-purpose */
 	uint64_t		rc_id;		/* match reply with request */
+	uint64_t		rc_seq;		/* sender's ls_recover_seq */
+	uint64_t		rc_seq_reply;	/* remote ls_recover_seq */
 	char			rc_buf[0];
 };
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 30878de..5bac982 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -810,7 +810,7 @@ static int shrink_bucket(struct dlm_ls *
 		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
 					    res_hashchain) {
 			if (!time_after_eq(jiffies, r->res_toss_time +
-					   dlm_config.toss_secs * HZ))
+					   dlm_config.ci_toss_secs * HZ))
 				continue;
 			found = 1;
 			break;
@@ -2144,12 +2144,24 @@ static void send_args(struct dlm_rsb *r,
 	if (lkb->lkb_astaddr)
 		ms->m_asts |= AST_COMP;
 
-	if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
-		memcpy(ms->m_extra, r->res_name, r->res_length);
+	/* compare with switch in create_message; send_remove() doesn't
+	   use send_args() */
 
-	else if (lkb->lkb_lvbptr)
+	switch (ms->m_type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+		memcpy(ms->m_extra, r->res_name, r->res_length);
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (!lkb->lkb_lvbptr)
+			break;
 		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
-
+		break;
+	}
 }
 
 static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
@@ -2418,8 +2430,12 @@ static int receive_request_args(struct d
 
 	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
 
-	if (receive_lvb(ls, lkb, ms))
-		return -ENOMEM;
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		/* lkb was just created so there won't be an lvb yet */
+		lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -3132,6 +3148,7 @@ static void recover_convert_waiter(struc
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
 		ls->ls_stub_ms.m_result = -EINPROGRESS;
+		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 		_remove_from_waiters(lkb);
 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
 
@@ -3205,6 +3222,7 @@ void dlm_recover_waiters_pre(struct dlm_
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_remove_from_waiters(lkb);
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
@@ -3213,6 +3231,7 @@ void dlm_recover_waiters_pre(struct dlm_
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_remove_from_waiters(lkb);
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
@@ -3571,6 +3590,14 @@ int dlm_recover_process_copy(struct dlm_
 	lock_rsb(r);
 
 	switch (error) {
+	case -EBADR:
+		/* There's a chance the new master received our lock before
+		   dlm_recover_master_reply(), this wouldn't happen if we did
+		   a barrier between recover_masters and recover_locks. */
+		log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
+			  (unsigned long)r, r->res_name);
+		dlm_send_rcom_lock(r, lkb);
+		goto out;
 	case -EEXIST:
 		log_debug(ls, "master copy exists %x", lkb->lkb_id);
 		/* fall through */
@@ -3585,7 +3612,7 @@ int dlm_recover_process_copy(struct dlm_
 	/* an ack for dlm_recover_locks() which waits for replies from
 	   all the locks it sends to new masters */
 	dlm_recovered_lock(r);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 59012b0..f40817b 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -236,7 +236,7 @@ static int dlm_scand(void *data)
 	while (!kthread_should_stop()) {
 		list_for_each_entry(ls, &lslist, ls_list)
 			dlm_scan_rsbs(ls);
-		schedule_timeout_interruptible(dlm_config.scan_secs * HZ);
+		schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ);
 	}
 	return 0;
 }
@@ -422,7 +422,7 @@ static int new_lockspace(char *name, int
 	ls->ls_count = 0;
 	ls->ls_flags = 0;
 
-	size = dlm_config.rsbtbl_size;
+	size = dlm_config.ci_rsbtbl_size;
 	ls->ls_rsbtbl_size = size;
 
 	ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL);
@@ -434,7 +434,7 @@ static int new_lockspace(char *name, int
 		rwlock_init(&ls->ls_rsbtbl[i].lock);
 	}
 
-	size = dlm_config.lkbtbl_size;
+	size = dlm_config.ci_lkbtbl_size;
 	ls->ls_lkbtbl_size = size;
 
 	ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL);
@@ -446,7 +446,7 @@ static int new_lockspace(char *name, int
 		ls->ls_lkbtbl[i].counter = 1;
 	}
 
-	size = dlm_config.dirtbl_size;
+	size = dlm_config.ci_dirtbl_size;
 	ls->ls_dirtbl_size = size;
 
 	ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL);
@@ -489,7 +489,7 @@ static int new_lockspace(char *name, int
 	mutex_init(&ls->ls_requestqueue_mutex);
 	mutex_init(&ls->ls_clear_proc_locks);
 
-	ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+	ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
 	if (!ls->ls_recover_buf)
 		goto out_dirfree;
 
diff --git a/fs/dlm/lowcomms-sctp.c b/fs/dlm/lowcomms-sctp.c
index fe158d7..5aeadad 100644
--- a/fs/dlm/lowcomms-sctp.c
+++ b/fs/dlm/lowcomms-sctp.c
@@ -635,7 +635,7 @@ static int add_bind_addr(struct sockaddr
 
 	if (result < 0)
 		log_print("Can't bind to port %d addr number %d",
-			  dlm_config.tcp_port, num);
+			  dlm_config.ci_tcp_port, num);
 
 	return result;
 }
@@ -711,7 +711,7 @@ static int init_sock(void)
 	/* Bind to all interfaces. */
 	for (i = 0; i < dlm_local_count; i++) {
 		memcpy(&localaddr, dlm_local_addr[i], sizeof(localaddr));
-		make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len);
+		make_sockaddr(&localaddr, dlm_config.ci_tcp_port, &addr_len);
 
 		result = add_bind_addr(&localaddr, addr_len, num);
 		if (result)
@@ -863,7 +863,7 @@ static void initiate_association(int nod
 		return;
 	}
 
-	make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen);
+	make_sockaddr(&rem_addr, dlm_config.ci_tcp_port, &addrlen);
 
 	outmessage.msg_name = &rem_addr;
 	outmessage.msg_namelen = addrlen;
@@ -1109,7 +1109,7 @@ static int dlm_recvd(void *data)
 		set_current_state(TASK_INTERRUPTIBLE);
 		add_wait_queue(&lowcomms_recv_wait, &wait);
 		if (!test_bit(CF_READ_PENDING, &sctp_con.flags))
-			cond_resched();
+			schedule();
 		remove_wait_queue(&lowcomms_recv_wait, &wait);
 		set_current_state(TASK_RUNNING);
 
@@ -1141,7 +1141,7 @@ static int dlm_sendd(void *data)
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (write_list_empty())
-			cond_resched();
+			schedule();
 		set_current_state(TASK_RUNNING);
 
 		if (sctp_con.eagain_flag) {
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
index 9be3a44..b4fb578 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms-tcp.c
@@ -548,7 +548,7 @@ static void connect_to_sock(struct conne
 	sock->sk->sk_user_data = con;
 	con->rx_action = receive_from_sock;
 
-	make_sockaddr(&saddr, dlm_config.tcp_port, &addr_len);
+	make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 
 	add_sock(sock, con);
 
@@ -616,10 +616,10 @@ static struct socket *create_listen_sock
 	con->sock = sock;
 
 	/* Bind to our port */
-	make_sockaddr(saddr, dlm_config.tcp_port, &addr_len);
+	make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
 	result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
 	if (result < 0) {
-		printk("dlm: Can't bind to port %d\n", dlm_config.tcp_port);
+		printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		con->sock = NULL;
@@ -638,7 +638,8 @@ static struct socket *create_listen_sock
 
 	result = sock->ops->listen(sock, 5);
 	if (result < 0) {
-		printk("dlm: Can't listen on port %d\n", dlm_config.tcp_port);
+		printk("dlm: Can't listen on port %d\n",
+		       dlm_config.ci_tcp_port);
 		sock_release(sock);
 		sock = NULL;
 		goto create_out;
@@ -709,6 +710,7 @@ void *dlm_lowcomms_get_buffer(int nodeid
 	if (!con)
 		return NULL;
 
+	spin_lock(&con->writequeue_lock);
 	e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
 	if ((&e->list == &con->writequeue) ||
 	    (PAGE_CACHE_SIZE - e->end < len)) {
@@ -747,6 +749,7 @@ void dlm_lowcomms_commit_buffer(void *mh
 	struct connection *con = e->con;
 	int users;
 
+	spin_lock(&con->writequeue_lock);
 	users = --e->users;
 	if (users)
 		goto out;
@@ -880,22 +883,6 @@ out:
 	return -1;
 }
 
-/* API send message call, may queue the request */
-/* N.B. This is the old interface - use the new one for new calls */
-int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation)
-{
-	struct writequeue_entry *e;
-	char *b;
-
-	e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b);
-	if (e) {
-		memcpy(b, buf, len);
-		dlm_lowcomms_commit_buffer(e);
-		return 0;
-	}
-	return -ENOBUFS;
-}
-
 /* Look for activity on active sockets */
 static void process_sockets(void)
 {
@@ -1012,7 +999,7 @@ static int dlm_recvd(void *data)
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (read_list_empty())
-			cond_resched();
+			schedule();
 		set_current_state(TASK_RUNNING);
 
 		process_sockets();
@@ -1046,7 +1033,7 @@ static int dlm_sendd(void *data)
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		if (write_and_state_lists_empty())
-			cond_resched();
+			schedule();
 		set_current_state(TASK_RUNNING);
 
 		process_state_queue();
@@ -1087,14 +1074,6 @@ static int daemons_start(void)
 	return 0;
 }
 
-/*
- * Return the largest buffer size we can cope with.
- */
-int lowcomms_max_buffer_size(void)
-{
-	return PAGE_CACHE_SIZE;
-}
-
 void dlm_lowcomms_stop(void)
 {
 	int i;
diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c
index c9b1c3d..a5126e0 100644
--- a/fs/dlm/midcomms.c
+++ b/fs/dlm/midcomms.c
@@ -82,7 +82,7 @@ int dlm_process_incoming_buffer(int node
 		if (msglen < sizeof(struct dlm_header))
 			break;
 		err = -E2BIG;
-		if (msglen > dlm_config.buffer_size) {
+		if (msglen > dlm_config.ci_buffer_size) {
 			log_print("message size %d from %d too big, buf len %d",
 				  msglen, nodeid, len);
 			break;
@@ -103,7 +103,7 @@ int dlm_process_incoming_buffer(int node
 
 		if (msglen > sizeof(__tmp) &&
 		    msg == (struct dlm_header *) __tmp) {
-			msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL);
+			msg = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL);
 			if (msg == NULL)
 				return ret;
 		}
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4cc31be..6bfbd61 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls
 
 	rc->rc_type = type;
 
+	spin_lock(&ls->ls_recover_lock);
+	rc->rc_seq = ls->ls_recover_seq;
+	spin_unlock(&ls->ls_recover_lock);
+
 	*mh_ret = mh;
 	*rc_ret = rc;
 	return 0;
@@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *l
 	rf->rf_lsflags = ls->ls_exflags;
 }
 
-static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 {
+	struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
+
+	if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+		log_error(ls, "version mismatch: %x nodeid %d: %x",
+			  DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
+			  rc->rc_header.h_version);
+		return -EINVAL;
+	}
+
 	if (rf->rf_lvblen != ls->ls_lvblen ||
 	    rf->rf_lsflags != ls->ls_exflags) {
 		log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -125,7 +138,7 @@ int dlm_rcom_status(struct dlm_ls *ls, i
 		goto out;
 
 	allow_sync_reply(ls, &rc->rc_id);
-	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
 
 	send_rcom(ls, mh, rc);
 
@@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, i
 		log_debug(ls, "remote node %d not ready", nodeid);
 		rc->rc_result = 0;
 	} else
-		error = check_config(ls, (struct rcom_config *) rc->rc_buf,
-				     nodeid);
+		error = check_config(ls, rc, nodeid);
 	/* the caller looks at rc_result for the remote recovery status */
  out:
 	return error;
@@ -159,6 +171,7 @@ static void receive_rcom_status(struct d
 	if (error)
 		return;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 	rc->rc_result = dlm_recover_status(ls);
 	make_config(ls, (struct rcom_config *) rc->rc_buf);
 
@@ -200,7 +213,7 @@ int dlm_rcom_names(struct dlm_ls *ls, in
 	if (nodeid == dlm_our_nodeid()) {
 		dlm_copy_master_names(ls, last_name, last_len,
 		                      ls->ls_recover_buf + len,
-		                      dlm_config.buffer_size - len, nodeid);
+		                      dlm_config.ci_buffer_size - len, nodeid);
 		goto out;
 	}
 
@@ -210,7 +223,7 @@ int dlm_rcom_names(struct dlm_ls *ls, in
 	memcpy(rc->rc_buf, last_name, last_len);
 
 	allow_sync_reply(ls, &rc->rc_id);
-	memset(ls->ls_recover_buf, 0, dlm_config.buffer_size);
+	memset(ls->ls_recover_buf, 0, dlm_config.ci_buffer_size);
 
 	send_rcom(ls, mh, rc);
 
@@ -224,30 +237,17 @@ static void receive_rcom_names(struct dl
 {
 	struct dlm_rcom *rc;
 	struct dlm_mhandle *mh;
-	int error, inlen, outlen;
-	int nodeid = rc_in->rc_header.h_nodeid;
-	uint32_t status = dlm_recover_status(ls);
-
-	/*
-	 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
-	 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
-	 * It could only happen in rare cases where we get a late NAMES
-	 * message from a previous instance of recovery.
-	 */
-
-	if (!(status & DLM_RS_NODES)) {
-		log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
-		return;
-	}
+	int error, inlen, outlen, nodeid;
 
 	nodeid = rc_in->rc_header.h_nodeid;
 	inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
-	outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);
+	outlen = dlm_config.ci_buffer_size - sizeof(struct dlm_rcom);
 
 	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
 	if (error)
 		return;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
 			      nodeid);
@@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct d
 		ret_nodeid = error;
 	rc->rc_result = ret_nodeid;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	send_rcom(ls, mh, rc);
 }
@@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm
 
 	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	send_rcom(ls, mh, rc);
 }
 
 static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
-	uint32_t status = dlm_recover_status(ls);
-
-	if (!(status & DLM_RS_DIR)) {
-		log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
-			  rc_in->rc_header.h_nodeid);
-		return;
-	}
-
 	dlm_recover_process_copy(ls, rc_in);
 }
 
@@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid,
 
 	rc->rc_type = DLM_RCOM_STATUS_REPLY;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 	rc->rc_result = -ESRCH;
 
 	rf = (struct rcom_config *) rc->rc_buf;
@@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid,
 	return 0;
 }
 
+static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	uint64_t seq;
+	int rv = 0;
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS_REPLY:
+	case DLM_RCOM_NAMES_REPLY:
+	case DLM_RCOM_LOOKUP_REPLY:
+	case DLM_RCOM_LOCK_REPLY:
+		spin_lock(&ls->ls_recover_lock);
+		seq = ls->ls_recover_seq;
+		spin_unlock(&ls->ls_recover_lock);
+		if (rc->rc_seq_reply != seq) {
+			log_debug(ls, "ignoring old reply %x from %d "
+				      "seq_reply %llx expect %llx",
+				      rc->rc_type, rc->rc_header.h_nodeid,
+				      (unsigned long long)rc->rc_seq_reply,
+				      (unsigned long long)seq);
+			rv = 1;
+		}
+	}
+	return rv;
+}
+
 /* Called by dlm_recvd; corresponds to dlm_receive_message() but special
    recovery-only comms are sent through here. */
 
@@ -449,11 +469,14 @@ void dlm_receive_rcom(struct dlm_header 
 	}
 
 	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
-		log_error(ls, "ignoring recovery message %x from %d",
+		log_debug(ls, "ignoring recovery message %x from %d",
 			  rc->rc_type, nodeid);
 		goto out;
 	}
 
+	if (is_old_reply(ls, rc))
+		goto out;
+
 	if (nodeid != rc->rc_header.h_nodeid) {
 		log_error(ls, "bad rcom nodeid %d from %d",
 			  rc->rc_header.h_nodeid, nodeid);
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index cf9f683..a7fa4cb 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -44,7 +44,7 @@ #include "recover.h"
 static void dlm_wait_timer_fn(unsigned long data)
 {
 	struct dlm_ls *ls = (struct dlm_ls *) data;
-	mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
+	mod_timer(&ls->ls_timer, jiffies + (dlm_config.ci_recover_timer * HZ));
 	wake_up(&ls->ls_wait_general);
 }
 
@@ -55,7 +55,7 @@ int dlm_wait_function(struct dlm_ls *ls,
 	init_timer(&ls->ls_timer);
 	ls->ls_timer.function = dlm_wait_timer_fn;
 	ls->ls_timer.data = (long) ls;
-	ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
+	ls->ls_timer.expires = jiffies + (dlm_config.ci_recover_timer * HZ);
 	add_timer(&ls->ls_timer);
 
 	wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 650536a..3cb636d 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -77,7 +77,7 @@ static int ls_recover(struct dlm_ls *ls,
 
 	error = dlm_recover_members(ls, rv, &neg);
 	if (error) {
-		log_error(ls, "recover_members failed %d", error);
+		log_debug(ls, "recover_members failed %d", error);
 		goto fail;
 	}
 	start = jiffies;
@@ -89,7 +89,7 @@ static int ls_recover(struct dlm_ls *ls,
 
 	error = dlm_recover_directory(ls);
 	if (error) {
-		log_error(ls, "recover_directory failed %d", error);
+		log_debug(ls, "recover_directory failed %d", error);
 		goto fail;
 	}
 
@@ -99,7 +99,7 @@ static int ls_recover(struct dlm_ls *ls,
 
 	error = dlm_recover_directory_wait(ls);
 	if (error) {
-		log_error(ls, "recover_directory_wait failed %d", error);
+		log_debug(ls, "recover_directory_wait failed %d", error);
 		goto fail;
 	}
 
@@ -129,7 +129,7 @@ static int ls_recover(struct dlm_ls *ls,
 
 		error = dlm_recover_masters(ls);
 		if (error) {
-			log_error(ls, "recover_masters failed %d", error);
+			log_debug(ls, "recover_masters failed %d", error);
 			goto fail;
 		}
 
@@ -139,13 +139,13 @@ static int ls_recover(struct dlm_ls *ls,
 
 		error = dlm_recover_locks(ls);
 		if (error) {
-			log_error(ls, "recover_locks failed %d", error);
+			log_debug(ls, "recover_locks failed %d", error);
 			goto fail;
 		}
 
 		error = dlm_recover_locks_wait(ls);
 		if (error) {
-			log_error(ls, "recover_locks_wait failed %d", error);
+			log_debug(ls, "recover_locks_wait failed %d", error);
 			goto fail;
 		}
 
@@ -166,7 +166,7 @@ static int ls_recover(struct dlm_ls *ls,
 
 		error = dlm_recover_locks_wait(ls);
 		if (error) {
-			log_error(ls, "recover_locks_wait failed %d", error);
+			log_debug(ls, "recover_locks_wait failed %d", error);
 			goto fail;
 		}
 	}
@@ -184,7 +184,7 @@ static int ls_recover(struct dlm_ls *ls,
 	dlm_set_recover_status(ls, DLM_RS_DONE);
 	error = dlm_recover_done_wait(ls);
 	if (error) {
-		log_error(ls, "recover_done_wait failed %d", error);
+		log_debug(ls, "recover_done_wait failed %d", error);
 		goto fail;
 	}
 
@@ -192,19 +192,19 @@ static int ls_recover(struct dlm_ls *ls,
 
 	error = enable_locking(ls, rv->seq);
 	if (error) {
-		log_error(ls, "enable_locking failed %d", error);
+		log_debug(ls, "enable_locking failed %d", error);
 		goto fail;
 	}
 
 	error = dlm_process_requestqueue(ls);
 	if (error) {
-		log_error(ls, "process_requestqueue failed %d", error);
+		log_debug(ls, "process_requestqueue failed %d", error);
 		goto fail;
 	}
 
 	error = dlm_recover_waiters_post(ls);
 	if (error) {
-		log_error(ls, "recover_waiters_post failed %d", error);
+		log_debug(ls, "recover_waiters_post failed %d", error);
 		goto fail;
 	}
 
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 767197d..963889c 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 	rc->rc_type		= cpu_to_le32(rc->rc_type);
 	rc->rc_result		= cpu_to_le32(rc->rc_result);
 	rc->rc_id		= cpu_to_le64(rc->rc_id);
+	rc->rc_seq		= cpu_to_le64(rc->rc_seq);
+	rc->rc_seq_reply	= cpu_to_le64(rc->rc_seq_reply);
 
 	if (type == DLM_RCOM_LOCK)
 		rcom_lock_out((struct rcom_lock *) rc->rc_buf);
@@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc)
 	rc->rc_type		= le32_to_cpu(rc->rc_type);
 	rc->rc_result		= le32_to_cpu(rc->rc_result);
 	rc->rc_id		= le64_to_cpu(rc->rc_id);
+	rc->rc_seq		= le64_to_cpu(rc->rc_seq);
+	rc->rc_seq_reply	= le64_to_cpu(rc->rc_seq_reply);
 
 	if (rc->rc_type == DLM_RCOM_LOCK)
 		rcom_lock_in((struct rcom_lock *) rc->rc_buf);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d122074..58c2ce7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -287,10 +287,8 @@ out:
  *
  * Returns: errno
  */
-
 int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
 {
-	struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
 	struct buffer_head *dibh;
 	u32 nlink;
 	int error;
@@ -322,26 +320,52 @@ int gfs2_change_nlink(struct gfs2_inode 
 	brelse(dibh);
 	mark_inode_dirty(&ip->i_inode);
 
-	if (ip->i_inode.i_nlink == 0) {
-		struct gfs2_rgrpd *rgd;
-		struct gfs2_holder ri_gh, rg_gh;
+	if (ip->i_inode.i_nlink == 0)
+		error = gfs2_change_nlink_i(ip);
+
+	return error;
+}
+
+int gfs2_change_nlink_i(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
+	struct gfs2_inode *rindex = GFS2_I(sdp->sd_rindex);
+	struct gfs2_glock *ri_gl = rindex->i_gl;
+	struct gfs2_rgrpd *rgd;
+	struct gfs2_holder ri_gh, rg_gh;
+	int existing, error;
 
+	/* if we come from rename path, we could have the lock already */
+	existing = gfs2_glock_is_locked_by_me(ri_gl);
+	if (!existing) {
 		error = gfs2_rindex_hold(sdp, &ri_gh);
 		if (error)
 			goto out;
-		error = -EIO;
-		rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
-		if (!rgd)
-			goto out_norgrp;
+	}
+
+	/* find the matching rgd */
+	error = -EIO;
+	rgd = gfs2_blk2rgrpd(sdp, ip->i_num.no_addr);
+	if (!rgd)
+		goto out_norgrp;
+
+	/*
+	 * Eventually we may want to move rgd(s) to a linked list
+	 * and piggyback the free logic into one of gfs2 daemons
+	 * to gain some performance.
+	 */
+	if (!rgd->rd_gl || !gfs2_glock_is_locked_by_me(rgd->rd_gl)) {
 		error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh);
 		if (error)
 			goto out_norgrp;
 
 		gfs2_unlink_di(&ip->i_inode); /* mark inode unlinked */
 		gfs2_glock_dq_uninit(&rg_gh);
+	}
+
 out_norgrp:
+	if (!existing)
 		gfs2_glock_dq_uninit(&ri_gh);
-	}
 out:
 	return error;
 }
@@ -349,8 +373,18 @@ out:
 struct inode *gfs2_lookup_simple(struct inode *dip, const char *name)
 {
 	struct qstr qstr;
+	struct inode *inode;
 	gfs2_str2qstr(&qstr, name);
-	return gfs2_lookupi(dip, &qstr, 1, NULL);
+	inode = gfs2_lookupi(dip, &qstr, 1, NULL);
+	/* gfs2_lookupi has inconsistent callers: vfs
+	 * related routines expect NULL for no entry found,
+	 * gfs2_lookup_simple callers expect ENOENT
+	 * and do not check for NULL.
+	 */
+	if (inode == NULL)
+		return ERR_PTR(-ENOENT);
+	else
+		return inode;
 }
 
 
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index b57f448..cee281b 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -40,6 +40,7 @@ int gfs2_inode_refresh(struct gfs2_inode
 
 int gfs2_dinode_dealloc(struct gfs2_inode *inode);
 int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+int gfs2_change_nlink_i(struct gfs2_inode *ip);
 struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 			   int is_root, struct nameidata *nd);
 struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
index effe4a3..e30673d 100644
--- a/fs/gfs2/lm.c
+++ b/fs/gfs2/lm.c
@@ -104,15 +104,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sd
 	vprintk(fmt, args);
 	va_end(args);
 
-	fs_err(sdp, "about to withdraw from the cluster\n");
+	fs_err(sdp, "about to withdraw this file system\n");
 	BUG_ON(sdp->sd_args.ar_debug);
 
-
-	fs_err(sdp, "waiting for outstanding I/O\n");
-
-	/* FIXME: suspend dm device so oustanding bio's complete
-	   and all further io requests fail */
-
 	fs_err(sdp, "telling LM to withdraw\n");
 	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
 	fs_err(sdp, "withdrawn\n");
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index d8d69a7..37bfeb9 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -256,7 +256,7 @@ out_unlock:
  *    the page lock and the glock) and return having done no I/O. Its
  *    obviously not something we'd want to do on too regular a basis.
  *    Any I/O we ignore at this time will be done via readpage later.
- * 2. We have to handle stuffed files here too.
+ * 2. We don't handle stuffed files here we let readpage do the honours.
  * 3. mpage_readpages() does most of the heavy lifting in the common case.
  * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
  * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
@@ -269,8 +269,7 @@ static int gfs2_readpages(struct file *f
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder gh;
-	unsigned page_idx;
-	int ret;
+	int ret = 0;
 	int do_unlock = 0;
 
 	if (likely(file != &gfs2_internal_file_sentinel)) {
@@ -289,29 +288,8 @@ static int gfs2_readpages(struct file *f
 			goto out_unlock;
 	}
 skip_lock:
-	if (gfs2_is_stuffed(ip)) {
-		struct pagevec lru_pvec;
-		pagevec_init(&lru_pvec, 0);
-		for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-			struct page *page = list_entry(pages->prev, struct page, lru);
-			prefetchw(&page->flags);
-			list_del(&page->lru);
-			if (!add_to_page_cache(page, mapping,
-					       page->index, GFP_KERNEL)) {
-				ret = stuffed_readpage(ip, page);
-				unlock_page(page);
-				if (!pagevec_add(&lru_pvec, page))
-					 __pagevec_lru_add(&lru_pvec);
-			} else {
-				page_cache_release(page);
-			}
-		}
-		pagevec_lru_add(&lru_pvec);
-		ret = 0;
-	} else {
-		/* What we really want to do .... */
+	if (!gfs2_is_stuffed(ip))
 		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-	}
 
 	if (do_unlock) {
 		gfs2_glock_dq_m(1, &gh);
@@ -594,6 +572,36 @@ static void gfs2_invalidatepage(struct p
 	return;
 }
 
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+	/*
+	 * Should we return an error here? I can't see that O_DIRECT for
+	 * a journaled file makes any sense. For now we'll silently fall
+	 * back to buffered I/O, likewise we do the same for stuffed
+	 * files since they are (a) small and (b) unaligned.
+	 */
+	if (gfs2_is_jdata(ip))
+		return 0;
+
+	if (gfs2_is_stuffed(ip))
+		return 0;
+
+	if (offset > i_size_read(&ip->i_inode))
+		return 0;
+	return 1;
+}
+
+
+
 static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
@@ -604,42 +612,28 @@ static ssize_t gfs2_direct_IO(int rw, st
 	struct gfs2_holder gh;
 	int rv;
 
-	if (rw == READ)
-		mutex_lock(&inode->i_mutex);
 	/*
-	 * Shared lock, even if its a write, since we do no allocation
-	 * on this path. All we need change is atime.
+	 * Deferred lock, even if its a write, since we do no allocation
+	 * on this path. All we need change is atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately have the option of only flushing a range like
+	 * the VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
 	rv = gfs2_glock_nq_atime(&gh);
 	if (rv)
-		goto out;
-
-	if (offset > i_size_read(inode))
-		goto out;
-
-	/*
-	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a journaled file makes any sense. For now we'll silently fall
-	 * back to buffered I/O, likewise we do the same for stuffed
-	 * files since they are (a) small and (b) unaligned.
-	 */
-	if (gfs2_is_jdata(ip))
-		goto out;
-
-	if (gfs2_is_stuffed(ip))
-		goto out;
-
-	rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev,
-					    iov, offset, nr_segs,
-					    gfs2_get_block_direct, NULL);
+		return rv;
+	rv = gfs2_ok_for_dio(ip, rw, offset);
+	if (rv != 1)
+		goto out; /* dio not valid, fall back to buffered i/o */
+
+	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
+					   iov, offset, nr_segs,
+					   gfs2_get_block_direct, NULL);
 out:
 	gfs2_glock_dq_m(1, &gh);
 	gfs2_holder_uninit(&gh);
-	if (rw == READ)
-		mutex_unlock(&inode->i_mutex);
-
 	return rv;
 }
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 636dda4..b2a12f4 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -684,12 +684,12 @@ static int gfs2_rename(struct inode *odi
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
 					 al->al_rgd->rd_ri.ri_length +
 					 4 * RES_DINODE + 4 * RES_LEAF +
-					 RES_STATFS + RES_QUOTA, 0);
+					 RES_STATFS + RES_QUOTA + 4, 0);
 		if (error)
 			goto out_ipreserv;
 	} else {
 		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
-					 5 * RES_LEAF, 0);
+					 5 * RES_LEAF + 4, 0);
 		if (error)
 			goto out_gunlock;
 	}
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 7685b46..c22738c 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(stru
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
+	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return;
+
 	for (;;) {
 		error = gfs2_freeze_fs(sdp);
 		if (!error)
@@ -426,6 +429,12 @@ static void gfs2_delete_inode(struct ino
 	}
 
 	error = gfs2_dinode_dealloc(ip);
+	/*
+	 * Must do this before unlock to avoid trying to write back
+	 * potentially dirty data now that inode no longer exists
+	 * on disk.
+	 */
+	truncate_inode_pages(&inode->i_data, 0);
 
 out_unlock:
 	gfs2_glock_dq(&ip->i_iopen_gh);