GIT 40ad0d6b8aa0e4b2851c481f317791f07258e31d git+ssh://master.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-2.6-nmw.git

commit 
Author: Adrian Bunk <bunk@stusta.de>
Date:   Tue Dec 19 13:04:03 2006 -0800

    [DLM] fs/dlm/lowcomms-tcp.c: remove 2 functions
    
    Remove the following unused functions:
    
    - lowcomms_send_message()
    - lowcomms_max_buffer_size()
    
    Signed-off-by: Adrian Bunk <bunk@stusta.de>
    Signed-off-by: Andrew Morton <akpm@osdl.org>
    Signed-off-by: Patrick Caulfield <pcaulfie@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 536966e9085f6f653110b232cd481c02d821a842
Author: Wendy Cheng <wcheng@redhat.com>
Date:   Fri Dec 15 14:44:36 2006 -0500

    [GFS2] Fix change nlink deadlock
    
    Bugzilla 215088
    
    Fix deadlock in gfs2_change_nlink() while installing RHEL5 into GFS2
    partition. The gfs2_rename() apparently needs block allocation for the
    new name (into the directory) where it requires rg locks. At the same
    time, while updating the nlink count for the replaced file,
    gfs2_change_nlink() tries to return the inode meta-data back to resource
    group where it needs rg locks too. Our logic doesn't allow process to
    acquire these locks recursively by the same process  (RHEL installer)
    that results a BUG call. This only happens within rename code path and
    only if the destination file exists before the rename operation.
    
    Signed-off-by: S. Wendy Cheng <wcheng@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 067bd6ac53a63ccd612cf2e756ec55dbd75d3f7b
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Fri Dec 15 16:49:51 2006 -0500

    [GFS2] Fail over to readpage for stuffed files
    
    This is partially derrived from a patch written by Russell Cattelan.
    It fixes a bug where there is a race between readpages and truncate
    by ignoring readpages for stuffed files. This is ok because a stuffed
    file will never be more than one block (minus sizeof(struct gfs2_dinode))
    in size and block size is always less than page size, so we do not lose
    anything efficiency-wise by not doing readahead for stuffed files. They
    will have already been "read ahead" by the action of reading the inode
    in, in the first place.
    
    This is the remaining part of the fix for Red Hat bugzilla #218966
    which had not yet made it upstream.
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Russell Cattelan <cattelan@redhat.com>

commit 3898bf7700cda0c11a5dc461c4499ba589cc55e6
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Thu Dec 14 18:24:26 2006 +0000

    [GFS2] Fix DIO deadlock
    
    This patch fixes Red Hat bugzilla #212627 in which a deadlock occurs
    due to trying to take the i_mutex while holding a glock. The correct
    locking order is defined as i_mutex -> glock in all cases.
    
    I've left dealing with allocating writes. I know that we need to do
    that, but for now this should do the trick. We don't need to take the
    i_mutex on write, because the VFS has already taken it for us. On read
    we don't need it since the glock is enough protection. The reason that
    I've made some of the checks into a separate function is that we'll need
    to do the checks again in the allocating write case eventually, so this
    is partly in preparation for this. Likewise the return value test of !=
    1 might look a bit odd and thats because we'll need a third return value
    in case of requiring an allocation.
    
    I've made the change to deferred mode on the glock to ensure flushing
    read caches on other nodes. I notice that (using blktrace to look at
    whats going on) we appear to do a better job of large I/Os than ext3
    after this patch (in terms of not splitting up the I/Os).
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Wendy Cheng <wcheng@redhat.com>

commit 1271e5ed072e5485738437e76b62165c9ad83437
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:40:26 2006 -0600

    [DLM] fix lost flags in stub replies
    
    When the dlm fakes an unlock/cancel reply from a failed node using a stub
    message struct, it wasn't setting the flags in the stub message.  So, in
    the process of receiving the fake message the lkb flags would be updated
    and cleared from the zero flags in the message.  The problem observed in
    tests was the loss of the USER flag which caused the dlm to think a user
    lock was a kernel lock and subsequently fail an assertion checking the
    validity of the ast/callback field.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 31f281641788d3bc4ec5af375e6d455be7ea2d82
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:39:20 2006 -0600

    [DLM] fix receive_request() lvb copying
    
    LVB's are not sent as part of new requests, but the code receiving the
    request was copying data into the lvb anyway.  The space in the message
    where it mistakenly thought the lvb lived actually contained the resource
    name, so it wound up incorrectly copying this name data into the lvb.  Fix
    is to just create the lvb, not copy junk into it.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit c2ab6a61dde59068bc9e87cb5ece4cc0d3f05574
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:38:45 2006 -0600

    [DLM] fix send_args() lvb copying
    
    The send_args() function is used to copy parameters into a message for a
    number different message types.  Only some of those types are set up
    beforehand (in create_message) to include space for sending lvb data.
    send_args was wrongly copying the lvb for all message types as long as the
    lock had an lvb.  This means that the lvb data was being written past the
    end of the message into unknown space.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 6fc73f7e0d3289203e535e51a73f6072c4d6e719
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:37:55 2006 -0600

    [DLM] add version check
    
    Check if we receive a message from another lockspace member running a
    version of the dlm with an incompatible inter-node message protocol.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 1a5afd59eed3a69e2cdc572a0b9a52763c9197ac
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:37:16 2006 -0600

    [DLM] fix old rcom messages
    
    A reply to a recovery message will often be received after the relevant
    recovery sequence has aborted and the next recovery sequence has begun.
    We need to ignore replies to these old messages from the previous
    recovery.  There's already a way to do this for synchronous recovery
    requests using the rc_id number, but not for async.
    
    Each recovery sequence already has a locally unique sequence number
    associated with it.  This patch adds a field to the rcom (recovery
    message) structure where this recovery sequence number can be placed,
    rc_seq.  When a node sends a reply to a recovery request, it copies the
    rc_seq number it received into rc_seq_reply.  When the first node receives
    the reply to its recovery message, it will check whether rc_seq_reply
    matches the current recovery sequence number, ls_recover_seq, and if not
    then it ignores the old reply.
    
    An old, inadequate approach to filtering out old replies (checking if the
    current stage of recovery has moved back to the start) has been removed
    from two spots.
    
    The protocol version number is changed to reflect the different rcom
    structures.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 2d7531e26c5c77f5ec5c3fd4335e18836dcfa8b1
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 13 10:36:37 2006 -0600

    [DLM] fix resend rcom lock
    
    There's a chance the new master of resource hasn't learned it's the new
    master before another node sends it a lock during recovery.  The node
    sending the lock needs to resend if this happens.
    
    - A sends a master lookup for resource R to C
    - B sends a master lookup for resource R to C
    - C receives A's lookup, assigns A to be master of R and
      sends a reply back to A
    - C receives B's lookup and sends a reply back to B saying
      that A is the master
    - B receives lookup reply from C and sends its lock for R to A
    - A receives lock from B, doesn't think it's the master of R
      and sends an error back to B
    - A receives lookup reply from C and becomes master of R
    - B gets error back from A and resends its lock back to A
      (this resending is what this patch does)
    - A receives lock from B, it now sees it's the master of R
      and takes the lock
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit 1d75e2cce5de74efeb3e5bb9f556ca872428af0d
Author: Steven Whitehouse <swhiteho@redhat.com>
Date:   Tue Dec 12 10:16:25 2006 +0000

    [GFS2] Fix Kconfig
    
    Here is a patch to fix up the Kconfig so that we don't land up with
    problems when people disable the NET subsystem.  Thanks for all the hints and
    suggestions that people have sent me regarding this.
    
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
    Cc: Aleksandr Koltsoff <czr@iki.fi>
    Cc: Toralf Förster <toralf.foerster@gmx.de>
    Cc: Randy Dunlap <randy.dunlap@oracle.com>
    Cc: Adrian Bunk <bunk@stusta.de>
    Cc: Chris Zubrzycki <chris@middle--earth.org>
    Cc: Patrick Caulfield <pcaulfie@redhat.com>

commit c7e5cf01a8dd7530ed062bbb6a8410d69fb6a771
Author: Patrick Caulfield <pcaulfie@redhat.com>
Date:   Fri Dec 8 14:31:12 2006 -0500

    [DLM] fix compile warning
    
    This patch fixes a compile warning in lowcomms-tcp.c indicating that
    kmem_cache_t is deprecated.
    
    Signed-Off-By: Patrick Caulfield <pcaulfie@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>

commit f252a27aff7b6d152bef2338e957cf80fa5a605c
Author: David Teigland <teigland@redhat.com>
Date:   Wed Dec 6 11:46:33 2006 -0600

    [GFS2] don't try to lockfs after shutdown
    
    If an fs has already been shut down, a lockfs callback should do nothing.
    An fs that's been shut down can't acquire locks or do anything with
    respect to the cluster.
    
    Also, remove FIXME comment in withdraw function.  The missing bits of the
    withdraw procedure are now all done by user space.
    
    Signed-off-by: David Teigland <teigland@redhat.com>
    Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
 fs/dlm/dlm_internal.h |    6 ++-
 fs/dlm/lock.c         |   41 ++++++++++++++++----
 fs/dlm/lowcomms-tcp.c |   24 ------------
 fs/dlm/rcom.c         |   75 ++++++++++++++++++++++++------------
 fs/dlm/util.c         |    4 ++
 fs/gfs2/inode.c       |   20 ++++++++--
 fs/gfs2/inode.h       |    1 
 fs/gfs2/lm.c          |    8 ----
 fs/gfs2/ops_address.c |  102 +++++++++++++++++++++++--------------------------
 fs/gfs2/ops_inode.c   |   25 +++++++++++-
 fs/gfs2/ops_super.c   |    3 +
 11 files changed, 182 insertions(+), 127 deletions(-)

diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 1ee8195..7185a13 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -309,8 +309,8 @@ static inline int rsb_flag(struct dlm_rs
 
 /* dlm_header is first element of all structs sent between nodes */
 
-#define DLM_HEADER_MAJOR	0x00020000
-#define DLM_HEADER_MINOR	0x00000001
+#define DLM_HEADER_MAJOR	0x00030000
+#define DLM_HEADER_MINOR	0x00000000
 
 #define DLM_MSG			1
 #define DLM_RCOM		2
@@ -386,6 +386,8 @@ struct dlm_rcom {
 	uint32_t		rc_type;	/* DLM_RCOM_ */
 	int			rc_result;	/* multi-purpose */
 	uint64_t		rc_id;		/* match reply with request */
+	uint64_t		rc_seq;		/* sender's ls_recover_seq */
+	uint64_t		rc_seq_reply;	/* remote ls_recover_seq */
 	char			rc_buf[0];
 };
 
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 30878de..ed52485 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2144,12 +2144,24 @@ static void send_args(struct dlm_rsb *r,
 	if (lkb->lkb_astaddr)
 		ms->m_asts |= AST_COMP;
 
-	if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
-		memcpy(ms->m_extra, r->res_name, r->res_length);
+	/* compare with switch in create_message; send_remove() doesn't
+	   use send_args() */
 
-	else if (lkb->lkb_lvbptr)
+	switch (ms->m_type) {
+	case DLM_MSG_REQUEST:
+	case DLM_MSG_LOOKUP:
+		memcpy(ms->m_extra, r->res_name, r->res_length);
+		break;
+	case DLM_MSG_CONVERT:
+	case DLM_MSG_UNLOCK:
+	case DLM_MSG_REQUEST_REPLY:
+	case DLM_MSG_CONVERT_REPLY:
+	case DLM_MSG_GRANT:
+		if (!lkb->lkb_lvbptr)
+			break;
 		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
-
+		break;
+	}
 }
 
 static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
@@ -2418,8 +2430,12 @@ static int receive_request_args(struct d
 
 	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););
 
-	if (receive_lvb(ls, lkb, ms))
-		return -ENOMEM;
+	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
+		/* lkb was just created so there won't be an lvb yet */
+		lkb->lkb_lvbptr = allocate_lvb(ls);
+		if (!lkb->lkb_lvbptr)
+			return -ENOMEM;
+	}
 
 	return 0;
 }
@@ -3132,6 +3148,7 @@ static void recover_convert_waiter(struc
 	if (middle_conversion(lkb)) {
 		hold_lkb(lkb);
 		ls->ls_stub_ms.m_result = -EINPROGRESS;
+		ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 		_remove_from_waiters(lkb);
 		_receive_convert_reply(lkb, &ls->ls_stub_ms);
 
@@ -3205,6 +3222,7 @@ void dlm_recover_waiters_pre(struct dlm_
 		case DLM_MSG_UNLOCK:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
+			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_remove_from_waiters(lkb);
 			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
@@ -3213,6 +3231,7 @@ void dlm_recover_waiters_pre(struct dlm_
 		case DLM_MSG_CANCEL:
 			hold_lkb(lkb);
 			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
+			ls->ls_stub_ms.m_flags = lkb->lkb_flags;
 			_remove_from_waiters(lkb);
 			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
 			dlm_put_lkb(lkb);
@@ -3571,6 +3590,14 @@ int dlm_recover_process_copy(struct dlm_
 	lock_rsb(r);
 
 	switch (error) {
+	case -EBADR:
+		/* There's a chance the new master received our lock before
+		   dlm_recover_master_reply(), this wouldn't happen if we did
+		   a barrier between recover_masters and recover_locks. */
+		log_debug(ls, "master copy not ready %x r %lx %s", lkb->lkb_id,
+			  (unsigned long)r, r->res_name);
+		dlm_send_rcom_lock(r, lkb);
+		goto out;
 	case -EEXIST:
 		log_debug(ls, "master copy exists %x", lkb->lkb_id);
 		/* fall through */
@@ -3585,7 +3612,7 @@ int dlm_recover_process_copy(struct dlm_
 	/* an ack for dlm_recover_locks() which waits for replies from
 	   all the locks it sends to new masters */
 	dlm_recovered_lock(r);
-
+ out:
 	unlock_rsb(r);
 	put_rsb(r);
 	dlm_put_lkb(lkb);
diff --git a/fs/dlm/lowcomms-tcp.c b/fs/dlm/lowcomms-tcp.c
index 9be3a44..3b22473 100644
--- a/fs/dlm/lowcomms-tcp.c
+++ b/fs/dlm/lowcomms-tcp.c
@@ -880,22 +880,6 @@ out:
 	return -1;
 }
 
-/* API send message call, may queue the request */
-/* N.B. This is the old interface - use the new one for new calls */
-int lowcomms_send_message(int nodeid, char *buf, int len, gfp_t allocation)
-{
-	struct writequeue_entry *e;
-	char *b;
-
-	e = dlm_lowcomms_get_buffer(nodeid, len, allocation, &b);
-	if (e) {
-		memcpy(b, buf, len);
-		dlm_lowcomms_commit_buffer(e);
-		return 0;
-	}
-	return -ENOBUFS;
-}
-
 /* Look for activity on active sockets */
 static void process_sockets(void)
 {
@@ -1087,14 +1071,6 @@ static int daemons_start(void)
 	return 0;
 }
 
-/*
- * Return the largest buffer size we can cope with.
- */
-int lowcomms_max_buffer_size(void)
-{
-	return PAGE_CACHE_SIZE;
-}
-
 void dlm_lowcomms_stop(void)
 {
 	int i;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 4cc31be..54fba9b 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -56,6 +56,10 @@ static int create_rcom(struct dlm_ls *ls
 
 	rc->rc_type = type;
 
+	spin_lock(&ls->ls_recover_lock);
+	rc->rc_seq = ls->ls_recover_seq;
+	spin_unlock(&ls->ls_recover_lock);
+
 	*mh_ret = mh;
 	*rc_ret = rc;
 	return 0;
@@ -78,8 +82,17 @@ static void make_config(struct dlm_ls *l
 	rf->rf_lsflags = ls->ls_exflags;
 }
 
-static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid)
+static int check_config(struct dlm_ls *ls, struct dlm_rcom *rc, int nodeid)
 {
+	struct rcom_config *rf = (struct rcom_config *) rc->rc_buf;
+
+	if ((rc->rc_header.h_version & 0xFFFF0000) != DLM_HEADER_MAJOR) {
+		log_error(ls, "version mismatch: %x nodeid %d: %x",
+			  DLM_HEADER_MAJOR | DLM_HEADER_MINOR, nodeid,
+			  rc->rc_header.h_version);
+		return -EINVAL;
+	}
+
 	if (rf->rf_lvblen != ls->ls_lvblen ||
 	    rf->rf_lsflags != ls->ls_exflags) {
 		log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x",
@@ -141,8 +154,7 @@ int dlm_rcom_status(struct dlm_ls *ls, i
 		log_debug(ls, "remote node %d not ready", nodeid);
 		rc->rc_result = 0;
 	} else
-		error = check_config(ls, (struct rcom_config *) rc->rc_buf,
-				     nodeid);
+		error = check_config(ls, rc, nodeid);
 	/* the caller looks at rc_result for the remote recovery status */
  out:
 	return error;
@@ -159,6 +171,7 @@ static void receive_rcom_status(struct d
 	if (error)
 		return;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 	rc->rc_result = dlm_recover_status(ls);
 	make_config(ls, (struct rcom_config *) rc->rc_buf);
 
@@ -224,21 +237,7 @@ static void receive_rcom_names(struct dl
 {
 	struct dlm_rcom *rc;
 	struct dlm_mhandle *mh;
-	int error, inlen, outlen;
-	int nodeid = rc_in->rc_header.h_nodeid;
-	uint32_t status = dlm_recover_status(ls);
-
-	/*
-	 * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while
-	 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
-	 * It could only happen in rare cases where we get a late NAMES
-	 * message from a previous instance of recovery.
-	 */
-
-	if (!(status & DLM_RS_NODES)) {
-		log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
-		return;
-	}
+	int error, inlen, outlen, nodeid;
 
 	nodeid = rc_in->rc_header.h_nodeid;
 	inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
@@ -248,6 +247,7 @@ static void receive_rcom_names(struct dl
 	if (error)
 		return;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
 			      nodeid);
@@ -294,6 +294,7 @@ static void receive_rcom_lookup(struct d
 		ret_nodeid = error;
 	rc->rc_result = ret_nodeid;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	send_rcom(ls, mh, rc);
 }
@@ -375,20 +376,13 @@ static void receive_rcom_lock(struct dlm
 
 	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 
 	send_rcom(ls, mh, rc);
 }
 
 static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 {
-	uint32_t status = dlm_recover_status(ls);
-
-	if (!(status & DLM_RS_DIR)) {
-		log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
-			  rc_in->rc_header.h_nodeid);
-		return;
-	}
-
 	dlm_recover_process_copy(ls, rc_in);
 }
 
@@ -415,6 +409,7 @@ static int send_ls_not_ready(int nodeid,
 
 	rc->rc_type = DLM_RCOM_STATUS_REPLY;
 	rc->rc_id = rc_in->rc_id;
+	rc->rc_seq_reply = rc_in->rc_seq;
 	rc->rc_result = -ESRCH;
 
 	rf = (struct rcom_config *) rc->rc_buf;
@@ -426,6 +421,31 @@ static int send_ls_not_ready(int nodeid,
 	return 0;
 }
 
+static int is_old_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
+{
+	uint64_t seq;
+	int rv = 0;
+
+	switch (rc->rc_type) {
+	case DLM_RCOM_STATUS_REPLY:
+	case DLM_RCOM_NAMES_REPLY:
+	case DLM_RCOM_LOOKUP_REPLY:
+	case DLM_RCOM_LOCK_REPLY:
+		spin_lock(&ls->ls_recover_lock);
+		seq = ls->ls_recover_seq;
+		spin_unlock(&ls->ls_recover_lock);
+		if (rc->rc_seq_reply != seq) {
+			log_error(ls, "ignoring old reply %x from %d "
+				      "seq_reply %llx expect %llx",
+				      rc->rc_type, rc->rc_header.h_nodeid,
+				      (unsigned long long)rc->rc_seq_reply,
+				      (unsigned long long)seq);
+			rv = 1;
+		}
+	}
+	return rv;
+}
+
 /* Called by dlm_recvd; corresponds to dlm_receive_message() but special
    recovery-only comms are sent through here. */
 
@@ -454,6 +474,9 @@ void dlm_receive_rcom(struct dlm_header 
 		goto out;
 	}
 
+	if (is_old_reply(ls, rc))
+		goto out;
+
 	if (nodeid != rc->rc_header.h_nodeid) {
 		log_error(ls, "bad rcom nodeid %d from %d",
 			  rc->rc_header.h_nodeid, nodeid);
diff --git a/fs/dlm/util.c b/fs/dlm/util.c
index 767197d..963889c 100644
--- a/fs/dlm/util.c
+++ b/fs/dlm/util.c
@@ -134,6 +134,8 @@ void dlm_rcom_out(struct dlm_rcom *rc)
 	rc->rc_type		= cpu_to_le32(rc->rc_type);
 	rc->rc_result		= cpu_to_le32(rc->rc_result);
 	rc->rc_id		= cpu_to_le64(rc->rc_id);
+	rc->rc_seq		= cpu_to_le64(rc->rc_seq);
+	rc->rc_seq_reply	= cpu_to_le64(rc->rc_seq_reply);
 
 	if (type == DLM_RCOM_LOCK)
 		rcom_lock_out((struct rcom_lock *) rc->rc_buf);
@@ -151,6 +153,8 @@ void dlm_rcom_in(struct dlm_rcom *rc)
 	rc->rc_type		= le32_to_cpu(rc->rc_type);
 	rc->rc_result		= le32_to_cpu(rc->rc_result);
 	rc->rc_id		= le64_to_cpu(rc->rc_id);
+	rc->rc_seq		= le64_to_cpu(rc->rc_seq);
+	rc->rc_seq_reply	= le64_to_cpu(rc->rc_seq_reply);
 
 	if (rc->rc_type == DLM_RCOM_LOCK)
 		rcom_lock_in((struct rcom_lock *) rc->rc_buf);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index d122074..6bc4436 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -281,16 +281,14 @@ out:
 }
 
 /**
- * gfs2_change_nlink - Change nlink count on inode
+ * gfs2_change_nlink_i - Change nlink count on inode
  * @ip: The GFS2 inode
  * @diff: The change in the nlink count required
  *
  * Returns: errno
  */
-
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
+int gfs2_change_nlink_i(struct gfs2_inode *ip, int diff)
 {
-	struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
 	struct buffer_head *dibh;
 	u32 nlink;
 	int error;
@@ -322,6 +320,20 @@ int gfs2_change_nlink(struct gfs2_inode 
 	brelse(dibh);
 	mark_inode_dirty(&ip->i_inode);
 
+	return error;
+}
+
+int gfs2_change_nlink(struct gfs2_inode *ip, int diff)
+{
+	struct gfs2_sbd *sdp = ip->i_inode.i_sb->s_fs_info;
+	int error;
+
+	/* update the nlink */
+	error = gfs2_change_nlink_i(ip, diff);
+	if (error)
+		return error;
+
+	/* return meta data block back to rg */
 	if (ip->i_inode.i_nlink == 0) {
 		struct gfs2_rgrpd *rgd;
 		struct gfs2_holder ri_gh, rg_gh;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index b57f448..85c67cb 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -40,6 +40,7 @@ int gfs2_inode_refresh(struct gfs2_inode
 
 int gfs2_dinode_dealloc(struct gfs2_inode *inode);
 int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+int gfs2_change_nlink_i(struct gfs2_inode *ip, int diff);
 struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 			   int is_root, struct nameidata *nd);
 struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
index effe4a3..e30673d 100644
--- a/fs/gfs2/lm.c
+++ b/fs/gfs2/lm.c
@@ -104,15 +104,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sd
 	vprintk(fmt, args);
 	va_end(args);
 
-	fs_err(sdp, "about to withdraw from the cluster\n");
+	fs_err(sdp, "about to withdraw this file system\n");
 	BUG_ON(sdp->sd_args.ar_debug);
 
-
-	fs_err(sdp, "waiting for outstanding I/O\n");
-
-	/* FIXME: suspend dm device so oustanding bio's complete
-	   and all further io requests fail */
-
 	fs_err(sdp, "telling LM to withdraw\n");
 	gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
 	fs_err(sdp, "withdrawn\n");
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index d8d69a7..37bfeb9 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -256,7 +256,7 @@ out_unlock:
  *    the page lock and the glock) and return having done no I/O. Its
  *    obviously not something we'd want to do on too regular a basis.
  *    Any I/O we ignore at this time will be done via readpage later.
- * 2. We have to handle stuffed files here too.
+ * 2. We don't handle stuffed files here we let readpage do the honours.
  * 3. mpage_readpages() does most of the heavy lifting in the common case.
  * 4. gfs2_get_block() is relied upon to set BH_Boundary in the right places.
  * 5. We use LM_FLAG_TRY_1CB here, effectively we then have lock-ahead as
@@ -269,8 +269,7 @@ static int gfs2_readpages(struct file *f
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct gfs2_holder gh;
-	unsigned page_idx;
-	int ret;
+	int ret = 0;
 	int do_unlock = 0;
 
 	if (likely(file != &gfs2_internal_file_sentinel)) {
@@ -289,29 +288,8 @@ static int gfs2_readpages(struct file *f
 			goto out_unlock;
 	}
 skip_lock:
-	if (gfs2_is_stuffed(ip)) {
-		struct pagevec lru_pvec;
-		pagevec_init(&lru_pvec, 0);
-		for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-			struct page *page = list_entry(pages->prev, struct page, lru);
-			prefetchw(&page->flags);
-			list_del(&page->lru);
-			if (!add_to_page_cache(page, mapping,
-					       page->index, GFP_KERNEL)) {
-				ret = stuffed_readpage(ip, page);
-				unlock_page(page);
-				if (!pagevec_add(&lru_pvec, page))
-					 __pagevec_lru_add(&lru_pvec);
-			} else {
-				page_cache_release(page);
-			}
-		}
-		pagevec_lru_add(&lru_pvec);
-		ret = 0;
-	} else {
-		/* What we really want to do .... */
+	if (!gfs2_is_stuffed(ip))
 		ret = mpage_readpages(mapping, pages, nr_pages, gfs2_get_block);
-	}
 
 	if (do_unlock) {
 		gfs2_glock_dq_m(1, &gh);
@@ -594,6 +572,36 @@ static void gfs2_invalidatepage(struct p
 	return;
 }
 
+/**
+ * gfs2_ok_for_dio - check that dio is valid on this file
+ * @ip: The inode
+ * @rw: READ or WRITE
+ * @offset: The offset at which we are reading or writing
+ *
+ * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o)
+ *          1 (to accept the i/o request)
+ */
+static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset)
+{
+	/*
+	 * Should we return an error here? I can't see that O_DIRECT for
+	 * a journaled file makes any sense. For now we'll silently fall
+	 * back to buffered I/O, likewise we do the same for stuffed
+	 * files since they are (a) small and (b) unaligned.
+	 */
+	if (gfs2_is_jdata(ip))
+		return 0;
+
+	if (gfs2_is_stuffed(ip))
+		return 0;
+
+	if (offset > i_size_read(&ip->i_inode))
+		return 0;
+	return 1;
+}
+
+
+
 static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
@@ -604,42 +612,28 @@ static ssize_t gfs2_direct_IO(int rw, st
 	struct gfs2_holder gh;
 	int rv;
 
-	if (rw == READ)
-		mutex_lock(&inode->i_mutex);
 	/*
-	 * Shared lock, even if its a write, since we do no allocation
-	 * on this path. All we need change is atime.
+	 * Deferred lock, even if its a write, since we do no allocation
+	 * on this path. All we need change is atime, and this lock mode
+	 * ensures that other nodes have flushed their buffered read caches
+	 * (i.e. their page cache entries for this inode). We do not,
+	 * unfortunately have the option of only flushing a range like
+	 * the VFS does.
 	 */
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh);
+	gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh);
 	rv = gfs2_glock_nq_atime(&gh);
 	if (rv)
-		goto out;
-
-	if (offset > i_size_read(inode))
-		goto out;
-
-	/*
-	 * Should we return an error here? I can't see that O_DIRECT for
-	 * a journaled file makes any sense. For now we'll silently fall
-	 * back to buffered I/O, likewise we do the same for stuffed
-	 * files since they are (a) small and (b) unaligned.
-	 */
-	if (gfs2_is_jdata(ip))
-		goto out;
-
-	if (gfs2_is_stuffed(ip))
-		goto out;
-
-	rv = blockdev_direct_IO_own_locking(rw, iocb, inode,
-					    inode->i_sb->s_bdev,
-					    iov, offset, nr_segs,
-					    gfs2_get_block_direct, NULL);
+		return rv;
+	rv = gfs2_ok_for_dio(ip, rw, offset);
+	if (rv != 1)
+		goto out; /* dio not valid, fall back to buffered i/o */
+
+	rv = blockdev_direct_IO_no_locking(rw, iocb, inode, inode->i_sb->s_bdev,
+					   iov, offset, nr_segs,
+					   gfs2_get_block_direct, NULL);
 out:
 	gfs2_glock_dq_m(1, &gh);
 	gfs2_holder_uninit(&gh);
-	if (rw == READ)
-		mutex_unlock(&inode->i_mutex);
-
 	return rv;
 }
 
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 636dda4..919e894 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -553,6 +553,7 @@ static int gfs2_rename(struct inode *odi
 	int alloc_required;
 	unsigned int x;
 	int error;
+	struct gfs2_rgrpd *rgd;
 
 	if (ndentry->d_inode) {
 		nip = GFS2_I(ndentry->d_inode);
@@ -684,12 +685,12 @@ static int gfs2_rename(struct inode *odi
 		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
 					 al->al_rgd->rd_ri.ri_length +
 					 4 * RES_DINODE + 4 * RES_LEAF +
-					 RES_STATFS + RES_QUOTA, 0);
+					 RES_STATFS + RES_QUOTA + 1, 0);
 		if (error)
 			goto out_ipreserv;
 	} else {
 		error = gfs2_trans_begin(sdp, 4 * RES_DINODE +
-					 5 * RES_LEAF, 0);
+					 5 * RES_LEAF + 1, 0);
 		if (error)
 			goto out_gunlock;
 	}
@@ -703,7 +704,25 @@ static int gfs2_rename(struct inode *odi
 			error = gfs2_dir_del(ndip, &ndentry->d_name);
 			if (error)
 				goto out_end_trans;
-			error = gfs2_change_nlink(nip, -1);
+			error = gfs2_change_nlink_i(nip, -1);
+			if ((!error) && (nip->i_inode.i_nlink == 0)) {
+				error = -EIO;
+				rgd = gfs2_blk2rgrpd(sdp, nip->i_num.no_addr);
+				if (rgd) {
+					struct gfs2_holder nlink_rg_gh;
+					if (rgd != nip->i_alloc.al_rgd)
+						error = gfs2_glock_nq_init(
+						rgd->rd_gl, LM_ST_EXCLUSIVE,
+						0, &nlink_rg_gh);
+					else
+						error = 0;
+                			if (!error) {
+						gfs2_unlink_di(&nip->i_inode);
+						if (rgd != nip->i_alloc.al_rgd)
+							gfs2_glock_dq_uninit(&nlink_rg_gh);
+					}
+				}
+			}
 		}
 		if (error)
 			goto out_end_trans;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 7685b46..b283783 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -173,6 +173,9 @@ static void gfs2_write_super_lockfs(stru
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
 
+	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return;
+
 	for (;;) {
 		error = gfs2_freeze_fs(sdp);
 		if (!error)