GIT 2ce3808fb67ec14d2f3ed454c5720112b606e3ab git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git#ALL

commit b34dc19b561f43cbee1cc925eae30227f0c8e763
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Wed May 9 15:16:19 2007 -0700

    ocfs2: shared writeable mmap
    
    Implement cluster consistent shared writeable mappings using the
    ->page_mkwrite() callback.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit acae311c6106029157613f3e5da3bbd5617b5a0a
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Wed May 9 15:14:45 2007 -0700

    ocfs2: factor out write aops into nolock variants
    
    ocfs2_mkwrite() will want this so that it can add some mmap specific checks
    before asking for a write.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit 8e6b3ced694acab573d98ee53ce0bdde7232547e
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Tue May 8 17:47:32 2007 -0700

    ocfs2: rework ocfs2_buffered_write_cluster()
    
    Use some ideas from the new-aops patch series and turn
    ocfs2_buffered_write_cluster() into a 2 stage operation with the caller
    copying data in between. The code now understands multiple cluster writes as
    a result of having to deal with a full page write for greater than 4k pages.
    
    This sets us up to easily call into the write path during ->page_mkwrite().
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit 0bbd349a498a140107b1e960e0881dd8486e49e4
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Wed May 9 13:40:18 2007 -0700

    ocfs2: take ip_alloc_sem during entire truncate
    
    Use of the alloc sem during truncate was too narrow - we want to protect
    the i_size change and page truncation against mmap now.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit a96894dcce4bac0efda45c4f687b826ff46f7ee2
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Mon May 14 11:39:40 2007 -0700

    ocfs2: unmap_mapping_range() in ocfs2_truncate()
    
    We weren't calling this before, but since ocfs2 handles the entire truncate
    operation, we should.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit a9b3ad77134ccbe972c7dec360facb9a8634e047
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Mon May 14 11:38:51 2007 -0700

    ocfs2: trylock in ocfs2_readpage()
    
    Similarly to the page lock / cluster lock inversion in ocfs2_readpage, we
    can deadlock on ip_alloc_sem. We can down_read_trylock() instead and just
    return AOP_TRUNCATED_PAGE if the operation fails.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit c52713f291eefa5c03ff60688641c09b2f8e24f0
Author: Mark Fasheh <mark.fasheh@oracle.com>
Date:   Wed May 9 17:34:26 2007 -0700

    ocfs2: fix inode leak
    
    We weren't cleaning up our inode reference on error in
    ocfs2_reserve_local_alloc_bits(). Add a check for error return and iput() if
    need be. Move the code to set the alloc context inode info to the end of the
    function so we don't have any possibility of passing back a bad pointer.
    
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>

commit ee7dd5048fa0b91e51f46fe715f45bc9a0448523
Author: Nate Diller <nate.diller@gmail.com>
Date:   Thu May 10 22:56:01 2007 -0700

    [PATCH] ocfs2: use zero_user_page
    
    Use zero_user_page() instead of open-coding it.
    
    Signed-off-by: Nate Diller <nate.diller@gmail.com>
    Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
    Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
 fs/ocfs2/alloc.c      |    3 
 fs/ocfs2/aops.c       |  933 ++++++++++++++++++++++++++++++-------------------
 fs/ocfs2/aops.h       |   61 +--
 fs/ocfs2/file.c       |  140 ++++---
 fs/ocfs2/localalloc.c |    7 
 fs/ocfs2/mmap.c       |  167 ++++++++-
 6 files changed, 816 insertions(+), 495 deletions(-)

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 19712a7..02b6e7a 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -3631,8 +3631,6 @@ int ocfs2_commit_truncate(struct ocfs2_s
 
 	mlog_entry_void();
 
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-
 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
@@ -3754,7 +3752,6 @@ start:
 	goto start;
 
 bail:
-	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 8e7cafb..b8869fd 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -222,7 +222,10 @@ static int ocfs2_readpage(struct file *f
 		goto out;
 	}
 
-	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem) == 0) {
+		ret = AOP_TRUNCATED_PAGE;
+		goto out_meta_unlock;
+	}
 
 	/*
 	 * i_size might have just been updated as we grabed the meta lock.  We
@@ -235,10 +238,7 @@ static int ocfs2_readpage(struct file *f
 	 * XXX sys_readahead() seems to get that wrong?
 	 */
 	if (start >= i_size_read(inode)) {
-		char *addr = kmap(page);
-		memset(addr, 0, PAGE_SIZE);
-		flush_dcache_page(page);
-		kunmap(page);
+		zero_user_page(page, 0, PAGE_SIZE, KM_USER0);
 		SetPageUptodate(page);
 		ret = 0;
 		goto out_alloc;
@@ -258,6 +258,7 @@ static int ocfs2_readpage(struct file *f
 	ocfs2_data_unlock(inode, 0);
 out_alloc:
 	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+out_meta_unlock:
 	ocfs2_meta_unlock(inode, 0);
 out:
 	if (unlock)
@@ -683,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *p
 	     bh = bh->b_this_page, block_start += bsize) {
 		block_end = block_start + bsize;
 
+		clear_buffer_new(bh);
+
 		/*
 		 * Ignore blocks outside of our i/o range -
 		 * they may belong to unallocated clusters.
@@ -697,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *p
 		 * For an allocating write with cluster size >= page
 		 * size, we always write the entire page.
 		 */
-
-		if (buffer_new(bh))
-			clear_buffer_new(bh);
+		if (new)
+			set_buffer_new(bh);
 
 		if (!buffer_mapped(bh)) {
 			map_bh(bh, inode->i_sb, *p_blkno);
@@ -760,217 +762,234 @@ next_bh:
 	return ret;
 }
 
+#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE)
+#define OCFS2_MAX_CTXT_PAGES	1
+#else
+#define OCFS2_MAX_CTXT_PAGES	(OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE)
+#endif
+
+#define OCFS2_MAX_CLUSTERS_PER_PAGE	(PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE)
+
 /*
- * This will copy user data from the buffer page in the splice
- * context.
- *
- * For now, we ignore SPLICE_F_MOVE as that would require some extra
- * communication out all the way to ocfs2_write().
+ * Describe the state of a single cluster to be written to.
  */
-int ocfs2_map_and_write_splice_data(struct inode *inode,
-				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-				  unsigned int *ret_from, unsigned int *ret_to)
-{
-	int ret;
-	unsigned int to, from, cluster_start, cluster_end;
-	char *src, *dst;
-	struct ocfs2_splice_write_priv *sp = wc->w_private;
-	struct pipe_buffer *buf = sp->s_buf;
-	unsigned long bytes, src_from;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+struct ocfs2_write_cluster_desc {
+	u32		c_cpos;
+	u32		c_phys;
+	/*
+	 * Give this a unique field because c_phys eventually gets
+	 * filled.
+	 */
+	unsigned	c_new;
+};
 
-	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-					&cluster_end);
+struct ocfs2_write_ctxt {
+	/* Logical cluster position / len of write */
+	u32				w_cpos;
+	u32				w_clen;
 
-	from = sp->s_offset;
-	src_from = sp->s_buf_offset;
-	bytes = wc->w_count;
+	struct ocfs2_write_cluster_desc	w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE];
 
-	if (wc->w_large_pages) {
-		/*
-		 * For cluster size < page size, we have to
-		 * calculate pos within the cluster and obey
-		 * the rightmost boundary.
-		 */
-		bytes = min(bytes, (unsigned long)(osb->s_clustersize
-				   - (wc->w_pos & (osb->s_clustersize - 1))));
-	}
-	to = from + bytes;
+	/*
+	 * This is true if page_size > cluster_size.
+	 *
+	 * It triggers a set of special cases during write which might
+	 * have to deal with allocating writes to partial pages.
+	 */
+	unsigned int			w_large_pages;
 
-	if (wc->w_this_page_new)
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
-	else
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    from, to, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
+	/*
+	 * Pages involved in this write.
+	 *
+	 * w_target_page is the page being written to by the user.
+	 *
+	 * w_pages is an array of pages which always contains
+	 * w_target_page, and in the case of an allocating write with
+	 * page_size < cluster size, it will contain zero'd and mapped
+	 * pages adjacent to w_target_page which need to be written
+	 * out in so that future reads from that region will get
+	 * zero's.
+	 */
+	struct page			*w_pages[OCFS2_MAX_CTXT_PAGES];
+	unsigned int			w_num_pages;
+	struct page			*w_target_page;
+
+	/*
+	 * ocfs2_write_end() uses this to know what the real range to
+	 * write in the target should be.
+	 */
+	unsigned int			w_target_from;
+	unsigned int			w_target_to;
+
+	/*
+	 * We could use journal_current_handle() but this is cleaner,
+	 * IMHO -Mark
+	 */
+	handle_t			*w_handle;
+
+	struct buffer_head		*w_di_bh;
+};
+
+static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
+{
+	int i;
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		if (wc->w_pages[i] == NULL)
+			continue;
+
+		unlock_page(wc->w_pages[i]);
+		mark_page_accessed(wc->w_pages[i]);
+		page_cache_release(wc->w_pages[i]);
 	}
 
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from > osb->s_clustersize);
-	BUG_ON(to > osb->s_clustersize);
+	brelse(wc->w_di_bh);
+	kfree(wc);
+}
 
-	src = buf->ops->map(sp->s_pipe, buf, 1);
-	dst = kmap_atomic(wc->w_this_page, KM_USER1);
-	memcpy(dst + from, src + src_from, bytes);
-	kunmap_atomic(wc->w_this_page, KM_USER1);
-	buf->ops->unmap(sp->s_pipe, buf, src);
+static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp,
+				  struct ocfs2_super *osb, loff_t pos,
+				  unsigned len, struct buffer_head *di_bh)
+{
+	struct ocfs2_write_ctxt *wc;
 
-	wc->w_finished_copy = 1;
+	wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS);
+	if (!wc)
+		return -ENOMEM;
 
-	*ret_from = from;
-	*ret_to = to;
-out:
+	wc->w_cpos = pos >> osb->s_clustersize_bits;
+	wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len);
+	get_bh(di_bh);
+	wc->w_di_bh = di_bh;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
 
-	return bytes ? (unsigned int)bytes : ret;
+	*wcp = wc;
+
+	return 0;
 }
 
 /*
- * This will copy user data from the iovec in the buffered write
- * context.
+ * If a page has any new buffers, zero them out here, and mark them uptodate
+ * and dirty so they'll be written out (in order to prevent uninitialised
+ * block data from leaking). And clear the new bit.
  */
-int ocfs2_map_and_write_user_data(struct inode *inode,
-				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
-				  unsigned int *ret_from, unsigned int *ret_to)
+static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to)
 {
-	int ret;
-	unsigned int to, from, cluster_start, cluster_end;
-	unsigned long bytes, src_from;
-	char *dst;
-	struct ocfs2_buffered_write_priv *bp = wc->w_private;
-	const struct iovec *cur_iov = bp->b_cur_iov;
-	char __user *buf;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int block_start, block_end;
+	struct buffer_head *head, *bh;
 
-	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
-					&cluster_end);
+	BUG_ON(!PageLocked(page));
+	if (!page_has_buffers(page))
+		return;
 
-	buf = cur_iov->iov_base + bp->b_cur_off;
-	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+	bh = head = page_buffers(page);
+	block_start = 0;
+	do {
+		block_end = block_start + bh->b_size;
+
+		if (buffer_new(bh)) {
+			if (block_end > from && block_start < to) {
+				if (!PageUptodate(page)) {
+					unsigned start, end;
+					void *kaddr;
+
+					start = max(from, block_start);
+					end = min(to, block_end);
+
+					kaddr = kmap_atomic(page, KM_USER0);
+					memset(kaddr+start, 0, end - start);
+					flush_dcache_page(page);
+					kunmap_atomic(kaddr, KM_USER0);
+					set_buffer_uptodate(bh);
+				}
+
+				clear_buffer_new(bh);
+				mark_buffer_dirty(bh);
+			}
+		}
 
-	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+}
 
-	/*
-	 * This is a lot of comparisons, but it reads quite
-	 * easily, which is important here.
-	 */
-	/* Stay within the src page */
-	bytes = PAGE_SIZE - src_from;
-	/* Stay within the vector */
-	bytes = min(bytes,
-		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
-	/* Stay within count */
-	bytes = min(bytes, (unsigned long)wc->w_count);
-	/*
-	 * For clustersize > page size, just stay within
-	 * target page, otherwise we have to calculate pos
-	 * within the cluster and obey the rightmost
-	 * boundary.
-	 */
-	if (wc->w_large_pages) {
-		/*
-		 * For cluster size < page size, we have to
-		 * calculate pos within the cluster and obey
-		 * the rightmost boundary.
-		 */
-		bytes = min(bytes, (unsigned long)(osb->s_clustersize
-				   - (wc->w_pos & (osb->s_clustersize - 1))));
-	} else {
-		/*
-		 * cluster size > page size is the most common
-		 * case - we just stay within the target page
-		 * boundary.
-		 */
-		bytes = min(bytes, PAGE_CACHE_SIZE - from);
-	}
+/*
+ * Only called when we have a failure during allocating write to write
+ * zero's to the newly allocated region.
+ */
+static void ocfs2_write_failure(struct inode *inode,
+				struct ocfs2_write_ctxt *wc,
+				loff_t user_pos, unsigned user_len)
+{
+	int i;
+	unsigned from, to;
+	struct page *tmppage;
 
-	to = from + bytes;
+	ocfs2_zero_new_buffers(wc->w_target_page, user_pos, user_len);
 
-	if (wc->w_this_page_new)
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
-	else
-		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
-					    from, to, 0);
-	if (ret) {
-		mlog_errno(ret);
-		goto out;
+	if (wc->w_large_pages) {
+		from = wc->w_target_from;
+		to = wc->w_target_to;
+	} else {
+		from = 0;
+		to = PAGE_CACHE_SIZE;
 	}
 
-	BUG_ON(from > PAGE_CACHE_SIZE);
-	BUG_ON(to > PAGE_CACHE_SIZE);
-	BUG_ON(from > osb->s_clustersize);
-	BUG_ON(to > osb->s_clustersize);
-
-	dst = kmap(wc->w_this_page);
-	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
-	kunmap(wc->w_this_page);
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
 
-	/*
-	 * XXX: This is slow, but simple. The caller of
-	 * ocfs2_buffered_write_cluster() is responsible for
-	 * passing through the iovecs, so it's difficult to
-	 * predict what our next step is in here after our
-	 * initial write. A future version should be pushing
-	 * that iovec manipulation further down.
-	 *
-	 * By setting this, we indicate that a copy from user
-	 * data was done, and subsequent calls for this
-	 * cluster will skip copying more data.
-	 */
-	wc->w_finished_copy = 1;
+		if (ocfs2_should_order_data(inode))
+			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+					  from, to, NULL,
+					  ocfs2_journal_dirty_data);
 
-	*ret_from = from;
-	*ret_to = to;
-out:
-
-	return bytes ? (unsigned int)bytes : ret;
+		block_commit_write(tmppage, from, to);
+	}
 }
 
-/*
- * Map, fill and write a page to disk.
- *
- * The work of copying data is done via callback.  Newly allocated
- * pages which don't take user data will be zero'd (set 'new' to
- * indicate an allocating write)
- *
- * Returns a negative error code or the number of bytes copied into
- * the page.
- */
-static int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
-				 u64 *p_blkno, struct page *page,
-				 struct ocfs2_write_ctxt *wc, int new)
+static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno,
+					struct ocfs2_write_ctxt *wc,
+					struct page *page, u32 cpos,
+					loff_t user_pos, unsigned user_len,
+					int new)
 {
-	int ret, copied = 0;
-	unsigned int from = 0, to = 0;
+	int ret;
+	unsigned int map_from = 0, map_to = 0;
 	unsigned int cluster_start, cluster_end;
-	unsigned int zero_from = 0, zero_to = 0;
+	unsigned int user_data_from = 0, user_data_to = 0;
 
-	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos,
 					&cluster_start, &cluster_end);
 
-	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
-	    && !wc->w_finished_copy) {
-
-		wc->w_this_page = page;
-		wc->w_this_page_new = new;
-		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
-		if (ret < 0) {
+	if (page == wc->w_target_page) {
+		map_from = user_pos & (PAGE_CACHE_SIZE - 1);
+		map_to = map_from + user_len;
+
+		if (new)
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    cluster_start, cluster_end,
+						    new);
+		else
+			ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+						    map_from, map_to, new);
+		if (ret) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		copied = ret;
-
-		zero_from = from;
-		zero_to = to;
+		user_data_from = map_from;
+		user_data_to = map_to;
 		if (new) {
-			from = cluster_start;
-			to = cluster_end;
+			map_from = cluster_start;
+			map_to = cluster_end;
 		}
+
+		wc->w_target_from = map_from;
+		wc->w_target_to = map_to;
 	} else {
 		/*
 		 * If we haven't allocated the new page yet, we
@@ -979,11 +998,11 @@ static int ocfs2_write_data_page(struct 
 		 */
 		BUG_ON(!new);
 
-		from = cluster_start;
-		to = cluster_end;
+		map_from = cluster_start;
+		map_to = cluster_end;
 
 		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
-					    cluster_start, cluster_end, 1);
+					    cluster_start, cluster_end, new);
 		if (ret) {
 			mlog_errno(ret);
 			goto out;
@@ -1002,108 +1021,110 @@ static int ocfs2_write_data_page(struct 
 	 */
 	if (new && !PageUptodate(page))
 		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
-					 wc->w_cpos, zero_from, zero_to);
+					 cpos, user_data_from, user_data_to);
 
 	flush_dcache_page(page);
 
-	if (ocfs2_should_order_data(inode)) {
-		ret = walk_page_buffers(handle,
-					page_buffers(page),
-					from, to, NULL,
-					ocfs2_journal_dirty_data);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-
-	/*
-	 * We don't use generic_commit_write() because we need to
-	 * handle our own i_size update.
-	 */
-	ret = block_commit_write(page, from, to);
-	if (ret)
-		mlog_errno(ret);
 out:
-
-	return copied ? copied : ret;
+	return ret;
 }
 
 /*
- * Do the actual write of some data into an inode. Optionally allocate
- * in order to fulfill the write.
- *
- * cpos is the logical cluster offset within the file to write at
- *
- * 'phys' is the physical mapping of that offset. a 'phys' value of
- * zero indicates that allocation is required. In this case, data_ac
- * and meta_ac should be valid (meta_ac can be null if metadata
- * allocation isn't required).
+ * This function will only grab one clusters worth of pages.
  */
-static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
-			   struct buffer_head *di_bh,
-			   struct ocfs2_alloc_context *data_ac,
-			   struct ocfs2_alloc_context *meta_ac,
-			   struct ocfs2_write_ctxt *wc)
+static int ocfs2_grab_pages_for_write(struct address_space *mapping,
+				      struct ocfs2_write_ctxt *wc,
+				      u32 cpos, loff_t user_pos, int new,
+				      struct page *mmap_page)
 {
-	int ret, i, numpages = 1, new;
-	unsigned int copied = 0;
-	u32 tmp_pos;
-	u64 v_blkno, p_blkno;
-	struct address_space *mapping = file->f_mapping;
+	int ret = 0, i;
+	unsigned long start, target_index, index;
 	struct inode *inode = mapping->host;
-	unsigned long index, start;
-	struct page **cpages;
 
-	new = phys == 0 ? 1 : 0;
+	target_index = user_pos >> PAGE_CACHE_SHIFT;
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
 	 * non allocating write, we just change the one
 	 * page. Otherwise, we'll need a whole clusters worth.
 	 */
-	if (new)
-		numpages = ocfs2_pages_per_cluster(inode->i_sb);
-
-	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
-	if (!cpages) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	/*
-	 * Fill our page array first. That way we've grabbed enough so
-	 * that we can zero and flush if we error after adding the
-	 * extent.
-	 */
 	if (new) {
-		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
-							   wc->w_cpos);
-		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
 	} else {
-		start = wc->w_pos >> PAGE_CACHE_SHIFT;
-		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+		wc->w_num_pages = 1;
+		start = target_index;
 	}
 
-	for(i = 0; i < numpages; i++) {
+	for(i = 0; i < wc->w_num_pages; i++) {
 		index = start + i;
 
-		cpages[i] = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!cpages[i]) {
-			ret = -ENOMEM;
-			mlog_errno(ret);
-			goto out;
+		if (index == target_index && mmap_page) {
+			/*
+			 * ocfs2_pagemkwrite() is a little different
+			 * and wants us to directly use the page
+			 * passed in.
+			 */
+			lock_page(mmap_page);
+
+			if (mmap_page->mapping != mapping) {
+				unlock_page(mmap_page);
+				/*
+				 * Sanity check - the locking in
+				 * ocfs2_pagemkwrite() should ensure
+				 * that this code doesn't trigger.
+				 */
+				ret = -EINVAL;
+				mlog_errno(ret);
+				goto out;
+			}
+
+			page_cache_get(mmap_page);
+			wc->w_pages[i] = mmap_page;
+		} else {
+			wc->w_pages[i] = find_or_create_page(mapping, index,
+							     GFP_NOFS);
+			if (!wc->w_pages[i]) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
 		}
+
+		if (index == target_index)
+			wc->w_target_page = wc->w_pages[i];
 	}
+out:
+	return ret;
+}
+
+/*
+ * Prepare a single cluster for write one cluster into the file.
+ */
+static int ocfs2_write_cluster(struct address_space *mapping,
+			       u32 phys, struct ocfs2_alloc_context *data_ac,
+			       struct ocfs2_alloc_context *meta_ac,
+			       struct ocfs2_write_ctxt *wc, u32 cpos,
+			       loff_t user_pos, unsigned user_len)
+{
+	int ret, i, new;
+	u64 v_blkno, p_blkno;
+	struct inode *inode = mapping->host;
+
+	new = phys == 0 ? 1 : 0;
 
 	if (new) {
+		u32 tmp_pos;
+
 		/*
 		 * This is safe to call with the page locks - it won't take
 		 * any additional semaphores or cluster locks.
 		 */
-		tmp_pos = wc->w_cpos;
+		tmp_pos = cpos;
 		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
-						 &tmp_pos, 1, di_bh, handle,
-						 data_ac, meta_ac, NULL);
+						 &tmp_pos, 1, wc->w_di_bh,
+						 wc->w_handle, data_ac,
+						 meta_ac, NULL);
 		/*
 		 * This shouldn't happen because we must have already
 		 * calculated the correct meta data allocation required. The
@@ -1120,159 +1141,349 @@ static ssize_t ocfs2_write(struct file *
 			mlog_errno(ret);
 			goto out;
 		}
+
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos);
+	} else {
+		v_blkno = user_pos >> inode->i_sb->s_blocksize_bits;
 	}
 
+	/*
+	 * The only reason this should fail is due to an inability to
+	 * find the extent added.
+	 */
 	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
 					  NULL);
 	if (ret < 0) {
-
-		/*
-		 * XXX: Should we go readonly here?
-		 */
-
-		mlog_errno(ret);
+		ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, "
+			    "at logical block %llu",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)v_blkno);
 		goto out;
 	}
 
 	BUG_ON(p_blkno == 0);
 
-	for(i = 0; i < numpages; i++) {
-		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
-					    wc, new);
-		if (ret < 0) {
-			mlog_errno(ret);
-			goto out;
-		}
+	for(i = 0; i < wc->w_num_pages; i++) {
+		int tmpret;
 
-		copied += ret;
+		tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc,
+						      wc->w_pages[i], cpos,
+						      user_pos, user_len, new);
+		if (tmpret) {
+			mlog_errno(tmpret);
+			if (ret == 0)
+				tmpret = ret;
+		}
 	}
 
+	/*
+	 * We only have cleanup to do in case of allocating write.
+	 */
+	if (ret && new)
+		ocfs2_write_failure(inode, wc, user_pos, user_len);
+
 out:
-	for(i = 0; i < numpages; i++) {
-		unlock_page(cpages[i]);
-		mark_page_accessed(cpages[i]);
-		page_cache_release(cpages[i]);
-	}
-	kfree(cpages);
 
-	return copied ? copied : ret;
+	return ret;
 }
 
-static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
-				  struct ocfs2_super *osb, loff_t pos,
-				  size_t count, ocfs2_page_writer *cb,
-				  void *cb_priv)
+/*
+ * ocfs2_write_end() wants to know which parts of the target page it
+ * should complete the write on. It's easiest to compute them ahead of
+ * time when a more complete view of the write is available.
+ */
+static void ocfs2_set_target_boundaries(struct ocfs2_super *osb,
+					struct ocfs2_write_ctxt *wc,
+					loff_t pos, unsigned len, int alloc)
 {
-	wc->w_count = count;
-	wc->w_pos = pos;
-	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
-	wc->w_finished_copy = 0;
+	struct ocfs2_write_cluster_desc *desc;
 
-	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
-		wc->w_large_pages = 1;
-	else
-		wc->w_large_pages = 0;
+	wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1);
+	wc->w_target_to = wc->w_target_from + len;
 
-	wc->w_write_data_page = cb;
-	wc->w_private = cb_priv;
+	if (alloc == 0)
+		return;
+
+	/*
+	 * Allocating write - we may have different boundaries based
+	 * on page size and cluster size.
+	 *
+	 * NOTE: We can no longer compute one value from the other as
+	 * the actual write length and user provided length may be
+	 * different.
+	 */
+
+	if (wc->w_large_pages) {
+		/*
+		 * We only care about the 1st and last cluster within
+		 * our range and whether they are holes or not. Either
+		 * value may be extended out to the start/end of a
+		 * newly allocated cluster.
+		 */
+		desc = &wc->w_desc[0];
+		if (desc->c_new)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							&wc->w_target_from,
+							NULL);
+
+		desc = &wc->w_desc[wc->w_clen - 1];
+		if (desc->c_new)
+			ocfs2_figure_cluster_boundaries(osb,
+							desc->c_cpos,
+							NULL,
+							&wc->w_target_to);
+	} else {
+		wc->w_target_from = 0;
+		wc->w_target_to = PAGE_CACHE_SIZE;
+	}
 }
 
-/*
- * Write a cluster to an inode. The cluster may not be allocated yet,
- * in which case it will be. This only exists for buffered writes -
- * O_DIRECT takes a more "traditional" path through the kernel.
- *
- * The caller is responsible for incrementing pos, written counts, etc
- *
- * For file systems that don't support sparse files, pre-allocation
- * and page zeroing up until cpos should be done prior to this
- * function call.
- *
- * Callers should be holding i_sem, and the rw cluster lock.
- *
- * Returns the number of user bytes written, or less than zero for
- * error.
- */
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
-				     size_t count, ocfs2_page_writer *actor,
-				     void *priv)
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page)
 {
-	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
-	ssize_t written = 0;
-	u32 phys;
-	struct inode *inode = file->f_mapping->host;
+	int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS;
+	unsigned int num_clusters = 0, clusters_to_alloc = 0;
+	u32 phys = 0;
+	struct ocfs2_write_ctxt *wc;
+	struct inode *inode = mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dinode *di;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
 	handle_t *handle;
-	struct ocfs2_write_ctxt wc;
+	struct ocfs2_write_cluster_desc *desc;
 
-	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
-
-	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh);
 	if (ret) {
 		mlog_errno(ret);
+		return ret;
+	}
+
+	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+		desc->c_cpos = wc->w_cpos + i;
+
+		if (num_clusters == 0) {
+			ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys,
+						 &num_clusters, NULL);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		} else if (phys) {
+			/*
+			 * Only increment phys if it doesn't describe
+			 * a hole.
+			 */
+			phys++;
+		}
+
+		desc->c_phys = phys;
+		if (phys == 0) {
+			desc->c_new = 1;
+			clusters_to_alloc++;
+		}
+
+		num_clusters--;
+	}
+
+	/*
+	 * We set w_target_from, w_target_to here so that
+	 * ocfs2_write_end() knows which range in the target page to
+	 * write out. An allocation requires that we write the entire
+	 * cluster range.
+	 */
+	if (clusters_to_alloc > 0) {
+		/*
+		 * XXX: We are stretching the limits of
+		 * ocfs2_lock_allocators(). It greately over-estimates
+		 * the work to be done.
+		 */
+		ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc,
+					    &data_ac, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		credits = ocfs2_calc_extend_credits(inode->i_sb, di,
+						    clusters_to_alloc);
+
+	}
+
+	ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc);
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
 		goto out;
 	}
-	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	wc->w_handle = handle;
 
 	/*
-	 * Take alloc sem here to prevent concurrent lookups. That way
-	 * the mapping, zeroing and tree manipulation within
-	 * ocfs2_write() will be safe against ->readpage(). This
-	 * should also serve to lock out allocation from a shared
-	 * writeable region.
+	 * We don't want this to fail in ocfs2_write_end(), so do it
+	 * here.
 	 */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
 
-	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+					 clusters_to_alloc, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_meta;
+		goto out_commit;
 	}
 
-	/* phys == 0 means that allocation is required. */
-	if (phys == 0) {
-		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+	for (i = 0; i < wc->w_clen; i++) {
+		desc = &wc->w_desc[i];
+
+		ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac,
+					  meta_ac, wc, desc->c_cpos, pos, len);
 		if (ret) {
 			mlog_errno(ret);
-			goto out_meta;
+			goto out_commit;
 		}
+	}
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	*pagep = wc->w_target_page;
+	*fsdata = wc;
+	return 0;
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out:
+	ocfs2_free_write_ctxt(wc);
+
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+	return ret;
+}
+
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned len, unsigned flags,
+		      struct page **pagep, void **fsdata)
+{
+	int ret;
+	struct buffer_head *di_bh = NULL;
+	struct inode *inode = mapping->host;
 
-		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
 	}
 
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	ret = ocfs2_data_lock(inode, 1);
 	if (ret) {
 		mlog_errno(ret);
-		goto out_meta;
+		goto out_fail;
 	}
 
-	handle = ocfs2_start_trans(osb, credits);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep,
+				       fsdata, di_bh, NULL);
+	if (ret) {
 		mlog_errno(ret);
-		goto out_data;
+		goto out_fail_data;
 	}
 
-	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
-			      meta_ac, &wc);
-	if (written < 0) {
-		ret = written;
-		mlog_errno(ret);
-		goto out_commit;
+	brelse(di_bh);
+
+	return 0;
+
+out_fail_data:
+	ocfs2_data_unlock(inode, 1);
+out_fail:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_meta_unlock(inode, 1);
+
+	return ret;
+}
+
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata)
+{
+	int i;
+	unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1);
+	struct inode *inode = mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_write_ctxt *wc = fsdata;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
+	handle_t *handle = wc->w_handle;
+	struct page *tmppage;
+
+	if (unlikely(copied < len)) {
+		if (!PageUptodate(wc->w_target_page))
+			copied = 0;
+
+		ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+				       start+len);
 	}
+	flush_dcache_page(wc->w_target_page);
+
+	for(i = 0; i < wc->w_num_pages; i++) {
+		tmppage = wc->w_pages[i];
+
+		if (tmppage == wc->w_target_page) {
+			from = wc->w_target_from;
+			to = wc->w_target_to;
+
+			BUG_ON(from > PAGE_CACHE_SIZE ||
+			       to > PAGE_CACHE_SIZE ||
+			       to < from);
+		} else {
+			/*
+			 * Pages adjacent to the target (if any) imply
+			 * a hole-filling write in which case we want
+			 * to flush their entire range.
+			 */
+			from = 0;
+			to = PAGE_CACHE_SIZE;
+		}
 
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
+		if (ocfs2_should_order_data(inode))
+			walk_page_buffers(wc->w_handle, page_buffers(tmppage),
+					  from, to, NULL,
+					  ocfs2_journal_dirty_data);
+
+		block_commit_write(tmppage, from, to);
 	}
 
-	pos += written;
+	pos += copied;
 	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
 		mark_inode_dirty(inode);
@@ -1283,28 +1494,28 @@ ssize_t ocfs2_buffered_write_cluster(str
 	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
 	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
 
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret)
-		mlog_errno(ret);
+	ocfs2_journal_dirty(handle, wc->w_di_bh);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
+	ocfs2_free_write_ctxt(wc);
 
-out_data:
-	ocfs2_data_unlock(inode, 1);
+	return copied;
+}
+
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned copied,
+		    struct page *page, void *fsdata)
+{
+	int ret;
+	struct inode *inode = mapping->host;
 
-out_meta:
+	ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata);
+
+	ocfs2_data_unlock(inode, 1);
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 	ocfs2_meta_unlock(inode, 1);
 
-out:
-	brelse(di_bh);
-	if (data_ac)
-		ocfs2_free_alloc_context(data_ac);
-	if (meta_ac)
-		ocfs2_free_alloc_context(meta_ac);
-
-	return written ? written : ret;
+	return ret;
 }
 
 const struct address_space_operations ocfs2_aops = {
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index 45821d4..389579b 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -42,57 +42,22 @@ int walk_page_buffers(	handle_t *handle,
 			int (*fn)(	handle_t *handle,
 					struct buffer_head *bh));
 
-struct ocfs2_write_ctxt;
-typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
-				u64 *, unsigned int *, unsigned int *);
+int ocfs2_write_begin(struct file *file, struct address_space *mapping,
+		      loff_t pos, unsigned len, unsigned flags,
+		      struct page **pagep, void **fsdata);
 
-ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
-				     size_t count, ocfs2_page_writer *actor,
-				     void *priv);
+int ocfs2_write_end(struct file *file, struct address_space *mapping,
+		    loff_t pos, unsigned len, unsigned copied,
+		    struct page *page, void *fsdata);
 
-struct ocfs2_write_ctxt {
-	size_t				w_count;
-	loff_t				w_pos;
-	u32				w_cpos;
-	unsigned int			w_finished_copy;
+int ocfs2_write_end_nolock(struct address_space *mapping,
+			   loff_t pos, unsigned len, unsigned copied,
+			   struct page *page, void *fsdata);
 
-	/* This is true if page_size > cluster_size */
-	unsigned int			w_large_pages;
-
-	/* Filler callback and private data */
-	ocfs2_page_writer		*w_write_data_page;
-	void				*w_private;
-
-	/* Only valid for the filler callback */
-	struct page			*w_this_page;
-	unsigned int			w_this_page_new;
-};
-
-struct ocfs2_buffered_write_priv {
-	char				*b_src_buf;
-	const struct iovec		*b_cur_iov; /* Current iovec */
-	size_t				b_cur_off; /* Offset in the
-						    * current iovec */
-};
-int ocfs2_map_and_write_user_data(struct inode *inode,
-				  struct ocfs2_write_ctxt *wc,
-				  u64 *p_blkno,
-				  unsigned int *ret_from,
-				  unsigned int *ret_to);
-
-struct ocfs2_splice_write_priv {
-	struct splice_desc		*s_sd;
-	struct pipe_buffer		*s_buf;
-	struct pipe_inode_info		*s_pipe;
-	/* Neither offset value is ever larger than one page */
-	unsigned int			s_offset;
-	unsigned int			s_buf_offset;
-};
-int ocfs2_map_and_write_splice_data(struct inode *inode,
-				    struct ocfs2_write_ctxt *wc,
-				    u64 *p_blkno,
-				    unsigned int *ret_from,
-				    unsigned int *ret_to);
+int ocfs2_write_begin_nolock(struct address_space *mapping,
+			     loff_t pos, unsigned len, unsigned flags,
+			     struct page **pagep, void **fsdata,
+			     struct buffer_head *di_bh, struct page *mmap_page);
 
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 9395b4f..e1b7332 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -326,8 +326,6 @@ static int ocfs2_truncate_file(struct in
 		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		   (unsigned long long)new_i_size);
 
-	truncate_inode_pages(inode->i_mapping, new_i_size);
-
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -362,16 +360,23 @@ static int ocfs2_truncate_file(struct in
 	if (new_i_size == le64_to_cpu(fe->i_size))
 		goto bail;
 
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	/* This forces other nodes to sync and drop their pages. Do
 	 * this even if we have a truncate without allocation change -
 	 * ocfs2 cluster sizes can be much greater than page size, so
 	 * we have to truncate them anyway.  */
 	status = ocfs2_data_lock(inode, 1);
 	if (status < 0) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 		mlog_errno(status);
 		goto bail;
 	}
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -398,6 +403,8 @@ static int ocfs2_truncate_file(struct in
 bail_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 bail:
 
 	mlog_exit(status);
@@ -994,6 +1001,13 @@ #define OCFS2_VALID_ATTRS (ATTR_ATIME | 
 		goto bail_unlock;
 	}
 
+	/*
+	 * This will intentionally not wind up calling vmtruncate(),
+	 * since all the work for a size change has been done above.
+	 * Otherwise, we could get into problems with truncate as
+	 * ip_alloc_sem is used there to protect against i_size
+	 * changes.
+	 */
 	status = inode_setattr(inode, attr);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1328,15 +1342,16 @@ ocfs2_set_next_iovec(const struct iovec 
 	*basep = base;
 }
 
-static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+static struct page * ocfs2_get_write_source(char **ret_src_buf,
 					    const struct iovec *cur_iov,
 					    size_t iov_offset)
 {
 	int ret;
-	char *buf;
+	char *buf = cur_iov->iov_base + iov_offset;
 	struct page *src_page = NULL;
+	unsigned long off;
 
-	buf = cur_iov->iov_base + iov_offset;
+	off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
 
 	if (!segment_eq(get_fs(), KERNEL_DS)) {
 		/*
@@ -1348,18 +1363,17 @@ static struct page * ocfs2_get_write_sou
 				     (unsigned long)buf & PAGE_CACHE_MASK, 1,
 				     0, 0, &src_page, NULL);
 		if (ret == 1)
-			bp->b_src_buf = kmap(src_page);
+			*ret_src_buf = kmap(src_page) + off;
 		else
 			src_page = ERR_PTR(-EFAULT);
 	} else {
-		bp->b_src_buf = buf;
+		*ret_src_buf = buf;
 	}
 
 	return src_page;
 }
 
-static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
-				   struct page *page)
+static void ocfs2_put_write_source(struct page *page)
 {
 	if (page) {
 		kunmap(page);
@@ -1375,10 +1389,12 @@ static ssize_t ocfs2_file_buffered_write
 {
 	int ret = 0;
 	ssize_t copied, total = 0;
-	size_t iov_offset = 0;
+	size_t iov_offset = 0, bytes;
+	loff_t pos;
 	const struct iovec *cur_iov = iov;
-	struct ocfs2_buffered_write_priv bp;
-	struct page *page;
+	struct page *user_page, *page;
+	char *buf, *dst;
+	void *fsdata;
 
 	/*
 	 * handle partial DIO write.  Adjust cur_iov if needed.
@@ -1386,21 +1402,39 @@ static ssize_t ocfs2_file_buffered_write
 	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
 
 	do {
-		bp.b_cur_off = iov_offset;
-		bp.b_cur_iov = cur_iov;
+		pos = *ppos;
 
-		page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
-		if (IS_ERR(page)) {
-			ret = PTR_ERR(page);
+		user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
+		if (IS_ERR(user_page)) {
+			ret = PTR_ERR(user_page);
 			goto out;
 		}
 
-		copied = ocfs2_buffered_write_cluster(file, *ppos, count,
-						      ocfs2_map_and_write_user_data,
-						      &bp);
+		/* Stay within our page boundaries */
+		bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
+			    (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
+		/* Stay within the vector boundary */
+		bytes = min(bytes,
+			    (unsigned long)(cur_iov->iov_len - iov_offset));
+		/* Stay within count */
+		bytes = min(bytes, (unsigned long)count);
+
+		page = NULL;
+		ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
+					&page, &fsdata);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-		ocfs2_put_write_source(&bp, page);
+		dst = kmap_atomic(page, KM_USER0);
+		memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
+		kunmap_atomic(dst, KM_USER0);
+		flush_dcache_page(page);
+		ocfs2_put_write_source(user_page);
 
+		copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
+					 bytes, page, fsdata);
 		if (copied < 0) {
 			mlog_errno(copied);
 			ret = copied;
@@ -1408,7 +1442,7 @@ static ssize_t ocfs2_file_buffered_write
 		}
 
 		total += copied;
-		*ppos = *ppos + copied;
+		*ppos = pos + copied;
 		count -= copied;
 
 		ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
@@ -1608,52 +1642,46 @@ static int ocfs2_splice_write_actor(stru
 				    struct pipe_buffer *buf,
 				    struct splice_desc *sd)
 {
-	int ret, count, total = 0;
+	int ret, count;
 	ssize_t copied = 0;
-	struct ocfs2_splice_write_priv sp;
+	struct file *file = sd->file;
+	unsigned int offset;
+	struct page *page = NULL;
+	void *fsdata;
+	char *src, *dst;
 
 	ret = buf->ops->pin(pipe, buf);
 	if (ret)
 		goto out;
 
-	sp.s_sd = sd;
-	sp.s_buf = buf;
-	sp.s_pipe = pipe;
-	sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
-	sp.s_buf_offset = buf->offset;
-
+	offset = sd->pos & ~PAGE_CACHE_MASK;
 	count = sd->len;
-	if (count + sp.s_offset > PAGE_CACHE_SIZE)
-		count = PAGE_CACHE_SIZE - sp.s_offset;
+	if (count + offset > PAGE_CACHE_SIZE)
+		count = PAGE_CACHE_SIZE - offset;
 
-	do {
-		/*
-		 * splice wants us to copy up to one page at a
-		 * time. For pagesize > cluster size, this means we
-		 * might enter ocfs2_buffered_write_cluster() more
-		 * than once, so keep track of our progress here.
-		 */
-		copied = ocfs2_buffered_write_cluster(sd->file,
-						      (loff_t)sd->pos + total,
-						      count,
-						      ocfs2_map_and_write_splice_data,
-						      &sp);
-		if (copied < 0) {
-			mlog_errno(copied);
-			ret = copied;
-			goto out;
-		}
+	ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
+				&page, &fsdata);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		count -= copied;
-		sp.s_offset += copied;
-		sp.s_buf_offset += copied;
-		total += copied;
-	} while (count);
+	src = buf->ops->map(pipe, buf, 1);
+	dst = kmap_atomic(page, KM_USER1);
+	memcpy(dst + offset, src + buf->offset, count);
+	kunmap_atomic(page, KM_USER1);
+	buf->ops->unmap(pipe, buf, src);
 
-	ret = 0;
+	copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
+				 page, fsdata);
+	if (copied < 0) {
+		mlog_errno(copied);
+		ret = copied;
+		goto out;
+	}
 out:
 
-	return total ? total : ret;
+	return copied ? copied : ret;
 }
 
 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 4dedd97..545f789 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -471,9 +471,6 @@ int ocfs2_reserve_local_alloc_bits(struc
 
 	mutex_lock(&local_alloc_inode->i_mutex);
 
-	ac->ac_inode = local_alloc_inode;
-	ac->ac_which = OCFS2_AC_USE_LOCAL;
-
 	if (osb->local_alloc_state != OCFS2_LA_ENABLED) {
 		status = -ENOSPC;
 		goto bail;
@@ -511,10 +508,14 @@ int ocfs2_reserve_local_alloc_bits(struc
 		}
 	}
 
+	ac->ac_inode = local_alloc_inode;
+	ac->ac_which = OCFS2_AC_USE_LOCAL;
 	get_bh(osb->local_alloc_bh);
 	ac->ac_bh = osb->local_alloc_bh;
 	status = 0;
 bail:
+	if (status < 0 && local_alloc_inode)
+		iput(local_alloc_inode);
 
 	mlog_exit(status);
 	return status;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index af01158..d79aa12 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -37,11 +37,29 @@ #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
+#include "aops.h"
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
 #include "mmap.h"
 
+static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset)
+{
+	/* The best way to deal with signals in the vm path is
+	 * to block them upfront, rather than allowing the
+	 * locking paths to return -ERESTARTSYS. */
+	sigfillset(blocked);
+
+	/* We should technically never get a bad return value
+	 * from sigprocmask */
+	return sigprocmask(SIG_BLOCK, blocked, oldset);
+}
+
+static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset)
+{
+	return sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
 static struct page *ocfs2_nopage(struct vm_area_struct * area,
 				 unsigned long address,
 				 int *type)
@@ -53,14 +71,7 @@ static struct page *ocfs2_nopage(struct 
 	mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address,
 		   type);
 
-	/* The best way to deal with signals in this path is
-	 * to block them upfront, rather than allowing the
-	 * locking paths to return -ERESTARTSYS. */
-	sigfillset(&blocked);
-
-	/* We should technically never get a bad ret return
-	 * from sigprocmask */
-	ret = sigprocmask(SIG_BLOCK, &blocked, &oldset);
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
@@ -68,7 +79,7 @@ static struct page *ocfs2_nopage(struct 
 
 	page = filemap_nopage(area, address, type);
 
-	ret = sigprocmask(SIG_SETMASK, &oldset, NULL);
+	ret = ocfs2_vm_op_unblock_sigs(&oldset);
 	if (ret < 0)
 		mlog_errno(ret);
 out:
@@ -76,28 +87,136 @@ out:
 	return page;
 }
 
-static struct vm_operations_struct ocfs2_file_vm_ops = {
-	.nopage = ocfs2_nopage,
-};
+static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh,
+				struct page *page)
+{
+	int ret;
+	struct address_space *mapping = inode->i_mapping;
+	loff_t pos = page->index << PAGE_CACHE_SHIFT;
+	unsigned int len = PAGE_CACHE_SIZE;
+	pgoff_t last_index;
+	struct page *locked_page = NULL;
+	void *fsdata;
+	loff_t size = i_size_read(inode);
 
-int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+	/*
+	 * Another node might have truncated while we were waiting on
+	 * cluster locks.
+	 */
+	last_index = size >> PAGE_CACHE_SHIFT;
+	if (page->index > last_index) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * The i_size check above doesn't catch the case where nodes
+	 * truncated and then re-extended the file. We'll re-check the
+	 * page mapping after taking the page lock inside of
+	 * ocfs2_write_begin_nolock().
+	 */
+	if (!PageUptodate(page) || page->mapping != inode->i_mapping) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/*
+	 * Call ocfs2_write_begin() and ocfs2_write_end() to take
+	 * advantage of the allocation code there. We pass a write
+	 * length of the whole page (chopped to i_size) to make sure
+	 * the whole thing is allocated.
+	 *
+	 * Since we know the page is up to date, we don't have to
+	 * worry about ocfs2_write_begin() skipping some buffer reads
+	 * because the "write" would invalidate their data.
+	 */
+	if (page->index == last_index)
+		len = size & ~PAGE_CACHE_MASK;
+
+	ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page,
+				       &fsdata, di_bh, page);
+	if (ret) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
+				     fsdata);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+	BUG_ON(ret != len);
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 {
-	int ret = 0, lock_level = 0;
-	struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	struct buffer_head *di_bh = NULL;
+	sigset_t blocked, oldset;
+	int ret, ret2;
+
+	ret = ocfs2_vm_op_block_sigs(&blocked, &oldset);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * The cluster locks taken will block a truncate from another
+	 * node. Taking the data lock will also ensure that we don't
+	 * attempt page truncation as part of a downconvert.
+	 */
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
 
 	/*
-	 * Only support shared writeable mmap for local mounts which
-	 * don't know about holes.
+	 * The alloc sem should be enough to serialize with
+	 * ocfs2_truncate_file() changing i_size as well as any thread
+	 * modifying the inode btree.
 	 */
-	if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
-	    ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
-	    ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
-		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
-		/* This is -EINVAL because generic_file_readonly_mmap
-		 * returns it in a similar situation. */
-		return -EINVAL;
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out_meta_unlock;
 	}
 
+	ret = __ocfs2_page_mkwrite(inode, di_bh, page);
+
+	ocfs2_data_unlock(inode, 1);
+
+out_meta_unlock:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	brelse(di_bh);
+	ocfs2_meta_unlock(inode, 1);
+
+out:
+	ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
+	if (ret2 < 0)
+		mlog_errno(ret2);
+
+	return ret;
+}
+
+static struct vm_operations_struct ocfs2_file_vm_ops = {
+	.nopage		= ocfs2_nopage,
+	.page_mkwrite	= ocfs2_page_mkwrite,
+};
+
+int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int ret = 0, lock_level = 0;
+
 	ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode,
 				    file->f_vfsmnt, &lock_level);
 	if (ret < 0) {