GIT 9f3a4fca2411fa8ef94b345830f72ff7b6fc1aa2 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git#ALL commit 454fe8ba1777cba0245f5539c3825e981d874708 Author: Mark Fasheh Date: Wed May 9 15:16:19 2007 -0700 ocfs2: shared writeable mmap Implement cluster consistent shared writeable mappings using the ->page_mkwrite() callback. Signed-off-by: Mark Fasheh commit fb50d3d4cfce5dd8a08b3182204deaeb54a18dfb Author: Mark Fasheh Date: Wed May 9 15:14:45 2007 -0700 ocfs2: factor out write aops into nolock variants ocfs2_mkwrite() will want this so that it can add some mmap specific checks before asking for a write. Signed-off-by: Mark Fasheh commit f86eb69318843cd36502ecef9b9e2d0c8c76cef0 Author: Mark Fasheh Date: Tue May 8 17:47:32 2007 -0700 ocfs2: rework ocfs2_buffered_write_cluster() Use some ideas from the new-aops patch series and turn ocfs2_buffered_write_cluster() into a 2 stage operation with the caller copying data in between. The code now understands multiple cluster writes as a result of having to deal with a full page write for greater than 4k pages. This sets us up to easily call into the write path during ->page_mkwrite(). Signed-off-by: Mark Fasheh commit ecd0341ea73aed4c1663624f1f5ce819c288a5de Author: Mark Fasheh Date: Wed May 9 13:40:18 2007 -0700 ocfs2: take ip_alloc_sem during entire truncate Use of the alloc sem during truncate was too narrow - we want to protect the i_size change and page truncation against mmap now. 
Signed-off-by: Mark Fasheh commit c703282adf4ad7d1ddbb1adf09ee2f59976bbc12 Author: Christoph Hellwig Date: Thu May 17 16:03:13 2007 +0200 [PATCH] ocfs2: use list_for_each_entry where benefical Signed-off-by: Christoph Hellwig Signed-off-by: Mark Fasheh Signed-off-by: Andrew Morton --- fs/ocfs2/alloc.c | 3 fs/ocfs2/aops.c | 930 +++++++++++++++++++++-------------- fs/ocfs2/aops.h | 67 -- fs/ocfs2/cluster/tcp.c | 13 fs/ocfs2/dlm/dlmmaster.c | 40 - fs/ocfs2/dlm/dlmrecovery.c | 77 -- fs/ocfs2/dlmglue.c | 6 fs/ocfs2/extent_map.c | 10 fs/ocfs2/file.c | 142 +++-- fs/ocfs2/journal.c | 6 fs/ocfs2/mmap.c | 165 +++++- 11 files changed, 859 insertions(+), 600 deletions(-) diff -puN fs/ocfs2/alloc.c~git-ocfs2 fs/ocfs2/alloc.c --- a/fs/ocfs2/alloc.c~git-ocfs2 +++ a/fs/ocfs2/alloc.c @@ -3631,8 +3631,6 @@ int ocfs2_commit_truncate(struct ocfs2_s mlog_entry_void(); - down_write(&OCFS2_I(inode)->ip_alloc_sem); - new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); @@ -3754,7 +3752,6 @@ start: goto start; bail: - up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_schedule_truncate_log_flush(osb, 1); diff -puN fs/ocfs2/aops.c~git-ocfs2 fs/ocfs2/aops.c --- a/fs/ocfs2/aops.c~git-ocfs2 +++ a/fs/ocfs2/aops.c @@ -684,6 +684,8 @@ int ocfs2_map_page_blocks(struct page *p bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; + clear_buffer_new(bh); + /* * Ignore blocks outside of our i/o range - * they may belong to unallocated clusters. @@ -698,9 +700,8 @@ int ocfs2_map_page_blocks(struct page *p * For an allocating write with cluster size >= page * size, we always write the entire page. 
*/ - - if (buffer_new(bh)) - clear_buffer_new(bh); + if (new) + set_buffer_new(bh); if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); @@ -761,217 +762,234 @@ next_bh: return ret; } +#if (PAGE_CACHE_SIZE >= OCFS2_MAX_CLUSTERSIZE) +#define OCFS2_MAX_CTXT_PAGES 1 +#else +#define OCFS2_MAX_CTXT_PAGES (OCFS2_MAX_CLUSTERSIZE / PAGE_CACHE_SIZE) +#endif + +#define OCFS2_MAX_CLUSTERS_PER_PAGE (PAGE_CACHE_SIZE / OCFS2_MIN_CLUSTERSIZE) + /* - * This will copy user data from the buffer page in the splice - * context. - * - * For now, we ignore SPLICE_F_MOVE as that would require some extra - * communication out all the way to ocfs2_write(). + * Describe the state of a single cluster to be written to. */ -int ocfs2_map_and_write_splice_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) -{ - int ret; - unsigned int to, from, cluster_start, cluster_end; - char *src, *dst; - struct ocfs2_splice_write_priv *sp = wc->w_private; - struct pipe_buffer *buf = sp->s_buf; - unsigned long bytes, src_from; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); +struct ocfs2_write_cluster_desc { + u32 c_cpos; + u32 c_phys; + /* + * Give this a unique field because c_phys eventually gets + * filled. + */ + unsigned c_new; +}; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); +struct ocfs2_write_ctxt { + /* Logical cluster position / len of write */ + u32 w_cpos; + u32 w_clen; - from = sp->s_offset; - src_from = sp->s_buf_offset; - bytes = wc->w_count; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; - if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); - } - to = from + bytes; + /* + * This is true if page_size > cluster_size. 
+ * + * It triggers a set of special cases during write which might + * have to deal with allocating writes to partial pages. + */ + unsigned int w_large_pages; - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; + /* + * Pages involved in this write. + * + * w_target_page is the page being written to by the user. + * + * w_pages is an array of pages which always contains + * w_target_page, and in the case of an allocating write with + * page_size < cluster size, it will contain zero'd and mapped + * pages adjacent to w_target_page which need to be written + * out in so that future reads from that region will get + * zero's. + */ + struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; + unsigned int w_num_pages; + struct page *w_target_page; + + /* + * ocfs2_write_end() uses this to know what the real range to + * write in the target should be. 
+ */ + unsigned int w_target_from; + unsigned int w_target_to; + + /* + * We could use journal_current_handle() but this is cleaner, + * IMHO -Mark + */ + handle_t *w_handle; + + struct buffer_head *w_di_bh; +}; + +static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) +{ + int i; + + for(i = 0; i < wc->w_num_pages; i++) { + if (wc->w_pages[i] == NULL) + continue; + + unlock_page(wc->w_pages[i]); + mark_page_accessed(wc->w_pages[i]); + page_cache_release(wc->w_pages[i]); } - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from > osb->s_clustersize); - BUG_ON(to > osb->s_clustersize); - - src = buf->ops->map(sp->s_pipe, buf, 1); - dst = kmap_atomic(wc->w_this_page, KM_USER1); - memcpy(dst + from, src + src_from, bytes); - kunmap_atomic(wc->w_this_page, KM_USER1); - buf->ops->unmap(sp->s_pipe, buf, src); + brelse(wc->w_di_bh); + kfree(wc); +} - wc->w_finished_copy = 1; +static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, + struct ocfs2_super *osb, loff_t pos, + unsigned len, struct buffer_head *di_bh) +{ + struct ocfs2_write_ctxt *wc; - *ret_from = from; - *ret_to = to; -out: + wc = kzalloc(sizeof(struct ocfs2_write_ctxt), GFP_NOFS); + if (!wc) + return -ENOMEM; + + wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_clen = ocfs2_clusters_for_bytes(osb->sb, len); + get_bh(di_bh); + wc->w_di_bh = di_bh; + + if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) + wc->w_large_pages = 1; + else + wc->w_large_pages = 0; - return bytes ? (unsigned int)bytes : ret; + *wcp = wc; + + return 0; } /* - * This will copy user data from the iovec in the buffered write - * context. + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. 
*/ -int ocfs2_map_and_write_user_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, u64 *p_blkno, - unsigned int *ret_from, unsigned int *ret_to) +static void ocfs2_zero_new_buffers(struct page *page, unsigned from, unsigned to) { - int ret; - unsigned int to, from, cluster_start, cluster_end; - unsigned long bytes, src_from; - char *dst; - struct ocfs2_buffered_write_priv *bp = wc->w_private; - const struct iovec *cur_iov = bp->b_cur_iov; - char __user *buf; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int block_start, block_end; + struct buffer_head *head, *bh; - ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, - &cluster_end); + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; - buf = cur_iov->iov_base + bp->b_cur_off; - src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; + bh = head = page_buffers(page); + block_start = 0; + do { + block_end = block_start + bh->b_size; - from = wc->w_pos & (PAGE_CACHE_SIZE - 1); + if (buffer_new(bh)) { + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + void *kaddr; + + start = max(from, block_start); + end = min(to, block_end); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+start, 0, end - start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } + } - /* - * This is a lot of comparisons, but it reads quite - * easily, which is important here. - */ - /* Stay within the src page */ - bytes = PAGE_SIZE - src_from; - /* Stay within the vector */ - bytes = min(bytes, - (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); - /* Stay within count */ - bytes = min(bytes, (unsigned long)wc->w_count); - /* - * For clustersize > page size, just stay within - * target page, otherwise we have to calculate pos - * within the cluster and obey the rightmost - * boundary. 
- */ - if (wc->w_large_pages) { - /* - * For cluster size < page size, we have to - * calculate pos within the cluster and obey - * the rightmost boundary. - */ - bytes = min(bytes, (unsigned long)(osb->s_clustersize - - (wc->w_pos & (osb->s_clustersize - 1)))); - } else { - /* - * cluster size > page size is the most common - * case - we just stay within the target page - * boundary. - */ - bytes = min(bytes, PAGE_CACHE_SIZE - from); - } + block_start = block_end; + bh = bh->b_this_page; + } while (bh != head); +} - to = from + bytes; +/* + * Only called when we have a failure during allocating write to write + * zeros to the newly allocated region. + */ +static void ocfs2_write_failure(struct inode *inode, + struct ocfs2_write_ctxt *wc, + loff_t user_pos, unsigned user_len) +{ + int i; + unsigned from, to; + struct page *tmppage; - if (wc->w_this_page_new) - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - cluster_start, cluster_end, 1); - else - ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, - from, to, 0); - if (ret) { - mlog_errno(ret); - goto out; + from = user_pos & (PAGE_CACHE_SIZE - 1); /* page offset, not file pos */ + ocfs2_zero_new_buffers(wc->w_target_page, from, from + user_len); + if (wc->w_large_pages) { + from = wc->w_target_from; + to = wc->w_target_to; + } else { + from = 0; + to = PAGE_CACHE_SIZE; } - BUG_ON(from > PAGE_CACHE_SIZE); - BUG_ON(to > PAGE_CACHE_SIZE); - BUG_ON(from > osb->s_clustersize); - BUG_ON(to > osb->s_clustersize); - - dst = kmap(wc->w_this_page); - memcpy(dst + from, bp->b_src_buf + src_from, bytes); - kunmap(wc->w_this_page); - - /* - * XXX: This is slow, but simple. The caller of - * ocfs2_buffered_write_cluster() is responsible for - * passing through the iovecs, so it's difficult to - * predict what our next step is in here after our - * initial write. A future version should be pushing - * that iovec manipulation further down.
- * - * By setting this, we indicate that a copy from user - * data was done, and subsequent calls for this - * cluster will skip copying more data. - */ - wc->w_finished_copy = 1; + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; - *ret_from = from; - *ret_to = to; -out: + if (ocfs2_should_order_data(inode)) + walk_page_buffers(wc->w_handle, page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); - return bytes ? (unsigned int)bytes : ret; + block_commit_write(tmppage, from, to); + } } -/* - * Map, fill and write a page to disk. - * - * The work of copying data is done via callback. Newly allocated - * pages which don't take user data will be zero'd (set 'new' to - * indicate an allocating write) - * - * Returns a negative error code or the number of bytes copied into - * the page. - */ -static int ocfs2_write_data_page(struct inode *inode, handle_t *handle, - u64 *p_blkno, struct page *page, - struct ocfs2_write_ctxt *wc, int new) +static int ocfs2_prepare_page_for_write(struct inode *inode, u64 *p_blkno, + struct ocfs2_write_ctxt *wc, + struct page *page, u32 cpos, + loff_t user_pos, unsigned user_len, + int new) { - int ret, copied = 0; - unsigned int from = 0, to = 0; + int ret; + unsigned int map_from = 0, map_to = 0; unsigned int cluster_start, cluster_end; - unsigned int zero_from = 0, zero_to = 0; + unsigned int user_data_from = 0, user_data_to = 0; - ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, + ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), cpos, &cluster_start, &cluster_end); - if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index - && !wc->w_finished_copy) { - - wc->w_this_page = page; - wc->w_this_page_new = new; - ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); - if (ret < 0) { + if (page == wc->w_target_page) { + map_from = user_pos & (PAGE_CACHE_SIZE - 1); + map_to = map_from + user_len; + + if (new) + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + cluster_start, 
cluster_end, + new); + else + ret = ocfs2_map_page_blocks(page, p_blkno, inode, + map_from, map_to, new); + if (ret) { mlog_errno(ret); goto out; } - copied = ret; - - zero_from = from; - zero_to = to; + user_data_from = map_from; + user_data_to = map_to; if (new) { - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; } + + wc->w_target_from = map_from; + wc->w_target_to = map_to; } else { /* * If we haven't allocated the new page yet, we @@ -980,11 +998,11 @@ static int ocfs2_write_data_page(struct */ BUG_ON(!new); - from = cluster_start; - to = cluster_end; + map_from = cluster_start; + map_to = cluster_end; ret = ocfs2_map_page_blocks(page, p_blkno, inode, - cluster_start, cluster_end, 1); + cluster_start, cluster_end, new); if (ret) { mlog_errno(ret); goto out; @@ -1003,108 +1021,110 @@ static int ocfs2_write_data_page(struct */ if (new && !PageUptodate(page)) ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), - wc->w_cpos, zero_from, zero_to); + cpos, user_data_from, user_data_to); flush_dcache_page(page); - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); - if (ret < 0) - mlog_errno(ret); - } - - /* - * We don't use generic_commit_write() because we need to - * handle our own i_size update. - */ - ret = block_commit_write(page, from, to); - if (ret) - mlog_errno(ret); out: - - return copied ? copied : ret; + return ret; } /* - * Do the actual write of some data into an inode. Optionally allocate - * in order to fulfill the write. - * - * cpos is the logical cluster offset within the file to write at - * - * 'phys' is the physical mapping of that offset. a 'phys' value of - * zero indicates that allocation is required. In this case, data_ac - * and meta_ac should be valid (meta_ac can be null if metadata - * allocation isn't required). + * This function will only grab one clusters worth of pages. 
*/ -static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, - struct buffer_head *di_bh, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_write_ctxt *wc) -{ - int ret, i, numpages = 1, new; - unsigned int copied = 0; - u32 tmp_pos; - u64 v_blkno, p_blkno; - struct address_space *mapping = file->f_mapping; +static int ocfs2_grab_pages_for_write(struct address_space *mapping, + struct ocfs2_write_ctxt *wc, + u32 cpos, loff_t user_pos, int new, + struct page *mmap_page) +{ + int ret = 0, i; + unsigned long start, target_index, index; struct inode *inode = mapping->host; - unsigned long index, start; - struct page **cpages; - new = phys == 0 ? 1 : 0; + target_index = user_pos >> PAGE_CACHE_SHIFT; /* * Figure out how many pages we'll be manipulating here. For * non allocating write, we just change the one * page. Otherwise, we'll need a whole clusters worth. */ - if (new) - numpages = ocfs2_pages_per_cluster(inode->i_sb); - - cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); - if (!cpages) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - - /* - * Fill our page array first. That way we've grabbed enough so - * that we can zero and flush if we error after adding the - * extent. 
- */ if (new) { - start = ocfs2_align_clusters_to_page_index(inode->i_sb, - wc->w_cpos); - v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); + wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb); + start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos); } else { - start = wc->w_pos >> PAGE_CACHE_SHIFT; - v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; + wc->w_num_pages = 1; + start = target_index; } - for(i = 0; i < numpages; i++) { + for(i = 0; i < wc->w_num_pages; i++) { index = start + i; - cpages[i] = find_or_create_page(mapping, index, GFP_NOFS); - if (!cpages[i]) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + if (index == target_index && mmap_page) { + /* + * ocfs2_pagemkwrite() is a little different + * and wants us to directly use the page + * passed in. + */ + lock_page(mmap_page); + + if (mmap_page->mapping != mapping) { + unlock_page(mmap_page); + /* + * Sanity check - the locking in + * ocfs2_pagemkwrite() should ensure + * that this code doesn't trigger. + */ + ret = -EINVAL; + mlog_errno(ret); + goto out; + } + + page_cache_get(mmap_page); + wc->w_pages[i] = mmap_page; + } else { + wc->w_pages[i] = find_or_create_page(mapping, index, + GFP_NOFS); + if (!wc->w_pages[i]) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } } + + if (index == target_index) + wc->w_target_page = wc->w_pages[i]; } +out: + return ret; +} + +/* + * Prepare a single cluster for write one cluster into the file. + */ +static int ocfs2_write_cluster(struct address_space *mapping, + u32 phys, struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_write_ctxt *wc, u32 cpos, + loff_t user_pos, unsigned user_len) +{ + int ret, i, new; + u64 v_blkno, p_blkno; + struct inode *inode = mapping->host; + + new = phys == 0 ? 1 : 0; if (new) { + u32 tmp_pos; + /* * This is safe to call with the page locks - it won't take * any additional semaphores or cluster locks. 
*/ - tmp_pos = wc->w_cpos; + tmp_pos = cpos; ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, di_bh, handle, - data_ac, meta_ac, NULL); + &tmp_pos, 1, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1121,159 +1141,349 @@ static ssize_t ocfs2_write(struct file * mlog_errno(ret); goto out; } + + v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, cpos); + } else { + v_blkno = user_pos >> inode->i_sb->s_blocksize_bits; } + /* + * The only reason this should fail is due to an inability to + * find the extent added. + */ ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, NULL); if (ret < 0) { - - /* - * XXX: Should we go readonly here? - */ - - mlog_errno(ret); + ocfs2_error(inode->i_sb, "Corrupting extend for inode %llu, " + "at logical block %llu", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_blkno); goto out; } BUG_ON(p_blkno == 0); - for(i = 0; i < numpages; i++) { - ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], - wc, new); - if (ret < 0) { - mlog_errno(ret); - goto out; - } + for(i = 0; i < wc->w_num_pages; i++) { + int tmpret; - copied += ret; + tmpret = ocfs2_prepare_page_for_write(inode, &p_blkno, wc, + wc->w_pages[i], cpos, + user_pos, user_len, new); + if (tmpret) { + mlog_errno(tmpret); + if (ret == 0) + ret = tmpret; /* remember the first failure */ + } } + /* + * We only have cleanup to do in case of allocating write. + */ + if (ret && new) + ocfs2_write_failure(inode, wc, user_pos, user_len); + out: - for(i = 0; i < numpages; i++) { - unlock_page(cpages[i]); - mark_page_accessed(cpages[i]); - page_cache_release(cpages[i]); - } - kfree(cpages); - return copied ?
copied : ret; + return ret; } -static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, - struct ocfs2_super *osb, loff_t pos, - size_t count, ocfs2_page_writer *cb, - void *cb_priv) +/* + * ocfs2_write_end() wants to know which parts of the target page it + * should complete the write on. It's easiest to compute them ahead of + * time when a more complete view of the write is available. + */ +static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, + struct ocfs2_write_ctxt *wc, + loff_t pos, unsigned len, int alloc) { - wc->w_count = count; - wc->w_pos = pos; - wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; - wc->w_finished_copy = 0; + struct ocfs2_write_cluster_desc *desc; - if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) - wc->w_large_pages = 1; - else - wc->w_large_pages = 0; + wc->w_target_from = pos & (PAGE_CACHE_SIZE - 1); + wc->w_target_to = wc->w_target_from + len; + + if (alloc == 0) + return; + + /* + * Allocating write - we may have different boundaries based + * on page size and cluster size. + * + * NOTE: We can no longer compute one value from the other as + * the actual write length and user provided length may be + * different. + */ - wc->w_write_data_page = cb; - wc->w_private = cb_priv; + if (wc->w_large_pages) { + /* + * We only care about the 1st and last cluster within + * our range and whether they are holes or not. Either + * value may be extended out to the start/end of a + * newly allocated cluster. + */ + desc = &wc->w_desc[0]; + if (desc->c_new) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + &wc->w_target_from, + NULL); + + desc = &wc->w_desc[wc->w_clen - 1]; + if (desc->c_new) + ocfs2_figure_cluster_boundaries(osb, + desc->c_cpos, + NULL, + &wc->w_target_to); + } else { + wc->w_target_from = 0; + wc->w_target_to = PAGE_CACHE_SIZE; + } } -/* - * Write a cluster to an inode. The cluster may not be allocated yet, - * in which case it will be. 
This only exists for buffered writes - - * O_DIRECT takes a more "traditional" path through the kernel. - * - * The caller is responsible for incrementing pos, written counts, etc - * - * For file systems that don't support sparse files, pre-allocation - * and page zeroing up until cpos should be done prior to this - * function call. - * - * Callers should be holding i_sem, and the rw cluster lock. - * - * Returns the number of user bytes written, or less than zero for - * error. - */ -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, - size_t count, ocfs2_page_writer *actor, - void *priv) -{ - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; - ssize_t written = 0; - u32 phys; - struct inode *inode = file->f_mapping->host; +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page) +{ + int ret, i, credits = OCFS2_INODE_UPDATE_CREDITS; + unsigned int num_clusters = 0, clusters_to_alloc = 0; + u32 phys = 0; + struct ocfs2_write_ctxt *wc; + struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; - struct ocfs2_write_ctxt wc; + struct ocfs2_write_cluster_desc *desc; - ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); - - ret = ocfs2_meta_lock(inode, &di_bh, 1); + ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { mlog_errno(ret); - goto out; + return ret; } - di = (struct ocfs2_dinode *)di_bh->b_data; - /* - * Take alloc sem here to prevent concurrent lookups. That way - * the mapping, zeroing and tree manipulation within - * ocfs2_write() will be safe against ->readpage(). This - * should also serve to lock out allocation from a shared - * writeable region. 
- */ - down_write(&OCFS2_I(inode)->ip_alloc_sem); + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; - ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); - if (ret) { - mlog_errno(ret); - goto out_meta; + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + desc->c_cpos = wc->w_cpos + i; + + if (num_clusters == 0) { + ret = ocfs2_get_clusters(inode, desc->c_cpos, &phys, + &num_clusters, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + } else if (phys) { + /* + * Only increment phys if it doesn't describe + * a hole. + */ + phys++; + } + + desc->c_phys = phys; + if (phys == 0) { + desc->c_new = 1; + clusters_to_alloc++; + } + + num_clusters--; } - /* phys == 0 means that allocation is required. */ - if (phys == 0) { - ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); + /* + * We set w_target_from, w_target_to here so that + * ocfs2_write_end() knows which range in the target page to + * write out. An allocation requires that we write the entire + * cluster range. + */ + if (clusters_to_alloc > 0) { + /* + * XXX: We are stretching the limits of + * ocfs2_lock_allocators(). It greately over-estimates + * the work to be done. + */ + ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); - goto out_meta; + goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); - } + credits = ocfs2_calc_extend_credits(inode->i_sb, di, + clusters_to_alloc); - ret = ocfs2_data_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto out_meta; } + ocfs2_set_target_boundaries(osb, wc, pos, len, clusters_to_alloc); + handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); - goto out_data; + goto out; } - written = ocfs2_write(file, phys, handle, di_bh, data_ac, - meta_ac, &wc); - if (written < 0) { - ret = written; + wc->w_handle = handle; + + /* + * We don't want this to fail in ocfs2_write_end(), so do it + * here. 
+ */ + ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { mlog_errno(ret); goto out_commit; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* + * Fill our page array first. That way we've grabbed enough so + * that we can zero and flush if we error after adding the + * extent. + */ + ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, + clusters_to_alloc, mmap_page); if (ret) { mlog_errno(ret); goto out_commit; } - pos += written; + for (i = 0; i < wc->w_clen; i++) { + desc = &wc->w_desc[i]; + + ret = ocfs2_write_cluster(mapping, desc->c_phys, data_ac, + meta_ac, wc, desc->c_cpos, pos, len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + *pagep = wc->w_target_page; + *fsdata = wc; + return 0; +out_commit: + ocfs2_commit_trans(osb, handle); + +out: + ocfs2_free_write_ctxt(wc); + + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + return ret; +} + +int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + int ret; + struct buffer_head *di_bh = NULL; + struct inode *inode = mapping->host; + + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* + * Take alloc sem here to prevent concurrent lookups. That way + * the mapping, zeroing and tree manipulation within + * ocfs2_write() will be safe against ->readpage(). This + * should also serve to lock out allocation from a shared + * writeable region. 
+ */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_data_lock(inode, 1); + if (ret) { + mlog_errno(ret); + goto out_fail; + } + + ret = ocfs2_write_begin_nolock(mapping, pos, len, flags, pagep, + fsdata, di_bh, NULL); + if (ret) { + mlog_errno(ret); + goto out_fail_data; + } + + brelse(di_bh); + + return 0; + +out_fail_data: + ocfs2_data_unlock(inode, 1); +out_fail: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_meta_unlock(inode, 1); + + return ret; +} + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int i; + unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_write_ctxt *wc = fsdata; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle_t *handle = wc->w_handle; + struct page *tmppage; + + if (unlikely(copied < len)) { + if (!PageUptodate(wc->w_target_page)) + copied = 0; + + ocfs2_zero_new_buffers(wc->w_target_page, start+copied, + start+len); + } + flush_dcache_page(wc->w_target_page); + + for(i = 0; i < wc->w_num_pages; i++) { + tmppage = wc->w_pages[i]; + + if (tmppage == wc->w_target_page) { + from = wc->w_target_from; + to = wc->w_target_to; + + BUG_ON(from > PAGE_CACHE_SIZE || + to > PAGE_CACHE_SIZE || + to < from); + } else { + /* + * Pages adjacent to the target (if any) imply + * a hole-filling write in which case we want + * to flush their entire range. 
+ */ + from = 0; + to = PAGE_CACHE_SIZE; + } + + if (ocfs2_should_order_data(inode)) + walk_page_buffers(wc->w_handle, page_buffers(tmppage), + from, to, NULL, + ocfs2_journal_dirty_data); + + block_commit_write(tmppage, from, to); + } + + pos += copied; if (pos > inode->i_size) { i_size_write(inode, pos); mark_inode_dirty(inode); @@ -1284,28 +1494,28 @@ ssize_t ocfs2_buffered_write_cluster(str di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) - mlog_errno(ret); + ocfs2_journal_dirty(handle, wc->w_di_bh); -out_commit: ocfs2_commit_trans(osb, handle); + ocfs2_free_write_ctxt(wc); -out_data: - ocfs2_data_unlock(inode, 1); + return copied; +} + +int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + struct inode *inode = mapping->host; + + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); -out_meta: + ocfs2_data_unlock(inode, 1); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_meta_unlock(inode, 1); -out: - brelse(di_bh); - if (data_ac) - ocfs2_free_alloc_context(data_ac); - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - - return written ? 
written : ret; + return ret; } const struct address_space_operations ocfs2_aops = { diff -puN fs/ocfs2/aops.h~git-ocfs2 fs/ocfs2/aops.h --- a/fs/ocfs2/aops.h~git-ocfs2 +++ a/fs/ocfs2/aops.h @@ -42,57 +42,22 @@ int walk_page_buffers( handle_t *handle, int (*fn)( handle_t *handle, struct buffer_head *bh)); -struct ocfs2_write_ctxt; -typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *, - u64 *, unsigned int *, unsigned int *); - -ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, - size_t count, ocfs2_page_writer *actor, - void *priv); - -struct ocfs2_write_ctxt { - size_t w_count; - loff_t w_pos; - u32 w_cpos; - unsigned int w_finished_copy; - - /* This is true if page_size > cluster_size */ - unsigned int w_large_pages; - - /* Filler callback and private data */ - ocfs2_page_writer *w_write_data_page; - void *w_private; - - /* Only valid for the filler callback */ - struct page *w_this_page; - unsigned int w_this_page_new; -}; - -struct ocfs2_buffered_write_priv { - char *b_src_buf; - const struct iovec *b_cur_iov; /* Current iovec */ - size_t b_cur_off; /* Offset in the - * current iovec */ -}; -int ocfs2_map_and_write_user_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, - u64 *p_blkno, - unsigned int *ret_from, - unsigned int *ret_to); - -struct ocfs2_splice_write_priv { - struct splice_desc *s_sd; - struct pipe_buffer *s_buf; - struct pipe_inode_info *s_pipe; - /* Neither offset value is ever larger than one page */ - unsigned int s_offset; - unsigned int s_buf_offset; -}; -int ocfs2_map_and_write_splice_data(struct inode *inode, - struct ocfs2_write_ctxt *wc, - u64 *p_blkno, - unsigned int *ret_from, - unsigned int *ret_to); +int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + +int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, 
void *fsdata); + +int ocfs2_write_end_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + +int ocfs2_write_begin_nolock(struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + struct buffer_head *di_bh, struct page *mmap_page); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ diff -puN fs/ocfs2/cluster/tcp.c~git-ocfs2 fs/ocfs2/cluster/tcp.c --- a/fs/ocfs2/cluster/tcp.c~git-ocfs2 +++ a/fs/ocfs2/cluster/tcp.c @@ -261,14 +261,12 @@ out: static void o2net_complete_nodes_nsw(struct o2net_node *nn) { - struct list_head *iter, *tmp; + struct o2net_status_wait *nsw, *tmp; unsigned int num_kills = 0; - struct o2net_status_wait *nsw; assert_spin_locked(&nn->nn_lock); - list_for_each_safe(iter, tmp, &nn->nn_status_list) { - nsw = list_entry(iter, struct o2net_status_wait, ns_node_item); + list_for_each_entry_safe(nsw, tmp, &nn->nn_status_list, ns_node_item) { o2net_complete_nsw_locked(nn, nsw, O2NET_ERR_DIED, 0); num_kills++; } @@ -764,13 +762,10 @@ EXPORT_SYMBOL_GPL(o2net_register_handler void o2net_unregister_handler_list(struct list_head *list) { - struct list_head *pos, *n; - struct o2net_msg_handler *nmh; + struct o2net_msg_handler *nmh, *n; write_lock(&o2net_handler_lock); - list_for_each_safe(pos, n, list) { - nmh = list_entry(pos, struct o2net_msg_handler, - nh_unregister_item); + list_for_each_entry_safe(nmh, n, list, nh_unregister_item) { mlog(ML_TCP, "unregistering handler func %p type %u key %08x\n", nmh->nh_func, nmh->nh_msg_type, nmh->nh_key); rb_erase(&nmh->nh_node, &o2net_handler_tree); diff -puN fs/ocfs2/dlm/dlmmaster.c~git-ocfs2 fs/ocfs2/dlm/dlmmaster.c --- a/fs/ocfs2/dlm/dlmmaster.c~git-ocfs2 +++ a/fs/ocfs2/dlm/dlmmaster.c @@ -192,25 +192,20 @@ static void dlm_print_one_mle(struct dlm static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; - struct list_head *iter; 
mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); spin_lock(&dlm->master_lock); - list_for_each(iter, &dlm->master_list) { - mle = list_entry(iter, struct dlm_master_list_entry, list); + list_for_each_entry(mle, &dlm->master_list, list) dlm_print_one_mle(mle); - } spin_unlock(&dlm->master_lock); } int dlm_dump_all_mles(const char __user *data, unsigned int len) { - struct list_head *iter; struct dlm_ctxt *dlm; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - dlm = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(dlm, &dlm_domains, list) { mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name); dlm_dump_mles(dlm); } @@ -454,12 +449,10 @@ static int dlm_find_mle(struct dlm_ctxt char *name, unsigned int namelen) { struct dlm_master_list_entry *tmpmle; - struct list_head *iter; assert_spin_locked(&dlm->master_lock); - list_for_each(iter, &dlm->master_list) { - tmpmle = list_entry(iter, struct dlm_master_list_entry, list); + list_for_each_entry(tmpmle, &dlm->master_list, list) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -472,13 +465,10 @@ static int dlm_find_mle(struct dlm_ctxt void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up) { struct dlm_master_list_entry *mle; - struct list_head *iter; assert_spin_locked(&dlm->spinlock); - list_for_each(iter, &dlm->mle_hb_events) { - mle = list_entry(iter, struct dlm_master_list_entry, - hb_events); + list_for_each_entry(mle, &dlm->mle_hb_events, hb_events) { if (node_up) dlm_mle_node_up(dlm, mle, NULL, idx); else @@ -2434,7 +2424,7 @@ static int dlm_is_lockres_migrateable(st int ret; int i; int count = 0; - struct list_head *queue, *iter; + struct list_head *queue; struct dlm_lock *lock; assert_spin_locked(&res->spinlock); @@ -2453,8 +2443,7 @@ static int dlm_is_lockres_migrateable(st ret = 0; queue = &res->granted; for (i = 0; i < 3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct 
dlm_lock, list); + list_for_each_entry(lock, queue, list) { ++count; if (lock->ml.node == dlm->node_num) { mlog(0, "found a lock owned by this node still " @@ -2923,18 +2912,16 @@ again: static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { - struct list_head *iter, *iter2; struct list_head *queue = &res->granted; int i, bit; - struct dlm_lock *lock; + struct dlm_lock *lock, *next; assert_spin_locked(&res->spinlock); BUG_ON(res->owner == dlm->node_num); for (i=0; i<3; i++) { - list_for_each_safe(iter, iter2, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, queue, list) { if (lock->ml.node != dlm->node_num) { mlog(0, "putting lock for node %u\n", lock->ml.node); @@ -2976,7 +2963,6 @@ static u8 dlm_pick_migration_target(stru { int i; struct list_head *queue = &res->granted; - struct list_head *iter; struct dlm_lock *lock; int nodenum; @@ -2984,10 +2970,9 @@ static u8 dlm_pick_migration_target(stru spin_lock(&res->spinlock); for (i=0; i<3; i++) { - list_for_each(iter, queue) { + list_for_each_entry(lock, queue, list) { /* up to the caller to make sure this node * is alive */ - lock = list_entry (iter, struct dlm_lock, list); if (lock->ml.node != dlm->node_num) { spin_unlock(&res->spinlock); return lock->ml.node; @@ -3234,8 +3219,7 @@ static int dlm_add_migration_mle(struct void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter, *iter2; - struct dlm_master_list_entry *mle; + struct dlm_master_list_entry *mle, *next; struct dlm_lock_resource *res; unsigned int hash; @@ -3245,9 +3229,7 @@ top: /* clean the master list */ spin_lock(&dlm->master_lock); - list_for_each_safe(iter, iter2, &dlm->master_list) { - mle = list_entry(iter, struct dlm_master_list_entry, list); - + list_for_each_entry_safe(mle, next, &dlm->master_list, list) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); diff -puN 
fs/ocfs2/dlm/dlmrecovery.c~git-ocfs2 fs/ocfs2/dlm/dlmrecovery.c --- a/fs/ocfs2/dlm/dlmrecovery.c~git-ocfs2 +++ a/fs/ocfs2/dlm/dlmrecovery.c @@ -158,8 +158,7 @@ void dlm_dispatch_work(struct work_struc struct dlm_ctxt *dlm = container_of(work, struct dlm_ctxt, dispatched_work); LIST_HEAD(tmp_list); - struct list_head *iter, *iter2; - struct dlm_work_item *item; + struct dlm_work_item *item, *next; dlm_workfunc_t *workfunc; int tot=0; @@ -167,13 +166,12 @@ void dlm_dispatch_work(struct work_struc list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); - list_for_each_safe(iter, iter2, &tmp_list) { + list_for_each_entry(item, &tmp_list, list) { tot++; } mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); - list_for_each_safe(iter, iter2, &tmp_list) { - item = list_entry(iter, struct dlm_work_item, list); + list_for_each_entry_safe(item, next, &tmp_list, list) { workfunc = item->func; list_del_init(&item->list); @@ -549,7 +547,6 @@ static int dlm_remaster_locks(struct dlm { int status = 0; struct dlm_reco_node_data *ndata; - struct list_head *iter; int all_nodes_done; int destroy = 0; int pass = 0; @@ -567,8 +564,7 @@ static int dlm_remaster_locks(struct dlm /* safe to access the node data list without a lock, since this * process is the only one to change the list */ - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); ndata->state = DLM_RECO_NODE_DATA_REQUESTING; @@ -655,9 +651,7 @@ static int dlm_remaster_locks(struct dlm * done, or if anyone died */ all_nodes_done = 1; spin_lock(&dlm_reco_state_lock); - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); - + list_for_each_entry(ndata, &dlm->reco.node_data, list) { mlog(0, "checking recovery state of node %u\n", ndata->node_num); switch (ndata->state) { @@ -774,16 +768,14 
@@ static int dlm_init_recovery_area(struct static void dlm_destroy_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) { - struct list_head *iter, *iter2; - struct dlm_reco_node_data *ndata; + struct dlm_reco_node_data *ndata, *next; LIST_HEAD(tmplist); spin_lock(&dlm_reco_state_lock); list_splice_init(&dlm->reco.node_data, &tmplist); spin_unlock(&dlm_reco_state_lock); - list_for_each_safe(iter, iter2, &tmplist) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry_safe(ndata, next, &tmplist, list) { list_del_init(&ndata->list); kfree(ndata); } @@ -876,7 +868,6 @@ static void dlm_request_all_locks_worker struct dlm_lock_resource *res; struct dlm_ctxt *dlm; LIST_HEAD(resources); - struct list_head *iter; int ret; u8 dead_node, reco_master; int skip_all_done = 0; @@ -920,8 +911,7 @@ static void dlm_request_all_locks_worker /* any errors returned will be due to the new_master dying, * the dlm_reco_thread should detect this */ - list_for_each(iter, &resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry(res, &resources, recovering) { ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); if (ret < 0) { @@ -983,7 +973,6 @@ int dlm_reco_data_done_handler(struct o2 { struct dlm_ctxt *dlm = data; struct dlm_reco_data_done *done = (struct dlm_reco_data_done *)msg->buf; - struct list_head *iter; struct dlm_reco_node_data *ndata = NULL; int ret = -EINVAL; @@ -1000,8 +989,7 @@ int dlm_reco_data_done_handler(struct o2 dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); - list_for_each(iter, &dlm->reco.node_data) { - ndata = list_entry (iter, struct dlm_reco_node_data, list); + list_for_each_entry(ndata, &dlm->reco.node_data, list) { if (ndata->node_num != done->node_idx) continue; @@ -1049,13 +1037,11 @@ static void dlm_move_reco_locks_to_list( struct list_head *list, u8 dead_node) { - struct dlm_lock_resource *res; - struct list_head *iter, *iter2; 
+ struct dlm_lock_resource *res, *next; struct dlm_lock *lock; spin_lock(&dlm->spinlock); - list_for_each_safe(iter, iter2, &dlm->reco.resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ if (dlm_is_recovery_lock(res->lockname.name, @@ -1252,7 +1238,7 @@ int dlm_send_one_lockres(struct dlm_ctxt struct dlm_migratable_lockres *mres, u8 send_to, u8 flags) { - struct list_head *queue, *iter; + struct list_head *queue; int total_locks, i; u64 mig_cookie = 0; struct dlm_lock *lock; @@ -1278,9 +1264,7 @@ int dlm_send_one_lockres(struct dlm_ctxt total_locks = 0; for (i=DLM_GRANTED_LIST; i<=DLM_BLOCKED_LIST; i++) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); - + list_for_each_entry(lock, queue, list) { /* add another lock. 
*/ total_locks++; if (!dlm_add_lock_to_array(lock, mres, i)) @@ -1717,7 +1701,6 @@ static int dlm_process_recovery_data(str struct dlm_lockstatus *lksb = NULL; int ret = 0; int i, j, bad; - struct list_head *iter; struct dlm_lock *lock = NULL; u8 from = O2NM_MAX_NODES; unsigned int added = 0; @@ -1755,8 +1738,7 @@ static int dlm_process_recovery_data(str spin_lock(&res->spinlock); for (j = DLM_GRANTED_LIST; j <= DLM_BLOCKED_LIST; j++) { tmpq = dlm_list_idx_to_ptr(res, j); - list_for_each(iter, tmpq) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, tmpq, list) { if (lock->ml.cookie != ml->cookie) lock = NULL; else @@ -1930,8 +1912,8 @@ void dlm_move_lockres_to_recovery_list(s struct dlm_lock_resource *res) { int i; - struct list_head *queue, *iter, *iter2; - struct dlm_lock *lock; + struct list_head *queue; + struct dlm_lock *lock, *next; res->state |= DLM_LOCK_RES_RECOVERING; if (!list_empty(&res->recovering)) { @@ -1947,8 +1929,7 @@ void dlm_move_lockres_to_recovery_list(s /* find any pending locks and put them back on proper list */ for (i=DLM_BLOCKED_LIST; i>=DLM_GRANTED_LIST; i--) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each_safe(iter, iter2, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, queue, list) { dlm_lock_get(lock); if (lock->convert_pending) { /* move converting lock back to granted */ @@ -2013,18 +1994,15 @@ static void dlm_finish_local_lockres_rec u8 dead_node, u8 new_master) { int i; - struct list_head *iter, *iter2; struct hlist_node *hash_iter; struct hlist_head *bucket; - - struct dlm_lock_resource *res; + struct dlm_lock_resource *res, *next; mlog_entry_void(); assert_spin_locked(&dlm->spinlock); - list_for_each_safe(iter, iter2, &dlm->reco.resources) { - res = list_entry (iter, struct dlm_lock_resource, recovering); + list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { if (res->owner == dead_node) { list_del_init(&res->recovering); 
spin_lock(&res->spinlock); @@ -2099,7 +2077,7 @@ static inline int dlm_lvb_needs_invalida static void dlm_revalidate_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 dead_node) { - struct list_head *iter, *queue; + struct list_head *queue; struct dlm_lock *lock; int blank_lvb = 0, local = 0; int i; @@ -2121,8 +2099,7 @@ static void dlm_revalidate_lvb(struct dl for (i=DLM_GRANTED_LIST; i<=DLM_CONVERTING_LIST; i++) { queue = dlm_list_idx_to_ptr(res, i); - list_for_each(iter, queue) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { if (lock->ml.node == search_node) { if (dlm_lvb_needs_invalidation(lock, local)) { /* zero the lksb lvb and lockres lvb */ @@ -2143,8 +2120,7 @@ static void dlm_revalidate_lvb(struct dl static void dlm_free_dead_locks(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, u8 dead_node) { - struct list_head *iter, *tmpiter; - struct dlm_lock *lock; + struct dlm_lock *lock, *next; unsigned int freed = 0; /* this node is the lockres master: @@ -2155,24 +2131,21 @@ static void dlm_free_dead_locks(struct d assert_spin_locked(&res->spinlock); /* TODO: check pending_asts, pending_basts here */ - list_for_each_safe(iter, tmpiter, &res->granted) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->granted, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); freed++; } } - list_for_each_safe(iter, tmpiter, &res->converting) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->converting, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); freed++; } } - list_for_each_safe(iter, tmpiter, &res->blocked) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry_safe(lock, next, &res->blocked, list) { if (lock->ml.node == dead_node) { list_del_init(&lock->list); dlm_lock_put(lock); diff -puN fs/ocfs2/dlmglue.c~git-ocfs2 
fs/ocfs2/dlmglue.c --- a/fs/ocfs2/dlmglue.c~git-ocfs2 +++ a/fs/ocfs2/dlmglue.c @@ -600,15 +600,13 @@ static inline int ocfs2_highest_compat_l static void lockres_set_flags(struct ocfs2_lock_res *lockres, unsigned long newflags) { - struct list_head *pos, *tmp; - struct ocfs2_mask_waiter *mw; + struct ocfs2_mask_waiter *mw, *tmp; assert_spin_locked(&lockres->l_lock); lockres->l_flags = newflags; - list_for_each_safe(pos, tmp, &lockres->l_mask_waiters) { - mw = list_entry(pos, struct ocfs2_mask_waiter, mw_item); + list_for_each_entry_safe(mw, tmp, &lockres->l_mask_waiters, mw_item) { if ((lockres->l_flags & mw->mw_mask) != mw->mw_goal) continue; diff -puN fs/ocfs2/extent_map.c~git-ocfs2 fs/ocfs2/extent_map.c --- a/fs/ocfs2/extent_map.c~git-ocfs2 +++ a/fs/ocfs2/extent_map.c @@ -109,17 +109,14 @@ static int ocfs2_extent_map_lookup(struc */ void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos) { - struct list_head *p, *n; - struct ocfs2_extent_map_item *emi; + struct ocfs2_extent_map_item *emi, *n; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_extent_map *em = &oi->ip_extent_map; LIST_HEAD(tmp_list); unsigned int range; spin_lock(&oi->ip_lock); - list_for_each_safe(p, n, &em->em_list) { - emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); - + list_for_each_entry_safe(emi, n, &em->em_list, ei_list) { if (emi->ei_cpos >= cpos) { /* Full truncate of this record. 
*/ list_move(&emi->ei_list, &tmp_list); @@ -136,8 +133,7 @@ void ocfs2_extent_map_trunc(struct inode } spin_unlock(&oi->ip_lock); - list_for_each_safe(p, n, &tmp_list) { - emi = list_entry(p, struct ocfs2_extent_map_item, ei_list); + list_for_each_entry_safe(emi, n, &tmp_list, ei_list) { list_del(&emi->ei_list); kfree(emi); } diff -puN fs/ocfs2/file.c~git-ocfs2 fs/ocfs2/file.c --- a/fs/ocfs2/file.c~git-ocfs2 +++ a/fs/ocfs2/file.c @@ -326,9 +326,6 @@ static int ocfs2_truncate_file(struct in (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); - fe = (struct ocfs2_dinode *) di_bh->b_data; if (!OCFS2_IS_VALID_DINODE(fe)) { OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); @@ -363,16 +360,23 @@ static int ocfs2_truncate_file(struct in if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + down_write(&OCFS2_I(inode)->ip_alloc_sem); + /* This forces other nodes to sync and drop their pages. Do * this even if we have a truncate without allocation change - * ocfs2 cluster sizes can be much greater than page size, so * we have to truncate them anyway. */ status = ocfs2_data_lock(inode, 1); if (status < 0) { + up_write(&OCFS2_I(inode)->ip_alloc_sem); + mlog_errno(status); goto bail; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. 
This does the task of marking @@ -399,6 +403,8 @@ static int ocfs2_truncate_file(struct in bail_unlock_data: ocfs2_data_unlock(inode, 1); + up_write(&OCFS2_I(inode)->ip_alloc_sem); + bail: mlog_exit(status); @@ -995,6 +1001,13 @@ int ocfs2_setattr(struct dentry *dentry, goto bail_unlock; } + /* + * This will intentionally not wind up calling vmtruncate(), + * since all the work for a size change has been done above. + * Otherwise, we could get into problems with truncate as + * ip_alloc_sem is used there to protect against i_size + * changes. + */ status = inode_setattr(inode, attr); if (status < 0) { mlog_errno(status); @@ -1329,15 +1342,16 @@ ocfs2_set_next_iovec(const struct iovec *basep = base; } -static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, +static struct page * ocfs2_get_write_source(char **ret_src_buf, const struct iovec *cur_iov, size_t iov_offset) { int ret; - char *buf; + char *buf = cur_iov->iov_base + iov_offset; struct page *src_page = NULL; + unsigned long off; - buf = cur_iov->iov_base + iov_offset; + off = (unsigned long)(buf) & ~PAGE_CACHE_MASK; if (!segment_eq(get_fs(), KERNEL_DS)) { /* @@ -1349,18 +1363,17 @@ static struct page * ocfs2_get_write_sou (unsigned long)buf & PAGE_CACHE_MASK, 1, 0, 0, &src_page, NULL); if (ret == 1) - bp->b_src_buf = kmap(src_page); + *ret_src_buf = kmap(src_page) + off; else src_page = ERR_PTR(-EFAULT); } else { - bp->b_src_buf = buf; + *ret_src_buf = buf; } return src_page; } -static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp, - struct page *page) +static void ocfs2_put_write_source(struct page *page) { if (page) { kunmap(page); @@ -1376,10 +1389,12 @@ static ssize_t ocfs2_file_buffered_write { int ret = 0; ssize_t copied, total = 0; - size_t iov_offset = 0; + size_t iov_offset = 0, bytes; + loff_t pos; const struct iovec *cur_iov = iov; - struct ocfs2_buffered_write_priv bp; - struct page *page; + struct page *user_page, *page; + char *buf, *dst; + void 
*fsdata; /* * handle partial DIO write. Adjust cur_iov if needed. @@ -1387,21 +1402,39 @@ static ssize_t ocfs2_file_buffered_write ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); do { - bp.b_cur_off = iov_offset; - bp.b_cur_iov = cur_iov; + pos = *ppos; - page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset); + if (IS_ERR(user_page)) { + ret = PTR_ERR(user_page); goto out; } - copied = ocfs2_buffered_write_cluster(file, *ppos, count, - ocfs2_map_and_write_user_data, - &bp); + /* Stay within our page boundaries */ + bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)), + (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK))); + /* Stay within the vector boundary */ + bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset); + /* Stay within count */ + bytes = min(bytes, count); + + page = NULL; + ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0, + &page, &fsdata); + if (ret) { + ocfs2_put_write_source(user_page); + mlog_errno(ret); + goto out; + } - ocfs2_put_write_source(&bp, page); + dst = kmap_atomic(page, KM_USER0); + memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes); + kunmap_atomic(dst, KM_USER0); + flush_dcache_page(page); + ocfs2_put_write_source(user_page); + copied = ocfs2_write_end(file, file->f_mapping, pos, bytes, + bytes, page, fsdata); if (copied < 0) { mlog_errno(copied); ret = copied; @@ -1409,7 +1442,7 @@ static ssize_t ocfs2_file_buffered_write } total += copied; - *ppos = *ppos + copied; + *ppos = pos + copied; count -= copied; ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); @@ -1579,52 +1612,46 @@ static int ocfs2_splice_write_actor(stru struct pipe_buffer *buf, struct splice_desc *sd) { - int ret, count, total = 0; + int ret, count; ssize_t copied = 0; - struct ocfs2_splice_write_priv sp; + struct file *file = sd->file; + unsigned int offset; + struct page *page = NULL; + void *fsdata; + char *src, *dst; 
ret = buf->ops->pin(pipe, buf); if (ret) goto out; - sp.s_sd = sd; - sp.s_buf = buf; - sp.s_pipe = pipe; - sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; - sp.s_buf_offset = buf->offset; - + offset = sd->pos & ~PAGE_CACHE_MASK; count = sd->len; - if (count + sp.s_offset > PAGE_CACHE_SIZE) - count = PAGE_CACHE_SIZE - sp.s_offset; + if (count + offset > PAGE_CACHE_SIZE) + count = PAGE_CACHE_SIZE - offset; - do { - /* - * splice wants us to copy up to one page at a - * time. For pagesize > cluster size, this means we - * might enter ocfs2_buffered_write_cluster() more - * than once, so keep track of our progress here. - */ - copied = ocfs2_buffered_write_cluster(sd->file, - (loff_t)sd->pos + total, - count, - ocfs2_map_and_write_splice_data, - &sp); - if (copied < 0) { - mlog_errno(copied); - ret = copied; - goto out; - } - - count -= copied; - sp.s_offset += copied; - sp.s_buf_offset += copied; - total += copied; - } while (count); + ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0, + &page, &fsdata); + if (ret) { + mlog_errno(ret); + goto out; + } - ret = 0; + src = buf->ops->map(pipe, buf, 1); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + buf->offset, count); + kunmap_atomic(dst, KM_USER1); + buf->ops->unmap(pipe, buf, src); + + copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count, + page, fsdata); + if (copied < 0) { + mlog_errno(copied); + ret = copied; + goto out; + } out: - return total ? total : ret; + return copied ? 
copied : ret; } static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, diff -puN fs/ocfs2/journal.c~git-ocfs2 fs/ocfs2/journal.c --- a/fs/ocfs2/journal.c~git-ocfs2 +++ a/fs/ocfs2/journal.c @@ -722,8 +722,7 @@ void ocfs2_complete_recovery(struct work container_of(work, struct ocfs2_journal, j_recovery_work); struct ocfs2_super *osb = journal->j_osb; struct ocfs2_dinode *la_dinode, *tl_dinode; - struct ocfs2_la_recovery_item *item; - struct list_head *p, *n; + struct ocfs2_la_recovery_item *item, *n; LIST_HEAD(tmp_la_list); mlog_entry_void(); @@ -734,8 +733,7 @@ void ocfs2_complete_recovery(struct work list_splice_init(&journal->j_la_cleanups, &tmp_la_list); spin_unlock(&journal->j_lock); - list_for_each_safe(p, n, &tmp_la_list) { - item = list_entry(p, struct ocfs2_la_recovery_item, lri_list); + list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) { list_del_init(&item->lri_list); mlog(0, "Complete recovery for slot %d\n", item->lri_slot); diff -puN fs/ocfs2/mmap.c~git-ocfs2 fs/ocfs2/mmap.c --- a/fs/ocfs2/mmap.c~git-ocfs2 +++ a/fs/ocfs2/mmap.c @@ -37,11 +37,29 @@ #include "ocfs2.h" +#include "aops.h" #include "dlmglue.h" #include "file.h" #include "inode.h" #include "mmap.h" +static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) +{ + /* The best way to deal with signals in the vm path is + * to block them upfront, rather than allowing the + * locking paths to return -ERESTARTSYS. 
*/ + sigfillset(blocked); + + /* We should technically never get a bad return value + * from sigprocmask */ + return sigprocmask(SIG_BLOCK, blocked, oldset); +} + +static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) +{ + return sigprocmask(SIG_SETMASK, oldset, NULL); +} + static struct page *ocfs2_fault(struct vm_area_struct *area, struct fault_data *fdata) { @@ -51,14 +69,7 @@ static struct page *ocfs2_fault(struct v mlog_entry("(area=%p, page offset=%lu)\n", area, fdata->pgoff); - /* The best way to deal with signals in this path is - * to block them upfront, rather than allowing the - * locking paths to return -ERESTARTSYS. */ - sigfillset(&blocked); - - /* We should technically never get a bad ret return - * from sigprocmask */ - ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); if (ret < 0) { fdata->type = VM_FAULT_SIGBUS; mlog_errno(ret); @@ -67,7 +78,7 @@ static struct page *ocfs2_fault(struct v page = filemap_fault(area, fdata); - ret = sigprocmask(SIG_SETMASK, &oldset, NULL); + ret = ocfs2_vm_op_unblock_sigs(&oldset); if (ret < 0) mlog_errno(ret); out: @@ -75,27 +86,135 @@ out: return page; } +static int __ocfs2_page_mkwrite(struct inode *inode, struct buffer_head *di_bh, + struct page *page) +{ + int ret; + struct address_space *mapping = inode->i_mapping; + loff_t pos = (loff_t)page->index << PAGE_CACHE_SHIFT; + unsigned int len = PAGE_CACHE_SIZE; + pgoff_t last_index; + struct page *locked_page = NULL; + void *fsdata; + loff_t size = i_size_read(inode); + + /* + * Another node might have truncated while we were waiting on + * cluster locks. + */ + last_index = size >> PAGE_CACHE_SHIFT; + if (page->index > last_index) { + ret = -EINVAL; + goto out; + } + + /* + * The i_size check above doesn't catch the case where nodes + * truncated and then re-extended the file. We'll re-check the + * page mapping after taking the page lock inside of + * ocfs2_write_begin_nolock(). 
+ */ + if (!PageUptodate(page) || page->mapping != inode->i_mapping) { + ret = -EINVAL; + goto out; + } + + /* + * Call ocfs2_write_begin() and ocfs2_write_end() to take + * advantage of the allocation code there. We pass a write + * length of the whole page (chopped to i_size) to make sure + * the whole thing is allocated. + * + * Since we know the page is up to date, we don't have to + * worry about ocfs2_write_begin() skipping some buffer reads + * because the "write" would invalidate their data. + */ + if (page->index == last_index) + len = size & ~PAGE_CACHE_MASK; + + ret = ocfs2_write_begin_nolock(mapping, pos, len, 0, &locked_page, + &fsdata, di_bh, page); + if (ret) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + BUG_ON(ret != len); + ret = 0; +out: + return ret; +} + +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct buffer_head *di_bh = NULL; + sigset_t blocked, oldset; + int ret, ret2; + + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + /* + * The cluster locks taken will block a truncate from another + * node. Taking the data lock will also ensure that we don't + * attempt page truncation as part of a downconvert. + */ + ret = ocfs2_meta_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* + * The alloc sem should be enough to serialize with + * ocfs2_truncate_file() changing i_size as well as any thread + * modifying the inode btree. 
+ */ + down_write(&OCFS2_I(inode)->ip_alloc_sem); + + ret = ocfs2_data_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_meta_unlock; + } + + ret = __ocfs2_page_mkwrite(inode, di_bh, page); + + ocfs2_data_unlock(inode, 1); + +out_meta_unlock: + up_write(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(di_bh); + ocfs2_meta_unlock(inode, 1); + +out: + ret2 = ocfs2_vm_op_unblock_sigs(&oldset); + if (ret2 < 0) + mlog_errno(ret2); + + return ret; +} + static struct vm_operations_struct ocfs2_file_vm_ops = { .fault = ocfs2_fault, + .page_mkwrite = ocfs2_page_mkwrite, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { int ret = 0, lock_level = 0; - struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb); - - /* - * Only support shared writeable mmap for local mounts which - * don't know about holes. - */ - if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) && - ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) && - ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { - mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); - /* This is -EINVAL because generic_file_readonly_mmap - * returns it in a similar situation. */ - return -EINVAL; - } ret = ocfs2_meta_lock_atime(file->f_dentry->d_inode, file->f_vfsmnt, &lock_level); _