Index: linux-2.6/mm/filemap.c =================================================================== --- linux-2.6.orig/mm/filemap.c +++ linux-2.6/mm/filemap.c @@ -30,7 +30,7 @@ #include #include #include -#include "filemap.h" +#include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include "internal.h" /* @@ -664,26 +664,22 @@ EXPORT_SYMBOL(find_lock_page); struct page *find_or_create_page(struct address_space *mapping, unsigned long index, gfp_t gfp_mask) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_lock_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = alloc_page(gfp_mask); - if (!cached_page) - return NULL; - } - err = add_to_page_cache_lru(cached_page, mapping, - index, gfp_mask); - if (!err) { - page = cached_page; - cached_page = NULL; - } else if (err == -EEXIST) - goto repeat; + page = alloc_page(gfp_mask); + if (!page) + return NULL; + err = add_to_page_cache_lru(page, mapping, index, gfp_mask); + if (unlikely(err)) { + page_cache_release(page); + page = NULL; + if (err == -EEXIST) + goto repeat; + } } - if (cached_page) - page_cache_release(cached_page); return page; } EXPORT_SYMBOL(find_or_create_page); @@ -869,11 +865,9 @@ void do_generic_mapping_read(struct addr unsigned long next_index; unsigned long prev_index; loff_t isize; - struct page *cached_page; int error; struct file_ra_state ra = *_ra; - cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; next_index = index; prev_index = ra.prev_page; @@ -1037,23 +1031,20 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) { - desc->error = -ENOMEM; - goto out; - } + page = page_cache_alloc_cold(mapping); + if (!page) { + desc->error = -ENOMEM; + goto out; } - error = add_to_page_cache_lru(cached_page, mapping, + error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); if (error) { + page_cache_release(page); if (error == -EEXIST) goto find_page; desc->error = error; goto out; } - page = cached_page; - cached_page = NULL; goto readpage; } @@ -1061,8 +1052,6 @@ out: *_ra = ra; *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - if (cached_page) - page_cache_release(cached_page); if (filp) file_accessed(filp); } @@ -1731,35 +1720,28 @@ static inline struct page *__read_cache_ int (*filler)(void *,struct page*), void *data) { - struct page *page, *cached_page = NULL; + struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - if (!cached_page) { - cached_page = page_cache_alloc_cold(mapping); - if (!cached_page) - return ERR_PTR(-ENOMEM); - } - err = add_to_page_cache_lru(cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err < 0) { + page = page_cache_alloc_cold(mapping); + if (!page) + return ERR_PTR(-ENOMEM); + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(err)) { + page_cache_release(page); + if (err == -EEXIST) + goto repeat; /* Presumably ENOMEM for radix tree node */ - page_cache_release(cached_page); return ERR_PTR(err); } - page = cached_page; - cached_page = NULL; err = filler(data, page); if (err < 0) { page_cache_release(page); page = ERR_PTR(err); } } - if (cached_page) - page_cache_release(cached_page); return page; } @@ -1810,40 +1792,6 @@ retry: EXPORT_SYMBOL(read_cache_page); /* - * If the page was newly created, increment its refcount and add it to the - * caller's lru-buffering pagevec.
This function is specifically for - * generic_file_write(). - */ -static inline struct page * -__grab_cache_page(struct address_space *mapping, unsigned long index, - struct page **cached_page, struct pagevec *lru_pvec) -{ - int err; - struct page *page; -repeat: - page = find_lock_page(mapping, index); - if (!page) { - if (!*cached_page) { - *cached_page = page_cache_alloc(mapping); - if (!*cached_page) - return NULL; - } - err = add_to_page_cache(*cached_page, mapping, - index, GFP_KERNEL); - if (err == -EEXIST) - goto repeat; - if (err == 0) { - page = *cached_page; - page_cache_get(page); - if (!pagevec_add(lru_pvec, page)) - __pagevec_lru_add(lru_pvec); - *cached_page = NULL; - } - } - return page; -} - -/* * The logic we want is * * if suid or (sgid and xgrp) @@ -1891,8 +1839,7 @@ int remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(remove_suid); -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, +static size_t __iovec_copy_from_user_inatomic(char *vaddr, const struct iovec *iov, size_t base, size_t bytes) { size_t copied = 0, left = 0; @@ -1915,6 +1862,110 @@ __filemap_copy_from_user_iovec_inatomic( } /* + * Copy as much as we can into the page and return the number of bytes which + * were successfully copied. If a fault is encountered then return the number + * of bytes which were copied up to that point. + */ +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) + { + char *kaddr; + size_t copied; + + BUG_ON(!in_atomic()); + kaddr = kmap_atomic(page, KM_USER0); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_inatomic_nocache(kaddr + offset, + buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap_atomic(kaddr, KM_USER0); + + return copied; +} + +/* + * This has the same side effects and return value as + * iov_iter_copy_from_user_atomic(). + * The difference is that it attempts to resolve faults. + * Page must not be locked. + */ +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes) +{ + char *kaddr; + size_t copied; + + kaddr = kmap(page); + if (likely(i->nr_segs == 1)) { + int left; + char __user *buf = i->iov->iov_base + i->iov_offset; + left = __copy_from_user_nocache(kaddr + offset, buf, bytes); + copied = bytes - left; + } else { + copied = __iovec_copy_from_user_inatomic(kaddr + offset, + i->iov, i->iov_offset, bytes); + } + kunmap(page); + return copied; +} + +static void __iov_iter_advance_iov(struct iov_iter *i, size_t bytes) +{ + if (likely(i->nr_segs == 1)) { + i->iov_offset += bytes; + } else { + const struct iovec *iov = i->iov; + size_t base = i->iov_offset; + + while (bytes) { + int copy = min(bytes, iov->iov_len - base); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + base = 0; + } + } + i->iov = iov; + i->iov_offset = base; + } +} + +void iov_iter_advance(struct iov_iter *i, size_t bytes) +{ + BUG_ON(i->count < bytes); + + __iov_iter_advance_iov(i, bytes); + i->count -= bytes; +} + +int iov_iter_fault_in_readable(struct iov_iter *i) +{ + size_t seglen = min(i->iov->iov_len - i->iov_offset, i->count); + char __user *buf = i->iov->iov_base + i->iov_offset; + return fault_in_pages_readable(buf, seglen); +} + +/* + * Return the count of just the current iov_iter segment.
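
For readers new to the iterator, the segment-walking arithmetic in __iov_iter_advance_iov() above is the subtle part: the cursor is an (iov, iov_offset) pair that may step across several short segments in one advance. A stand-alone user-space model of just that walk (the struct mirrors iov_iter, but this is an illustrative sketch, not kernel code):

	#include <assert.h>
	#include <stddef.h>
	#include <sys/uio.h>

	/* user-space model of the iov_iter cursor */
	struct iter {
		const struct iovec *iov;	/* current segment */
		unsigned long nr_segs;
		size_t iov_offset;		/* offset into current segment */
		size_t count;			/* total bytes remaining */
	};

	/* mirrors iov_iter_advance(): consume 'bytes', crossing segments */
	static void iter_advance(struct iter *i, size_t bytes)
	{
		assert(bytes <= i->count);
		i->count -= bytes;
		while (bytes) {
			size_t step = i->iov->iov_len - i->iov_offset;
			if (step > bytes)
				step = bytes;
			bytes -= step;
			i->iov_offset += step;
			if (i->iov_offset == i->iov->iov_len) {
				i->iov++;		/* segment exhausted */
				i->iov_offset = 0;
			}
		}
	}

	int main(void)
	{
		char a[3] = "abc", b[5] = "defgh";
		struct iovec v[2] = { { a, sizeof(a) }, { b, sizeof(b) } };
		struct iter i = { v, 2, 0, sizeof(a) + sizeof(b) };

		iter_advance(&i, 4);	/* eats all of 'a' plus one byte of 'b' */
		assert(i.iov == &v[1] && i.iov_offset == 1 && i.count == 4);
		return 0;
	}

A short atomic copy simply advances by less than it attempted, which is why iov_iter_advance(i, copied), with copied possibly zero, composes cleanly with the retry logic in generic_perform_write() below.
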
+ */ +size_t iov_iter_single_seg_count(struct iov_iter *i) +{ + const struct iovec *iov = i->iov; + if (i->nr_segs == 1) + return i->count; + else + return min(i->count, iov->iov_len - i->iov_offset); +} + +/* * Performs necessary checks before doing a write * * Can adjust writing position or amount of bytes to write. @@ -1998,6 +2049,89 @@ inline int generic_write_checks(struct f } EXPORT_SYMBOL(generic_write_checks); +int pagecache_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + + if (aops->write_begin) { + return aops->write_begin(file, mapping, pos, len, flags, + pagep, fsdata); + } else { + int ret; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + struct page *page; +again: + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + return -ENOMEM; + + if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) { + /* + * There is no way to resolve a short write situation + * for a !Uptodate page (except by double copying in + * the caller done by generic_perform_write_2copy). + * + * Instead, we have to bring it uptodate here. + */ + ret = aops->readpage(file, page); + page_cache_release(page); + if (ret) { + if (ret == AOP_TRUNCATED_PAGE) + goto again; + return ret; + } + goto again; + } + + ret = aops->prepare_write(file, page, offset, offset+len); + if (ret) { + unlock_page(page); + page_cache_release(page); + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + return ret; + } +} +EXPORT_SYMBOL(pagecache_write_begin); + +int pagecache_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + const struct address_space_operations *aops = mapping->a_ops; + int ret; + + if (aops->write_begin) { + ret = aops->write_end(file, mapping, pos, len, copied, + page, fsdata); + } else { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + struct inode *inode = mapping->host; + + flush_dcache_page(page); + ret = aops->commit_write(file, page, offset, offset+len); + unlock_page(page); + page_cache_release(page); + + if (ret < 0) { + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } else if (ret > 0) + ret = min_t(size_t, copied, ret); + else + ret = copied; + } + + return ret; +} +EXPORT_SYMBOL(pagecache_write_end); + ssize_t generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long *nr_segs, loff_t pos, loff_t *ppos, @@ -2037,151 +2171,316 @@ generic_file_direct_write(struct kiocb * } EXPORT_SYMBOL(generic_file_direct_write); -ssize_t -generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos, loff_t *ppos, - size_t count, ssize_t written) +/* + * Find or create a page at the given pagecache position. Return the locked + * page. This function is specifically for buffered writes. 
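
Caller-side, the two wrappers above give a balanced begin/copy/end protocol whichever aops the filesystem implements. A schematic in-kernel user, with error handling trimmed (my_fill_page() is a hypothetical stand-in for whatever copies data into the locked page; the loop.c conversion later in this patch is the real example):

	static int example_write(struct file *file, loff_t pos, unsigned len)
	{
		struct address_space *mapping = file->f_mapping;
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		struct page *page;
		void *fsdata;
		int ret;

		ret = pagecache_write_begin(file, mapping, pos, len, 0,
						&page, &fsdata);
		if (ret)
			return ret;	/* on failure, no page to clean up here */

		my_fill_page(page, offset, len);	/* hypothetical copy step */

		/*
		 * write_end unlocks and releases the page, and returns how
		 * many bytes (<= len) actually made it into the file, or < 0.
		 */
		ret = pagecache_write_end(file, mapping, pos, len, len,
						page, fsdata);
		return ret < 0 ? ret : 0;
	}
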
+ */ +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) { - struct file *file = iocb->ki_filp; - struct address_space * mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; - long status = 0; - struct page *page; - struct page *cached_page = NULL; - size_t bytes; - struct pagevec lru_pvec; - const struct iovec *cur_iov = iov; /* current iovec */ - size_t iov_base = 0; /* offset in the current iovec */ - char __user *buf; - - pagevec_init(&lru_pvec, 0); - - /* - * handle partial DIO write. Adjust cur_iov if needed. - */ - if (likely(nr_segs == 1)) - buf = iov->iov_base + written; - else { - filemap_set_next_iovec(&cur_iov, &iov_base, written); - buf = cur_iov->iov_base + iov_base; + int status; + struct page *page; +repeat: + page = find_lock_page(mapping, index); + if (likely(page)) + return page; + + page = page_cache_alloc(mapping); + if (!page) + return NULL; + status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); + if (unlikely(status)) { + page_cache_release(page); + if (status == -EEXIST) + goto repeat; + return NULL; } + return page; +} +EXPORT_SYMBOL(__grab_cache_page); + +static ssize_t generic_perform_write_2copy(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + long status = 0; + ssize_t written = 0; do { - unsigned long index; - unsigned long offset; - size_t copied; + struct page *src_page; + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ - offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ + offset = (pos & (PAGE_CACHE_SIZE - 1)); index = pos >> PAGE_CACHE_SHIFT; - bytes = PAGE_CACHE_SIZE - offset; - - /* Limit the size of the copy to the caller's write size */ - bytes = min(bytes, count); + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); - /* We only need to worry about prefaulting when writes are from - * user-space. NFSd uses vfs_writev with several non-aligned - * segments in the vector, and limiting to one segment a time is - * a noticeable performance for re-write + /* + * a non-NULL src_page indicates that we're doing the + * copy via get_user_pages and kmap. */ - if (!segment_eq(get_fs(), KERNEL_DS)) { - /* - * Limit the size of the copy to that of the current - * segment, because fault_in_pages_readable() doesn't - * know how to walk segments. - */ - bytes = min(bytes, cur_iov->iov_len - iov_base); + src_page = NULL; - /* - * Bring in the user page that we will copy from - * _first_. Otherwise there's a nasty deadlock on - * copying from the same page as we're writing to, - * without it being marked up-to-date. - */ - fault_in_pages_readable(buf, bytes); + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. 
+ */ + if (unlikely(iov_iter_fault_in_readable(i))) { + status = -EFAULT; + break; } - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec); + + page = __grab_cache_page(mapping, index); if (!page) { status = -ENOMEM; break; } - if (unlikely(bytes == 0)) { - status = 0; - copied = 0; - goto zero_length_segment; - } + /* + * non-uptodate pages cannot cope with short copies, and we + * cannot take a pagefault with the destination page locked. + * So pin the source page to copy it. + */ + if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) { + unlock_page(page); - status = a_ops->prepare_write(file, page, offset, offset+bytes); - if (unlikely(status)) { - loff_t isize = i_size_read(inode); + src_page = alloc_page(GFP_KERNEL); + if (!src_page) { + page_cache_release(page); + status = -ENOMEM; + break; + } + + /* + * Cannot get_user_pages with a page locked for the + * same reason as we can't take a page fault with a + * page locked (as explained below). + */ + copied = iov_iter_copy_from_user(src_page, i, + offset, bytes); + if (unlikely(copied == 0)) { + status = -EFAULT; + page_cache_release(page); + page_cache_release(src_page); + break; + } + bytes = copied; - if (status != AOP_TRUNCATED_PAGE) + lock_page(page); + /* + * Can't handle the page going uptodate here, because + * that means we would use non-atomic usercopies, which + * zero out the tail of the page, which can cause + * zeroes to become transiently visible. We could just + * use a non-zeroing copy, but the APIs aren't too + * consistent. + */ + if (unlikely(!page->mapping || PageUptodate(page))) { unlock_page(page); - page_cache_release(page); - if (status == AOP_TRUNCATED_PAGE) + page_cache_release(page); + page_cache_release(src_page); continue; + } + } + + status = a_ops->prepare_write(file, page, offset, offset+bytes); + if (unlikely(status)) + goto fs_write_aop_error; + + if (!src_page) { /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. + * Must not enter the pagefault handler here, because + * we hold the page lock, so we might recursively + * deadlock on the same lock, or get an ABBA deadlock + * against a different lock, or against the mmap_sem + * (which nests outside the page lock). So increment + * preempt count, and use _atomic usercopies. + * + * The page is uptodate so we are OK to encounter a + * short copy: if unmodified parts of the page are + * marked dirty and written out to disk, it doesn't + * really matter. 
*/ - if (pos + bytes > isize) - vmtruncate(inode, isize); - break; + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, + offset, bytes); + pagefault_enable(); + } else { + void *src, *dst; + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(page, KM_USER1); + memcpy(dst + offset, src + offset, bytes); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + copied = bytes; } - if (likely(nr_segs == 1)) - copied = filemap_copy_from_user(page, offset, - buf, bytes); - else - copied = filemap_copy_from_user_iovec(page, offset, - cur_iov, iov_base, bytes); flush_dcache_page(page); + status = a_ops->commit_write(file, page, offset, offset+bytes); - if (status == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } -zero_length_segment: - if (likely(copied >= 0)) { - if (!status) - status = copied; - - if (status >= 0) { - written += status; - count -= status; - pos += status; - buf += status; - if (unlikely(nr_segs > 1)) { - filemap_set_next_iovec(&cur_iov, - &iov_base, status); - if (count) - buf = cur_iov->iov_base + - iov_base; - } else { - iov_base += status; - } - } - } - if (unlikely(copied != bytes)) - if (status >= 0) - status = -EFAULT; + if (unlikely(status < 0)) + goto fs_write_aop_error; + if (unlikely(status > 0)) /* filesystem did partial write */ + copied = min_t(size_t, copied, status); + unlock_page(page); mark_page_accessed(page); page_cache_release(page); - if (status < 0) - break; + if (src_page) + page_cache_release(src_page); + + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); cond_resched(); - } while (count); - *ppos = pos; + continue; + +fs_write_aop_error: + unlock_page(page); + page_cache_release(page); + if (src_page) + page_cache_release(src_page); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + bytes > inode->i_size) + vmtruncate(inode, inode->i_size); + break; + } while (iov_iter_count(i)); - if (cached_page) - page_cache_release(cached_page); + return written ? written : status; +} + +static ssize_t generic_perform_write(struct file *file, + struct iov_iter *i, loff_t pos) +{ + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + long status = 0; + ssize_t written = 0; + unsigned int flags = 0; /* - * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC + * Copies from kernel address space cannot fail (NFSD is a big user). */ + if (segment_eq(get_fs(), KERNEL_DS)) + flags |= AOP_FLAG_UNINTERRUPTIBLE; + + do { + struct page *page; + pgoff_t index; /* Pagecache index for current page */ + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + size_t copied; /* Bytes copied from user */ + void *fsdata; + + offset = (pos & (PAGE_CACHE_SIZE - 1)); + index = pos >> PAGE_CACHE_SHIFT; + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_count(i)); + +again: + + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. 
*/ + if (unlikely(iov_iter_fault_in_readable(i))) { + status = -EFAULT; + break; + } + + status = a_ops->write_begin(file, mapping, pos, bytes, flags, + &page, &fsdata); + if (unlikely(status)) + break; + + pagefault_disable(); + copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + status = a_ops->write_end(file, mapping, pos, bytes, copied, + page, fsdata); + if (unlikely(status < 0)) + break; + copied = status; + + cond_resched(); + + if (unlikely(copied == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fall back here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, + iov_iter_single_seg_count(i)); + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + + balance_dirty_pages_ratelimited(mapping); + + } while (iov_iter_count(i)); + + return written ? written : status; +} + +ssize_t +generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos, loff_t *ppos, + size_t count, ssize_t written) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + struct inode *inode = mapping->host; + ssize_t status; + struct iov_iter i; + + iov_iter_init(&i, iov, nr_segs, count, written); + if (a_ops->perform_write) + status = a_ops->perform_write(file, mapping, &i, pos); + else if (a_ops->write_begin) + status = generic_perform_write(file, &i, pos); + else + status = generic_perform_write_2copy(file, &i, pos); + if (likely(status >= 0)) { + written += status; + *ppos = pos + status; + + /* + * For now, when the user asks for O_SYNC, we'll actually give + * O_DSYNC + */ if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { if (!a_ops->writepage || !is_sync_kiocb(iocb)) status = generic_osync_inode(inode, mapping, @@ -2197,7 +2496,6 @@ zero_length_segment: if (unlikely(file->f_flags & O_DIRECT) && written) status = filemap_write_and_wait(mapping); - pagevec_lru_add(&lru_pvec); return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); Index: linux-2.6/mm/filemap.h =================================================================== --- linux-2.6.orig/mm/filemap.h +++ /dev/null @@ -1,103 +0,0 @@ -/* - * linux/mm/filemap.h - * - * Copyright (C) 1994-1999 Linus Torvalds - */ - -#ifndef __FILEMAP_H -#define __FILEMAP_H - -#include -#include -#include -#include -#include -#include - -size_t -__filemap_copy_from_user_iovec_inatomic(char *vaddr, - const struct iovec *iov, - size_t base, - size_t bytes); - -/* - * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then clear the page - * out to (offset+bytes) and return the number of bytes which were copied. - * - * NOTE: For this to work reliably we really want copy_from_user_inatomic_nocache - * to *NOT* zero any tail of the buffer that it failed to copy. If it does, - * and if the following non-atomic copy succeeds, then there is a small window - * where the target page contains neither the data before the write, nor the - * data after the write (it contains zero). A read at this time will see - * data that is inconsistent with any ordering of the read and the write. - * (This has been detected in practice).
- */ -static inline size_t -filemap_copy_from_user(struct page *page, unsigned long offset, - const char __user *buf, unsigned bytes) -{ - char *kaddr; - int left; - - kaddr = kmap_atomic(page, KM_USER0); - left = __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); - kunmap_atomic(kaddr, KM_USER0); - - if (left != 0) { - /* Do it the slow way */ - kaddr = kmap(page); - left = __copy_from_user_nocache(kaddr + offset, buf, bytes); - kunmap(page); - } - return bytes - left; -} - -/* - * This has the same sideeffects and return value as filemap_copy_from_user(). - * The difference is that on a fault we need to memset the remainder of the - * page (out to offset+bytes), to emulate filemap_copy_from_user()'s - * single-segment behaviour. - */ -static inline size_t -filemap_copy_from_user_iovec(struct page *page, unsigned long offset, - const struct iovec *iov, size_t base, size_t bytes) -{ - char *kaddr; - size_t copied; - - kaddr = kmap_atomic(page, KM_USER0); - copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, - base, bytes); - kunmap_atomic(kaddr, KM_USER0); - if (copied != bytes) { - kaddr = kmap(page); - copied = __filemap_copy_from_user_iovec_inatomic(kaddr + offset, iov, - base, bytes); - if (bytes - copied) - memset(kaddr + offset + copied, 0, bytes - copied); - kunmap(page); - } - return copied; -} - -static inline void -filemap_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t base = *basep; - - do { - int copy = min(bytes, iov->iov_len - base); - - bytes -= copy; - base += copy; - if (iov->iov_len == base) { - iov++; - base = 0; - } - } while (bytes); - *iovp = iov; - *basep = base; -} -#endif Index: linux-2.6/fs/mpage.c =================================================================== --- linux-2.6.orig/fs/mpage.c +++ linux-2.6/fs/mpage.c @@ -389,31 +389,25 @@ mpage_readpages(struct address_space *ma struct bio *bio = NULL; unsigned page_idx; sector_t last_block_in_bio = 0; - struct pagevec lru_pvec; struct buffer_head map_bh; unsigned long first_logical_block = 0; clear_buffer_mapped(&map_bh); - pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); prefetchw(&page->flags); list_del(&page->lru); - if (!add_to_page_cache(page, mapping, + if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { bio = do_mpage_readpage(bio, page, nr_pages - page_idx, &last_block_in_bio, &map_bh, &first_logical_block, get_block); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else { - page_cache_release(page); } + page_cache_release(page); } - pagevec_lru_add(&lru_pvec); BUG_ON(!list_empty(pages)); if (bio) mpage_bio_submit(READ, bio); Index: linux-2.6/mm/readahead.c =================================================================== --- linux-2.6.orig/mm/readahead.c +++ linux-2.6/mm/readahead.c @@ -133,28 +133,25 @@ int read_cache_pages(struct address_spac int (*filler)(void *, struct page *), void *data) { struct page *page; - struct pagevec lru_pvec; int ret = 0; - pagevec_init(&lru_pvec, 0); - while (!list_empty(pages)) { page = list_to_page(pages); list_del(&page->lru); - if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) { + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) { page_cache_release(page); continue; } + page_cache_release(page); + ret = filler(data, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - if 
(ret) { + if (unlikely(ret)) { put_pages_list(pages); break; } task_io_account_read(PAGE_CACHE_SIZE); } - pagevec_lru_add(&lru_pvec); return ret; } @@ -164,7 +161,6 @@ static int read_pages(struct address_spa struct list_head *pages, unsigned nr_pages) { unsigned page_idx; - struct pagevec lru_pvec; int ret; if (mapping->a_ops->readpages) { @@ -174,19 +170,15 @@ static int read_pages(struct address_spa goto out; } - pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache(page, mapping, + if (!add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) { mapping->a_ops->readpage(filp, page); - if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); - } else - page_cache_release(page); + } + page_cache_release(page); } - pagevec_lru_add(&lru_pvec); ret = 0; out: return ret; Index: linux-2.6/include/linux/pagemap.h =================================================================== --- linux-2.6.orig/include/linux/pagemap.h +++ linux-2.6/include/linux/pagemap.h @@ -85,6 +85,8 @@ unsigned find_get_pages_contig(struct ad unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index); + /* * Returns locked page at given index in given cache, creating it if needed. */ @@ -196,6 +198,9 @@ static inline int fault_in_pages_writeab { int ret; + if (unlikely(size == 0)) + return 0; + /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. @@ -215,19 +220,23 @@ static inline int fault_in_pages_writeab return ret; } -static inline void fault_in_pages_readable(const char __user *uaddr, int size) +static inline int fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; int ret; + if (unlikely(size == 0)) + return 0; + ret = __get_user(c, uaddr); if (ret == 0) { const char __user *end = uaddr + size - 1; if (((unsigned long)uaddr & PAGE_MASK) != ((unsigned long)end & PAGE_MASK)) - __get_user(c, end); + ret = __get_user(c, end); } + return ret; } #endif /* _LINUX_PAGEMAP_H */ Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h +++ linux-2.6/include/linux/fs.h @@ -376,7 +376,7 @@ struct iattr { * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned - * by readpage(), prepare_write(), and commit_write(). + * by readpage(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a @@ -389,6 +389,8 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; +#define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ + /* * oh the beauties of C type declarations. 
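
Note the contract change to fault_in_pages_readable() in the pagemap.h hunk above: it now returns nonzero on failure instead of void, and both fault_in helpers bail out early for size == 0 (a zero-length segment must not dereference uaddr at all). The intended calling pattern, as a sketch:

	/* resolve faults up front, while no page locks are held */
	if (fault_in_pages_readable(buf, len))
		return -EFAULT;		/* genuinely bad user address */

	/*
	 * From here on, copies run under pagefault_disable(), so a short
	 * copy signals a race (e.g. the page was reclaimed again), not a
	 * bad address, and retrying is the right response.
	 */
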
*/ @@ -396,6 +398,39 @@ struct page; struct address_space; struct writeback_control; +struct iov_iter { + const struct iovec *iov; + unsigned long nr_segs; + size_t iov_offset; + size_t count; +}; + +size_t iov_iter_copy_from_user_atomic(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes); +size_t iov_iter_copy_from_user(struct page *page, + struct iov_iter *i, unsigned long offset, size_t bytes); +void iov_iter_advance(struct iov_iter *i, size_t bytes); +int iov_iter_fault_in_readable(struct iov_iter *i); +size_t iov_iter_single_seg_count(struct iov_iter *i); + +static inline void iov_iter_init(struct iov_iter *i, + const struct iovec *iov, unsigned long nr_segs, + size_t count, size_t written) +{ + i->iov = iov; + i->nr_segs = nr_segs; + i->iov_offset = 0; + i->count = count + written; + + iov_iter_advance(i, written); +} + +static inline size_t iov_iter_count(struct iov_iter *i) +{ + return i->count; +} + + struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); @@ -416,6 +451,17 @@ struct address_space_operations { */ int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + + ssize_t (*perform_write)(struct file *, struct address_space *, + struct iov_iter *, loff_t); + /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned long); @@ -430,6 +476,18 @@ struct address_space_operations { int (*launder_page) (struct page *); }; +/* + * pagecache_write_begin/pagecache_write_end must be used by general code + * to write into the pagecache. 
+ */ +int pagecache_write_begin(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + +int pagecache_write_end(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ @@ -1869,6 +1927,12 @@ extern int simple_prepare_write(struct f unsigned offset, unsigned to); extern int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to); +extern int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); extern struct dentry *simple_lookup(struct inode *, struct dentry *, struct nameidata *); extern ssize_t generic_read_dir(struct file *, char __user *, size_t, loff_t *); Index: linux-2.6/fs/buffer.c =================================================================== --- linux-2.6.orig/fs/buffer.c +++ linux-2.6/fs/buffer.c @@ -1799,7 +1799,9 @@ static int __block_prepare_write(struct unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); if (PageUptodate(page)) { + clear_buffer_new(bh); set_buffer_uptodate(bh); + mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) { @@ -1838,44 +1840,54 @@ static int __block_prepare_write(struct if (!buffer_uptodate(*wait_bh)) err = -EIO; } - if (!err) { - bh = head; - do { - if (buffer_new(bh)) - clear_buffer_new(bh); - } while ((bh = bh->b_this_page) != head); - return 0; - } - /* Error case: */ - /* - * Zero out any newly allocated blocks to avoid exposing stale - * data. If BH_New is set, we know that the block was newly - * allocated in the above loop. - */ - bh = head; + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + return err; +} + +/* + * If a page has any new buffers, zero them out here, and mark them uptodate + * and dirty so they'll be written out (in order to prevent uninitialised + * block data from leaking). And clear the new bit. 
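
The interesting detail in page_zero_new_buffers() below is the clamping: each new buffer is zeroed only where it intersects the written range [from, to), and only if the page is not already uptodate. A stand-alone user-space model of the intersection logic (made-up sizes, not kernel code):

	#include <assert.h>
	#include <string.h>

	/* zero the part of block [block_start, block_end) inside [from, to) */
	static void zero_new_block(char *page, unsigned block_start,
				   unsigned block_end, unsigned from, unsigned to)
	{
		if (block_end > from && block_start < to) {
			unsigned start = block_start > from ? block_start : from;
			unsigned end = block_end < to ? block_end : to;
			memset(page + start, 0, end - start);
		}
	}

	int main(void)
	{
		static char page[4096];

		memset(page, 'x', sizeof(page));
		/* a 1k buffer at [1024, 2048) meeting a write of [1500, 3000) */
		zero_new_block(page, 1024, 2048, 1500, 3000);
		assert(page[1499] == 'x');	/* below the write: untouched */
		assert(page[1500] == 0);	/* intersection: zeroed */
		assert(page[2047] == 0);
		assert(page[2048] == 'x');	/* next buffer: untouched */
		return 0;
	}
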
+ */ +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to) +{ + unsigned int block_start, block_end; + struct buffer_head *head, *bh; + + BUG_ON(!PageLocked(page)); + if (!page_has_buffers(page)) + return; + + bh = head = page_buffers(page); block_start = 0; do { - block_end = block_start+blocksize; - if (block_end <= from) - goto next_bh; - if (block_start >= to) - break; + block_end = block_start + bh->b_size; + if (buffer_new(bh)) { - void *kaddr; + if (block_end > from && block_start < to) { + if (!PageUptodate(page)) { + unsigned start, end; + void *kaddr; - clear_buffer_new(bh); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+block_start, 0, bh->b_size); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); + start = max(from, block_start); + end = min(to, block_end); + + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+start, 0, end - start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + set_buffer_uptodate(bh); + } + + clear_buffer_new(bh); + mark_buffer_dirty(bh); + } } -next_bh: + block_start = block_end; bh = bh->b_this_page; } while (bh != head); - return err; } static int __block_commit_write(struct inode *inode, struct page *page, @@ -1899,6 +1911,7 @@ static int __block_commit_write(struct i set_buffer_uptodate(bh); mark_buffer_dirty(bh); } + clear_buffer_new(bh); } /* @@ -1913,6 +1926,115 @@ static int __block_commit_write(struct i } /* + * block_write_begin takes care of the basic task of block allocation and + * bringing partial write blocks uptodate first. + * + * If *pagep is not NULL, then block_write_begin uses the locked page + * at *pagep rather than allocating its own. In this case, the page will + * not be unlocked or deallocated on failure. + */ +int block_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block) +{ + struct inode *inode = mapping->host; + int status = 0; + struct page *page; + pgoff_t index; + unsigned start, end; + int ownpage = 0; + + index = pos >> PAGE_CACHE_SHIFT; + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + len; + + page = *pagep; + if (page == NULL) { + ownpage = 1; + page = __grab_cache_page(mapping, index); + if (!page) { + status = -ENOMEM; + goto out; + } + *pagep = page; + } else + BUG_ON(!PageLocked(page)); + + status = __block_prepare_write(inode, page, start, end, get_block); + if (unlikely(status)) { + ClearPageUptodate(page); + + if (ownpage) { + unlock_page(page); + page_cache_release(page); + + /* + * prepare_write() may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); + } + goto out; + } + +out: + return status; +} +EXPORT_SYMBOL(block_write_begin); + +int block_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + unsigned start; + + start = pos & (PAGE_CACHE_SIZE - 1); + + if (unlikely(copied < len)) { + /* + * The buffers that were written will now be uptodate, so we + * don't have to worry about a readpage reading them and + * overwriting a partial write. 
However if we have encountered + * a short write and only partially written into a buffer, it + * will not be marked uptodate, so a readpage might come in and + * destroy our partial write. + * + * Do the simplest thing, and just treat any short write to a + * non uptodate page as a zero-length write, and force the + * caller to redo the whole thing. + */ + if (!PageUptodate(page)) + copied = 0; + + page_zero_new_buffers(page, start+copied, start+len); + } + flush_dcache_page(page); + + /* This could be a short (even 0-length) commit */ + __block_commit_write(inode, page, start, start+copied); + + unlock_page(page); + mark_page_accessed(page); /* XXX: put this in caller? */ + page_cache_release(page); + + /* + * No need to use i_size_read() here, the i_size + * cannot change under us because we hold i_mutex. + */ + if (pos+copied > inode->i_size) { + i_size_write(inode, pos+copied); + mark_inode_dirty(inode); + } + + return copied; +} +EXPORT_SYMBOL(block_write_end); + +/* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and @@ -2009,167 +2131,145 @@ int block_read_full_page(struct page *pa return 0; } -/* utility function for filesystems that need to do work on expanding - * truncates. Uses prepare/commit_write to allow the filesystem to - * deal with the hole. - */ -static int __generic_cont_expand(struct inode *inode, loff_t size, - pgoff_t index, unsigned int offset) +int generic_cont_expand(struct inode *inode, loff_t size, + get_block_t *get_block, loff_t *bytes) { struct address_space *mapping = inode->i_mapping; - struct page *page; + unsigned blocksize = 1 << inode->i_blkbits; unsigned long limit; - int err; + unsigned zerofrom; + pgoff_t index, new_index; + void *kaddr; + struct page *page; + int err = 0; + + if (size < *bytes) + goto out; - err = -EFBIG; limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; if (limit != RLIM_INFINITY && size > (loff_t)limit) { send_sig(SIGXFSZ, current, 0); - goto out; + return -EFBIG; } if (size > inode->i_sb->s_maxbytes) - goto out; + return -EFBIG; - err = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) - goto out; - err = mapping->a_ops->prepare_write(NULL, page, offset, offset); - if (err) { - /* - * ->prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ + new_index = size >> PAGE_CACHE_SHIFT; + index = *bytes >> PAGE_CACHE_SHIFT; + + while (new_index > index) { + page = grab_cache_page(mapping, index); + if (!page) { + err = -ENOMEM; + goto out; + } + /* we might sleep (XXX: but we hold i_mutex?) 
*/ + if (*bytes>>PAGE_CACHE_SHIFT != index) { + unlock_page(page); + page_cache_release(page); + continue; + } + zerofrom = *bytes & ~PAGE_CACHE_MASK; + err = __block_prepare_write(inode, page, zerofrom, + PAGE_CACHE_SIZE, get_block); + if (err) { + unlock_page(page); + page_cache_release(page); + goto out; + } + + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + generic_commit_write(NULL, page, zerofrom, PAGE_CACHE_SIZE); unlock_page(page); page_cache_release(page); - vmtruncate(inode, inode->i_size); - goto out; + + index = *bytes >> PAGE_CACHE_SHIFT; } - err = mapping->a_ops->commit_write(NULL, page, offset, offset); + if (new_index == index) { + unsigned offset = size & ~PAGE_CACHE_MASK; - unlock_page(page); - page_cache_release(page); - if (err > 0) - err = 0; -out: - return err; -} + /* page covers the boundary, find the boundary offset */ + zerofrom = *bytes & ~PAGE_CACHE_MASK; -int generic_cont_expand(struct inode *inode, loff_t size) -{ - pgoff_t index; - unsigned int offset; + /* starting below the boundary? Nothing to zero out */ + if (zerofrom < offset) { + page = grab_cache_page(mapping, index); + if (!page) { + err = -ENOMEM; + goto out; + } - offset = (size & (PAGE_CACHE_SIZE - 1)); /* Within page */ + err = __block_prepare_write(inode, page, zerofrom, + offset, get_block); + if (err) { + unlock_page(page); + page_cache_release(page); + goto out; + } - /* ugh. in prepare/commit_write, if from==to==start of block, we - ** skip the prepare. make sure we never send an offset for the start - ** of a block - */ - if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) { - /* caller must handle this extra byte. */ - offset++; + if (zerofrom & (blocksize-1)) { + *bytes |= (blocksize-1); + (*bytes)++; + } + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr+zerofrom, 0, offset-zerofrom); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + generic_commit_write(NULL, page, zerofrom, offset); + unlock_page(page); + page_cache_release(page); + } } - index = size >> PAGE_CACHE_SHIFT; - - return __generic_cont_expand(inode, size, index, offset); -} -int generic_cont_expand_simple(struct inode *inode, loff_t size) -{ - loff_t pos = size - 1; - pgoff_t index = pos >> PAGE_CACHE_SHIFT; - unsigned int offset = (pos & (PAGE_CACHE_SIZE - 1)) + 1; - - /* prepare/commit_write can handle even if from==to==start of block. */ - return __generic_cont_expand(inode, size, index, offset); +out: + return err; } /* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. 
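
The *bytes |= (blocksize-1); (*bytes)++; pair that appears twice above is a power-of-two round-up: when *bytes sits mid-block, it is advanced to the next block boundary so the partially-written block is zero-filled end to end. A quick stand-alone check (user-space; blocksize is a power of two, as it always is here):

	#include <assert.h>

	int main(void)
	{
		unsigned long bytes = 1536, blocksize = 1024;

		if (bytes & (blocksize - 1)) {		/* mid-block? */
			bytes |= (blocksize - 1);	/* -> 2047 */
			bytes++;			/* -> 2048 */
		}
		assert(bytes == 2048);
		return 0;
	}
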
*/ - -int cont_prepare_write(struct page *page, unsigned offset, - unsigned to, get_block_t *get_block, loff_t *bytes) +int cont_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata, + get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct page *new_page; - pgoff_t pgpos; - long status; - unsigned zerofrom; - unsigned blocksize = 1 << inode->i_blkbits; - void *kaddr; + int err; - while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { - status = -ENOMEM; - new_page = grab_cache_page(mapping, pgpos); - if (!new_page) - goto out; - /* we might sleep */ - if (*bytes>>PAGE_CACHE_SHIFT != pgpos) { - unlock_page(new_page); - page_cache_release(new_page); - continue; - } - zerofrom = *bytes & ~PAGE_CACHE_MASK; - if (zerofrom & (blocksize-1)) { - *bytes |= (blocksize-1); - (*bytes)++; - } - status = __block_prepare_write(inode, new_page, zerofrom, - PAGE_CACHE_SIZE, get_block); - if (status) - goto out_unmap; - kaddr = kmap_atomic(new_page, KM_USER0); - memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom); - flush_dcache_page(new_page); - kunmap_atomic(kaddr, KM_USER0); - generic_commit_write(NULL, new_page, zerofrom, PAGE_CACHE_SIZE); - unlock_page(new_page); - page_cache_release(new_page); + if (*bytes < pos) { + err = generic_cont_expand(inode, pos, get_block, bytes); + if (err) + return err; } - if (page->index < pgpos) { - /* completely inside the area */ - zerofrom = offset; - } else { - /* page covers the boundary, find the boundary offset */ - zerofrom = *bytes & ~PAGE_CACHE_MASK; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + get_block); +} - /* if we will expand the thing last block will be filled */ - if (to > zerofrom && (zerofrom & (blocksize-1))) { - *bytes |= (blocksize-1); - (*bytes)++; - } +int cont_prepare_write(struct page *page, unsigned offset, + unsigned to, get_block_t *get_block, loff_t *bytes) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + loff_t pos = page_offset(page) + offset; + int err; - /* starting below the boundary? 
Nothing to zero out */ - if (offset <= zerofrom) - zerofrom = offset; - } - status = __block_prepare_write(inode, page, zerofrom, to, get_block); - if (status) - goto out1; - if (zerofrom < offset) { - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr+zerofrom, 0, offset-zerofrom); - flush_dcache_page(page); - kunmap_atomic(kaddr, KM_USER0); - __block_commit_write(inode, page, zerofrom, offset); + if (*bytes < pos) { + err = generic_cont_expand(inode, pos, get_block, bytes); + if (err) + return err; } - return 0; -out1: - ClearPageUptodate(page); - return status; -out_unmap: - ClearPageUptodate(new_page); - unlock_page(new_page); - page_cache_release(new_page); -out: - return status; + return __block_prepare_write(inode, page, offset, to, get_block); } int block_prepare_write(struct page *page, unsigned from, unsigned to, @@ -3035,7 +3135,6 @@ EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand); -EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(ll_rw_block); Index: linux-2.6/include/linux/buffer_head.h =================================================================== --- linux-2.6.orig/include/linux/buffer_head.h +++ linux-2.6/include/linux/buffer_head.h @@ -202,11 +202,19 @@ void block_invalidatepage(struct page *p int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); int block_read_full_page(struct page*, get_block_t*); +int block_write_begin(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page **, void **, get_block_t*); +int block_write_end(struct file *, struct address_space *, + loff_t, unsigned, unsigned, + struct page *, void *); +void page_zero_new_buffers(struct page *page, unsigned from, unsigned to); int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); -int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, - loff_t *); -int generic_cont_expand(struct inode *inode, loff_t size); -int generic_cont_expand_simple(struct inode *inode, loff_t size); +int cont_write_begin(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page **, void **, + get_block_t *, loff_t *); +int generic_cont_expand(struct inode *inode, loff_t size, + get_block_t *get_block, loff_t *bytes); int block_commit_write(struct page *page, unsigned from, unsigned to); void block_sync_page(struct page *); sector_t generic_block_bmap(struct address_space *, sector_t, get_block_t *); Index: linux-2.6/fs/libfs.c =================================================================== --- linux-2.6.orig/fs/libfs.c +++ linux-2.6/fs/libfs.c @@ -342,6 +342,26 @@ int simple_prepare_write(struct file *fi return 0; } +int simple_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct page *page; + pgoff_t index; + unsigned from; + + index = pos >> PAGE_CACHE_SHIFT; + from = pos & (PAGE_CACHE_SIZE - 1); + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + + *pagep = page; + + return simple_prepare_write(file, page, from, from+len); +} + int simple_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { @@ -360,6 +380,28 @@ int simple_commit_write(struct file *fil return 0; } +int simple_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct 
page *page, void *fsdata) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + + /* zero the stale part of the page if we did a short copy */ + if (copied < len) { + void *kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + from + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + + simple_commit_write(file, page, from, from+copied); + + unlock_page(page); + page_cache_release(page); + + return copied; +} + int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) { struct inode *inode; @@ -616,6 +658,8 @@ EXPORT_SYMBOL(dcache_dir_open); EXPORT_SYMBOL(dcache_readdir); EXPORT_SYMBOL(generic_read_dir); EXPORT_SYMBOL(get_sb_pseudo); +EXPORT_SYMBOL(simple_write_begin); +EXPORT_SYMBOL(simple_write_end); EXPORT_SYMBOL(simple_commit_write); EXPORT_SYMBOL(simple_dir_inode_operations); EXPORT_SYMBOL(simple_dir_operations); Index: linux-2.6/drivers/block/loop.c =================================================================== --- linux-2.6.orig/drivers/block/loop.c +++ linux-2.6/drivers/block/loop.c @@ -206,11 +206,10 @@ lo_do_transfer(struct loop_device *lo, i * space operations prepare_write and commit_write. */ static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, - int bsize, loff_t pos, struct page *page) + int bsize, loff_t pos, struct page *unused) { struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ struct address_space *mapping = file->f_mapping; - const struct address_space_operations *aops = mapping->a_ops; pgoff_t index; unsigned offset, bv_offs; int len, ret; @@ -222,67 +221,47 @@ static int do_lo_send_aops(struct loop_d len = bvec->bv_len; while (len > 0) { sector_t IV; - unsigned size; + unsigned size, copied; int transfer_result; + struct page *page; + void *fsdata; IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); size = PAGE_CACHE_SIZE - offset; if (size > len) size = len; - page = grab_cache_page(mapping, index); - if (unlikely(!page)) + + ret = pagecache_write_begin(file, mapping, pos, size, 0, + &page, &fsdata); + if (ret) goto fail; - ret = aops->prepare_write(file, page, offset, - offset + size); - if (unlikely(ret)) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } - goto unlock; - } + transfer_result = lo_do_transfer(lo, WRITE, page, offset, bvec->bv_page, bv_offs, size, IV); - if (unlikely(transfer_result)) { - char *kaddr; + copied = size; + if (unlikely(transfer_result)) + copied = 0; + + ret = pagecache_write_end(file, mapping, pos, size, copied, + page, fsdata); + if (ret < 0) + goto fail; + if (ret < copied) + copied = ret; - /* - * The transfer failed, but we still write the data to - * keep prepare/commit calls balanced. 
- */ - printk(KERN_ERR "loop: transfer error block %llu\n", - (unsigned long long)index); - kaddr = kmap_atomic(page, KM_USER0); - memset(kaddr + offset, 0, size); - kunmap_atomic(kaddr, KM_USER0); - } - flush_dcache_page(page); - ret = aops->commit_write(file, page, offset, - offset + size); - if (unlikely(ret)) { - if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - continue; - } - goto unlock; - } if (unlikely(transfer_result)) - goto unlock; - bv_offs += size; - len -= size; + goto fail; + + bv_offs += copied; + len -= copied; offset = 0; index++; - pos += size; - unlock_page(page); - page_cache_release(page); + pos += copied; } ret = 0; out: mutex_unlock(&mapping->host->i_mutex); return ret; -unlock: - unlock_page(page); - page_cache_release(page); fail: ret = -1; goto out; Index: linux-2.6/fs/namei.c =================================================================== --- linux-2.6.orig/fs/namei.c +++ linux-2.6/fs/namei.c @@ -2688,53 +2688,30 @@ int __page_symlink(struct inode *inode, { struct address_space *mapping = inode->i_mapping; struct page *page; + void *fsdata; int err; char *kaddr; retry: - err = -ENOMEM; - page = find_or_create_page(mapping, 0, gfp_mask); - if (!page) - goto fail; - err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); - if (err == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry; - } + err = pagecache_write_begin(NULL, mapping, 0, PAGE_CACHE_SIZE, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); if (err) - goto fail_map; + goto fail; + kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len-1); + memset(kaddr+len-1, 0, PAGE_CACHE_SIZE-(len-1)); kunmap_atomic(kaddr, KM_USER0); - err = mapping->a_ops->commit_write(NULL, page, 0, len-1); - if (err == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto retry; - } - if (err) - goto fail_map; - /* - * Notice that we are _not_ going to block here - end of page is - * unmapped, so this will only try to map the rest of page, see - * that it is unmapped (typically even will not look into inode - - * ->i_size will be enough for everything) and zero it out. - * OTOH it's obviously correct and should make the page up-to-date. - */ - if (!PageUptodate(page)) { - err = mapping->a_ops->readpage(NULL, page); - if (err != AOP_TRUNCATED_PAGE) - wait_on_page_locked(page); - } else { - unlock_page(page); - } - page_cache_release(page); + + err = pagecache_write_end(NULL, mapping, 0, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE, + page, fsdata); if (err < 0) goto fail; + if (err < PAGE_CACHE_SIZE) + goto retry; + mark_inode_dirty(inode); return 0; -fail_map: - unlock_page(page); - page_cache_release(page); fail: return err; } Index: linux-2.6/fs/splice.c =================================================================== --- linux-2.6.orig/fs/splice.c +++ linux-2.6/fs/splice.c @@ -559,7 +559,7 @@ static int pipe_to_file(struct pipe_inod struct address_space *mapping = file->f_mapping; unsigned int offset, this_len; struct page *page; - pgoff_t index; + void *fsdata; int ret; /* @@ -569,13 +569,13 @@ static int pipe_to_file(struct pipe_inod if (unlikely(ret)) return ret; - index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; this_len = sd->len; if (this_len + offset > PAGE_CACHE_SIZE) this_len = PAGE_CACHE_SIZE - offset; +#if 0 /* * Reuse buf page, if SPLICE_F_MOVE is set and we are doing a full * page. @@ -587,86 +587,12 @@ static int pipe_to_file(struct pipe_inod * locked on successful return. 
*/ if (buf->ops->steal(pipe, buf)) - goto find_page; +#endif - page = buf->page; - if (add_to_page_cache(page, mapping, index, GFP_KERNEL)) { - unlock_page(page); - goto find_page; - } - - page_cache_get(page); - - if (!(buf->flags & PIPE_BUF_FLAG_LRU)) - lru_cache_add(page); - } else { -find_page: - page = find_lock_page(mapping, index); - if (!page) { - ret = -ENOMEM; - page = page_cache_alloc_cold(mapping); - if (unlikely(!page)) - goto out_ret; - - /* - * This will also lock the page - */ - ret = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); - if (unlikely(ret)) - goto out; - } - - /* - * We get here with the page locked. If the page is also - * uptodate, we don't need to do more. If it isn't, we - * may need to bring it in if we are not going to overwrite - * the full page. - */ - if (!PageUptodate(page)) { - if (this_len < PAGE_CACHE_SIZE) { - ret = mapping->a_ops->readpage(file, page); - if (unlikely(ret)) - goto out; - - lock_page(page); - - if (!PageUptodate(page)) { - /* - * Page got invalidated, repeat. - */ - if (!page->mapping) { - unlock_page(page); - page_cache_release(page); - goto find_page; - } - ret = -EIO; - goto out; - } - } else - SetPageUptodate(page); - } - } - - ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); - if (unlikely(ret)) { - loff_t isize = i_size_read(mapping->host); - - if (ret != AOP_TRUNCATED_PAGE) - unlock_page(page); - page_cache_release(page); - if (ret == AOP_TRUNCATED_PAGE) - goto find_page; - - /* - * prepare_write() may have instantiated a few blocks - * outside i_size. Trim these off again. - */ - if (sd->pos + this_len > isize) - vmtruncate(mapping->host, isize); - - goto out_ret; - } + ret = pagecache_write_begin(file, mapping, sd->pos, sd->len, + AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + if (unlikely(ret)) + goto out; if (buf->page != page) { /* @@ -676,28 +602,13 @@ find_page: char *dst = kmap_atomic(page, KM_USER1); memcpy(dst + offset, src + buf->offset, this_len); - flush_dcache_page(page); kunmap_atomic(dst, KM_USER1); buf->ops->unmap(pipe, buf, src); } - ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); - if (!ret) { - /* - * Return the number of bytes written and mark page as - * accessed, we are now done! 
- */ - ret = this_len; - mark_page_accessed(page); - balance_dirty_pages_ratelimited(mapping); - } else if (ret == AOP_TRUNCATED_PAGE) { - page_cache_release(page); - goto find_page; - } + ret = pagecache_write_end(file, mapping, sd->pos, sd->len, sd->len, page, fsdata); + out: - page_cache_release(page); - unlock_page(page); -out_ret: return ret; } Index: linux-2.6/Documentation/filesystems/Locking =================================================================== --- linux-2.6.orig/Documentation/filesystems/Locking +++ linux-2.6/Documentation/filesystems/Locking @@ -176,15 +176,18 @@ prototypes: locking rules: All except set_page_dirty may block - BKL PageLocked(page) + BKL PageLocked(page) i_sem writepage: no yes, unlocks (see below) readpage: no yes, unlocks sync_page: no maybe writepages: no set_page_dirty no no readpages: no -prepare_write: no yes -commit_write: no yes +prepare_write: no yes yes +commit_write: no yes yes +write_begin: no locks the page yes +write_end: no yes, unlocks yes +perform_write: no n/a yes bmap: yes invalidatepage: no yes releasepage: no yes Index: linux-2.6/Documentation/filesystems/vfs.txt =================================================================== --- linux-2.6.orig/Documentation/filesystems/vfs.txt +++ linux-2.6/Documentation/filesystems/vfs.txt @@ -534,6 +534,14 @@ struct address_space_operations { struct list_head *pages, unsigned nr_pages); int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); + int (*perform_write)(struct file *, struct address_space *, + struct iov_iter *, loff_t); sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -611,11 +619,7 @@ struct address_space_operations { any basic-blocks on storage, then those blocks should be pre-read (if they haven't been read already) so that the updated blocks can be written out properly. - The page will be locked. If prepare_write wants to unlock the - page it, like readpage, may do so and return - AOP_TRUNCATED_PAGE. - In this case the prepare_write will be retried one the lock is - regained. + The page will be locked. Note: the page _must not_ be marked uptodate in this function (or anywhere else) unless it actually is uptodate right now. As @@ -629,6 +633,34 @@ struct address_space_operations { operations. It should avoid returning an error if possible - errors should have been handled by prepare_write. + write_begin: This is intended as a replacement for prepare_write. Called + by the generic buffered write code to ask the filesystem to prepare + to write len bytes at the given offset in the file. flags is a field + for AOP_FLAG_xxx flags, described in include/linux/mm.h. + + The filesystem must return the locked pagecache page for the caller + to write into. + + A void * may be returned in fsdata, which then gets passed into + write_end. + + Returns < 0 on failure, in which case all cleanup must be done and + write_end not called. 0 on success, in which case write_end must + be called. + + write_end: After a successful write_begin, and data copy, write_end must + be called. 
len is the original len passed to write_begin, and copied + is the amount that was able to be copied (they must be equal if + write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). + + The filesystem must take care of unlocking the page and dropping its + refcount, and updating i_size. + + Returns < 0 on failure, otherwise the number of bytes (<= 'copied') + that were able to be copied into the file. + + perform_write: single-call, bulk variant of write_begin/write_end taking an iov_iter (optional). + bmap: called by the VFS to map a logical block offset within object to physical block number. This method is used by the FIBMAP ioctl and for working with swap-files. To be able to swap to Index: linux-2.6/mm/shmem.c =================================================================== --- linux-2.6.orig/mm/shmem.c +++ linux-2.6/mm/shmem.c @@ -1419,10 +1419,14 @@ static const struct inode_operations shm * lets a tmpfs file be used read-write below the loop driver. */ static int -shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) -{ - struct inode *inode = page->mapping->host; - return shmem_getpage(inode, page->index, &page, SGP_WRITE, NULL); +shmem_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + struct inode *inode = mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + *pagep = NULL; + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } static ssize_t @@ -2319,8 +2323,8 @@ static const struct address_space_operat .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS - .prepare_write = shmem_prepare_write, - .commit_write = simple_commit_write, + .write_begin = shmem_write_begin, + .write_end = simple_write_end, #endif .migratepage = migrate_page, }; Index: linux-2.6/fs/configfs/inode.c =================================================================== --- linux-2.6.orig/fs/configfs/inode.c +++ linux-2.6/fs/configfs/inode.c @@ -40,8 +40,8 @@ extern struct super_block * configfs_sb; static const struct address_space_operations configfs_aops = { .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write + .write_begin = simple_write_begin, + .write_end = simple_write_end, }; static struct backing_dev_info configfs_backing_dev_info = { Index: linux-2.6/fs/sysfs/inode.c =================================================================== --- linux-2.6.orig/fs/sysfs/inode.c +++ linux-2.6/fs/sysfs/inode.c @@ -20,8 +20,8 @@ extern struct super_block * sysfs_sb; static const struct address_space_operations sysfs_aops = { .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write + .write_begin = simple_write_begin, + .write_end = simple_write_end, }; static struct backing_dev_info sysfs_backing_dev_info = { Index: linux-2.6/fs/ramfs/file-mmu.c =================================================================== --- linux-2.6.orig/fs/ramfs/file-mmu.c +++ linux-2.6/fs/ramfs/file-mmu.c @@ -29,8 +29,8 @@ const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write, + .write_begin = simple_write_begin, + .write_end = simple_write_end, .set_page_dirty = __set_page_dirty_no_writeback, }; Index: linux-2.6/fs/ramfs/file-nommu.c =================================================================== --- linux-2.6.orig/fs/ramfs/file-nommu.c +++ linux-2.6/fs/ramfs/file-nommu.c @@ -30,8 +30,8 @@ static int
ramfs_nommu_setattr(struct de const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, - .prepare_write = simple_prepare_write, - .commit_write = simple_commit_write, + .write_begin = simple_write_begin, + .write_end = simple_write_end, .set_page_dirty = __set_page_dirty_no_writeback, }; Index: linux-2.6/fs/hugetlbfs/inode.c =================================================================== --- linux-2.6.orig/fs/hugetlbfs/inode.c +++ linux-2.6/fs/hugetlbfs/inode.c @@ -162,15 +162,19 @@ static int hugetlbfs_readpage(struct fil return -EINVAL; } -static int hugetlbfs_prepare_write(struct file *file, - struct page *page, unsigned offset, unsigned to) +static int hugetlbfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { return -EINVAL; } -static int hugetlbfs_commit_write(struct file *file, - struct page *page, unsigned offset, unsigned to) +static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { + BUG(); return -EINVAL; } @@ -542,8 +546,8 @@ static void hugetlbfs_destroy_inode(stru static const struct address_space_operations hugetlbfs_aops = { .readpage = hugetlbfs_readpage, - .prepare_write = hugetlbfs_prepare_write, - .commit_write = hugetlbfs_commit_write, + .write_begin = hugetlbfs_write_begin, + .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, }; Index: linux-2.6/fs/fat/file.c =================================================================== --- linux-2.6.orig/fs/fat/file.c +++ linux-2.6/fs/fat/file.c @@ -137,24 +137,6 @@ const struct file_operations fat_file_op .sendfile = generic_file_sendfile, }; -static int fat_cont_expand(struct inode *inode, loff_t size) -{ - struct address_space *mapping = inode->i_mapping; - loff_t start = inode->i_size, count = size - inode->i_size; - int err; - - err = generic_cont_expand_simple(inode, size); - if (err) - goto out; - - inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); -out: - return err; -} - int fat_notify_change(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); Index: linux-2.6/fs/fat/inode.c =================================================================== --- linux-2.6.orig/fs/fat/inode.c +++ linux-2.6/fs/fat/inode.c @@ -139,18 +139,42 @@ static int fat_readpages(struct file *fi return mpage_readpages(mapping, pages, nr_pages, fat_get_block); } -static int fat_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +int fat_cont_expand(struct inode *inode, loff_t size) { - return cont_prepare_write(page, from, to, fat_get_block, - &MSDOS_I(page->mapping->host)->mmu_private); + struct address_space *mapping = inode->i_mapping; + loff_t start = inode->i_size, count = size - inode->i_size; + int err; + + err = generic_cont_expand(inode, size, fat_get_block, + &MSDOS_I(inode)->mmu_private); + if (err) + goto out; + + inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + if (IS_SYNC(inode)) + err = sync_page_range_nolock(inode, mapping, start, count); +out: + return err; +} + +static int fat_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return 
cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + fat_get_block, + &MSDOS_I(mapping->host)->mmu_private); } -static int fat_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int fat_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *pagep, void *fsdata) { - struct inode *inode = page->mapping->host; - int err = generic_commit_write(file, page, from, to); + struct inode *inode = mapping->host; + int err; + err = block_write_end(file, mapping, pos, len, copied, pagep, fsdata); if (!err && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; MSDOS_I(inode)->i_attrs |= ATTR_ARCH; @@ -200,8 +224,8 @@ static const struct address_space_operat .writepage = fat_writepage, .writepages = fat_writepages, .sync_page = block_sync_page, - .prepare_write = fat_prepare_write, - .commit_write = fat_commit_write, + .write_begin = fat_write_begin, + .write_end = fat_write_end, .direct_IO = fat_direct_IO, .bmap = _fat_bmap }; Index: linux-2.6/include/linux/msdos_fs.h =================================================================== --- linux-2.6.orig/include/linux/msdos_fs.h +++ linux-2.6/include/linux/msdos_fs.h @@ -406,6 +406,7 @@ extern int fat_getattr(struct vfsmount * struct kstat *stat); /* fat/inode.c */ +extern int fat_cont_expand(struct inode *inode, loff_t size); extern void fat_attach(struct inode *inode, loff_t i_pos); extern void fat_detach(struct inode *inode); extern struct inode *fat_iget(struct super_block *sb, loff_t i_pos); Index: linux-2.6/fs/reiserfs/file.c =================================================================== --- linux-2.6.orig/fs/reiserfs/file.c +++ linux-2.6/fs/reiserfs/file.c @@ -947,16 +947,18 @@ static int reiserfs_check_for_tail_and_c ih = get_ih(&path); res = 0; if (is_direct_le_ih(ih)) { + struct buffer_head tmp_bh; + /* Ok, closest item is file tail (tails are stored in "direct" - * items), so we need to unpack it. */ - /* To not overcomplicate matters, we just call generic_cont_expand - which will in turn call other stuff and finally will boil down to - reiserfs_get_block() that would do necessary conversion. */ + * items), so we need to unpack it. reiserfs_get_block will + * do that for us. 
*/ cont_expand_offset = le_key_k_offset(get_inode_item_key_version(inode), &(ih->ih_key)); pathrelse(&path); - res = generic_cont_expand(inode, cont_expand_offset); + res = reiserfs_get_block(inode, + cont_expand_offset / inode->i_sb->s_blocksize, + &tmp_bh, 1); } else pathrelse(&path); Index: linux-2.6/fs/ext2/inode.c =================================================================== --- linux-2.6.orig/fs/ext2/inode.c +++ linux-2.6/fs/ext2/inode.c @@ -643,6 +643,16 @@ ext2_readpages(struct file *file, struct } static int +ext2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext2_get_block); +} + +static int ext2_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { @@ -689,6 +699,8 @@ const struct address_space_operations ex .readpages = ext2_readpages, .writepage = ext2_writepage, .sync_page = block_sync_page, + .write_begin = ext2_write_begin, + .write_end = block_write_end, .prepare_write = ext2_prepare_write, .commit_write = generic_commit_write, .bmap = ext2_bmap, Index: linux-2.6/fs/ext3/inode.c =================================================================== --- linux-2.6.orig/fs/ext3/inode.c +++ linux-2.6/fs/ext3/inode.c @@ -1155,23 +1155,28 @@ static int do_journal_get_write_access(h * This content is expected to be set to zeroes by block_prepare_write(). * 2006/10/14 SAW */ -static int ext3_prepare_failure(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_write_failure(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page *page, void *fsdata) { - struct address_space *mapping; struct buffer_head *bh, *head, *next; unsigned block_start, block_end; unsigned blocksize; + unsigned from, to; int ret; handle_t *handle = ext3_journal_current_handle(); - mapping = page->mapping; if (ext3_should_writeback_data(mapping->host)) { /* optimization: no constraints about data */ skip: + unlock_page(page); + page_cache_release(page); return ext3_journal_stop(handle); } + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + head = page_buffers(page); blocksize = head->b_size; for ( bh = head, block_start = 0; @@ -1191,10 +1196,8 @@ skip: break; if (ext3_should_journal_data(mapping->host)) { ret = do_journal_get_write_access(handle, bh); - if (ret) { - ext3_journal_stop(handle); - return ret; - } + if (ret) + goto skip; } /* * block_start here becomes the first block where the current iteration @@ -1205,32 +1208,44 @@ skip: goto skip; /* commit allocated and zeroed buffers */ - return mapping->a_ops->commit_write(file, page, from, block_start); + return mapping->a_ops->write_end(file, mapping, pos, len, + block_start - from, page, fsdata); } -static int ext3_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct inode *inode = page->mapping->host; - int ret, ret2; + struct inode *inode = mapping->host; int needed_blocks = ext3_writepage_trans_blocks(inode); + int ret, ret2; handle_t *handle; int retries = 0; + struct page *page; + pgoff_t index; + unsigned start, end; + + index = pos >> PAGE_CACHE_SHIFT; + start = pos & (PAGE_CACHE_SIZE - 1); + end = start + len; retry: + page = __grab_cache_page(mapping,
index); + if (!page) + return -ENOMEM; + *pagep = page; + handle = ext3_journal_start(inode, needed_blocks); if (IS_ERR(handle)) return PTR_ERR(handle); - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_prepare_write(page, from, to, ext3_get_block); - else - ret = block_prepare_write(page, from, to, ext3_get_block); + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ext3_get_block); if (ret) goto failure; if (ext3_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), - from, to, NULL, do_journal_get_write_access); + start, end, NULL, do_journal_get_write_access); if (ret) /* fatal error, just put the handle and return */ journal_stop(handle); @@ -1238,7 +1253,10 @@ retry: return ret; failure: - ret2 = ext3_prepare_failure(file, page, from, to); + ret2 = ext3_write_failure(file, mapping, pos, len, page, *fsdata); + /* trim off blocks (XXX: need better helpers than vmtruncate) */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); if (ret2 < 0) return ret2; if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) @@ -1247,17 +1265,18 @@ failure: return ret; } + int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = journal_dirty_data(handle, bh); if (err) ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, - bh, handle,err); + bh, handle, err); return err; } -/* For commit_write() in data=journal mode */ -static int commit_write_fn(handle_t *handle, struct buffer_head *bh) +/* For write_end() in data=journal mode */ +static int write_end_fn(handle_t *handle, struct buffer_head *bh) { if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; @@ -1272,78 +1291,103 @@ static int commit_write_fn(handle_t *han * ext3 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ -static int ext3_ordered_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_ordered_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = file->f_mapping->host; + unsigned from, to; int ret = 0, ret2; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ext3_journal_dirty_data); if (ret == 0) { /* - * generic_commit_write() will run mark_inode_dirty() if i_size + * block_write_end() will run mark_inode_dirty() if i_size * changes. So let's piggyback the i_disksize mark_inode_dirty * into that. */ loff_t new_i_size; - new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + new_i_size = pos + copied; if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - ret = generic_commit_write(file, page, from, to); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (copied < 0) + ret = copied; } ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; - return ret; + return ret ? 
ret : copied; } -static int ext3_writeback_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ext3_writeback_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = file->f_mapping->host; int ret = 0, ret2; loff_t new_i_size; - new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + new_i_size = pos + copied; if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_commit_write(file, page, from, to); - else - ret = generic_commit_write(file, page, from, to); + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (copied < 0) + ret = copied; ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; - return ret; + return ret ? ret : copied; } -static int ext3_journalled_commit_write(struct file *file, - struct page *page, unsigned from, unsigned to) +static int ext3_journalled_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; int ret = 0, ret2; int partial = 0; - loff_t pos; + unsigned from, to; - /* - * Here we duplicate the generic_commit_write() functionality - */ - pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; + + if (copied < len) { + if (PageUptodate(page)) + copied = len; + else { + /* XXX: don't need to zero new buffers because we abort? */ + copied = 0; + if (!is_handle_aborted(handle)) + journal_abort_handle(handle); + unlock_page(page); + page_cache_release(page); + goto out; + } + } ret = walk_page_buffers(handle, page_buffers(page), from, - to, &partial, commit_write_fn); + to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos > inode->i_size) - i_size_write(inode, pos); + unlock_page(page); + page_cache_release(page); + if (pos+copied > inode->i_size) + i_size_write(inode, pos+copied); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { EXT3_I(inode)->i_disksize = inode->i_size; @@ -1351,10 +1395,12 @@ static int ext3_journalled_commit_write( if (!ret) ret = ret2; } + +out: ret2 = ext3_journal_stop(handle); if (!ret) ret = ret2; - return ret; + return ret ? 
ret : copied; } /* @@ -1612,7 +1658,7 @@ static int ext3_journalled_writepage(str PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); err = walk_page_buffers(handle, page_buffers(page), 0, - PAGE_CACHE_SIZE, NULL, commit_write_fn); + PAGE_CACHE_SIZE, NULL, write_end_fn); if (ret == 0) ret = err; EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; @@ -1772,8 +1818,8 @@ static const struct address_space_operat .readpages = ext3_readpages, .writepage = ext3_ordered_writepage, .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_ordered_commit_write, + .write_begin = ext3_write_begin, + .write_end = ext3_ordered_write_end, .bmap = ext3_bmap, .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, @@ -1786,8 +1832,8 @@ static const struct address_space_operat .readpages = ext3_readpages, .writepage = ext3_writeback_writepage, .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_writeback_commit_write, + .write_begin = ext3_write_begin, + .write_end = ext3_writeback_write_end, .bmap = ext3_bmap, .invalidatepage = ext3_invalidatepage, .releasepage = ext3_releasepage, @@ -1800,8 +1846,8 @@ static const struct address_space_operat .readpages = ext3_readpages, .writepage = ext3_journalled_writepage, .sync_page = block_sync_page, - .prepare_write = ext3_prepare_write, - .commit_write = ext3_journalled_commit_write, + .write_begin = ext3_write_begin, + .write_end = ext3_journalled_write_end, .set_page_dirty = ext3_journalled_set_page_dirty, .bmap = ext3_bmap, .invalidatepage = ext3_invalidatepage, Index: linux-2.6/fs/ocfs2/aops.c =================================================================== --- linux-2.6.orig/fs/ocfs2/aops.c +++ linux-2.6/fs/ocfs2/aops.c @@ -293,29 +293,67 @@ int ocfs2_prepare_write_nolock(struct in } /* - * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called - * from loopback. It must be able to perform its own locking around - * ocfs2_get_block(). + * ocfs2_write_begin() can be an outer-most ocfs2 call when it is + * called from elsewhere in the kernel. It must be able to perform its + * own locking around ocfs2_get_block(). */ -static int ocfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; + struct buffer_head *di_bh = NULL; + struct page *page = NULL; int ret; - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); - - ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); + ret = ocfs2_meta_lock(inode, &di_bh, 1); if (ret != 0) { mlog_errno(ret); + return ret; + } + + ret = ocfs2_data_lock(inode, 1); + if (ret) { + ocfs2_meta_unlock(inode, 1); + + mlog_errno(ret); + return ret; + } + + /* + * Lock the page out here to preserve ordering with + * ip_alloc_sem. 
+ */ + page = __grab_cache_page(mapping, pos >> PAGE_CACHE_SHIFT); + if (!page) { + ret = -ENOMEM; + mlog_errno(ret); goto out; } - ret = ocfs2_prepare_write_nolock(inode, page, from, to); + *pagep = page; - ocfs2_meta_unlock(inode, 0); + down_read(&OCFS2_I(inode)->ip_alloc_sem); + ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ocfs2_get_block); + up_read(&OCFS2_I(inode)->ip_alloc_sem); out: - mlog_exit(ret); + if (ret == 0) { + *fsdata = di_bh; + } else { + /* + * Error return - the caller won't call + * ocfs2_write_end, so drop cluster locks here. + */ + brelse(di_bh); + if (page) { + unlock_page(page); + page_cache_release(page); + } + ocfs2_data_unlock(inode, 1); + ocfs2_meta_unlock(inode, 1); + } + return ret; } @@ -388,16 +426,18 @@ out: return handle; } -static int ocfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ocfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int ret; - struct buffer_head *di_bh = NULL; + unsigned from, to; + struct buffer_head *di_bh = fsdata; struct inode *inode = page->mapping->host; handle_t *handle = NULL; struct ocfs2_dinode *di; - mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); + mlog_entry("(0x%p, 0x%p)\n", file, page); /* NOTE: ocfs2_file_aio_write has ensured that it's safe for * us to continue here without rechecking the I/O against @@ -412,22 +452,13 @@ static int ocfs2_commit_write(struct fil * stale inode allocation image (i_size, i_clusters, etc). */ - ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_data_lock_with_page(inode, 1, page); - if (ret != 0) { - mlog_errno(ret); - goto out_unlock_meta; - } + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; handle = ocfs2_start_walk_page_trans(inode, page, from, to); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_unlock_data; + goto out_unlock; } /* Mark our buffer early. We'd rather catch this error up here @@ -441,8 +472,10 @@ static int ocfs2_commit_write(struct fil } /* might update i_size */ - ret = generic_commit_write(file, page, from, to); - if (ret < 0) { + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); + if (copied < 0) { + ret = copied; + copied = 0; mlog_errno(ret); goto out_commit; } @@ -458,23 +491,30 @@ static int ocfs2_commit_write(struct fil di->i_size = cpu_to_le64((u64)i_size_read(inode)); ret = ocfs2_journal_dirty(handle, di_bh); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto out_commit; - } + ret = 0; out_commit: ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); -out_unlock_data: +out_unlock: ocfs2_data_unlock(inode, 1); -out_unlock_meta: ocfs2_meta_unlock(inode, 1); -out: + + if (ret) { + /* + * We caught an error before block_write_end() - + * unlock and free the page. + */ + unlock_page(page); + page_cache_release(page); + } + if (di_bh) brelse(di_bh); mlog_exit(ret); - return ret; + return copied ? 
copied : ret; } static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) @@ -678,8 +718,8 @@ out: const struct address_space_operations ocfs2_aops = { .readpage = ocfs2_readpage, .writepage = ocfs2_writepage, - .prepare_write = ocfs2_prepare_write, - .commit_write = ocfs2_commit_write, + .write_begin = ocfs2_write_begin, + .write_end = ocfs2_write_end, .bmap = ocfs2_bmap, .sync_page = block_sync_page, .direct_IO = ocfs2_direct_IO, Index: linux-2.6/fs/gfs2/ops_address.c =================================================================== --- linux-2.6.orig/fs/gfs2/ops_address.c +++ linux-2.6/fs/gfs2/ops_address.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -337,45 +338,49 @@ out_unlock: } /** - * gfs2_prepare_write - Prepare to write a page to a file + * gfs2_write_begin - Begin to write to a file * @file: The file to write to - * @page: The page which is to be prepared for writing - * @from: From (byte range within page) - * @to: To (byte range within page) + * @mapping: The mapping in which to write + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused by GFS2) * * Returns: errno */ -static int gfs2_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int gfs2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - struct gfs2_inode *ip = GFS2_I(page->mapping->host); - struct gfs2_sbd *sdp = GFS2_SB(page->mapping->host); + struct gfs2_inode *ip = GFS2_I(mapping->host); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); unsigned int data_blocks, ind_blocks, rblocks; int alloc_required; int error = 0; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + from; - loff_t end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct gfs2_alloc *al; - unsigned int write_len = to - from; - + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct page *page; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME|LM_FLAG_TRY_1CB, &ip->i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); error = gfs2_glock_nq_atime(&ip->i_gh); - if (unlikely(error)) { - if (error == GLR_TRYFAILED) { - unlock_page(page); - error = AOP_TRUNCATED_PAGE; - yield(); - } + if (unlikely(error)) goto out_uninit; - } - gfs2_write_calc_reserv(ip, write_len, &data_blocks, &ind_blocks); + error = -ENOMEM; + page = __grab_cache_page(mapping, index); + *pagep = page; + if (!page) + goto out_unlock; + + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); - error = gfs2_write_alloc_required(ip, pos, write_len, &alloc_required); + error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) - goto out_unlock; + goto out_putpage; ip->i_alloc.al_requested = 0; @@ -407,7 +412,7 @@ static int gfs2_prepare_write(struct fil goto out; if (gfs2_is_stuffed(ip)) { - if (end > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { + if (pos + len > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { error = gfs2_unstuff_dinode(ip, page); if (error == 0) goto prepare_write; @@ -429,6 +434,10 @@ out_qunlock: out_alloc_put: gfs2_alloc_put(ip); } +out_putpage: + page_cache_release(page); + if (pos + len > ip->i_inode.i_size) + vmtruncate(&ip->i_inode, ip->i_inode.i_size); out_unlock: gfs2_glock_dq_m(1, &ip->i_gh); 
out_uninit: @@ -439,92 +448,128 @@ out_uninit: } /** - * gfs2_commit_write - Commit write to a file + * gfs2_stuffed_write_end - Write end for stuffed files + * @inode: The inode + * @dibh: The buffer_head containing the on-disk inode + * @pos: The file position + * @len: The length of the write + * @copied: How much was actually copied by the VFS + * @page: The page + * + * This copies the data from the page into the inode block after + * the inode data structure itself. + * + * Returns: errno + */ +static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + u64 to = pos + copied; + void *kaddr; + unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); + struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; + + BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); + kaddr = kmap_atomic(page, KM_USER0); + memcpy(buf + pos, kaddr + pos, copied); + memset(kaddr + pos + copied, 0, len - copied); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + + if (!PageUptodate(page)) + SetPageUptodate(page); + unlock_page(page); + mark_page_accessed(page); + page_cache_release(page); + + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); + mark_inode_dirty(inode); + } + + brelse(dibh); + gfs2_trans_end(sdp); + gfs2_glock_dq(&ip->i_gh); + gfs2_holder_uninit(&ip->i_gh); + return copied; +} + +/** + * gfs2_write_end * @file: The file to write to - * @page: The page containing the data - * @from: From (byte range within page) - * @to: To (byte range within page) + * @mapping: The address space to write to + * @pos: The file position + * @len: The length of the data + * @copied: + * @page: The page that has been written + * @fsdata: The fsdata (unused in GFS2) + * + * The main write_end function for GFS2. We have a separate one for + * stuffed files as they are slightly different, otherwise we just + * put our locking around the VFS provided functions. 
* * Returns: errno */ -static int gfs2_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int gfs2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); - int error = -EOPNOTSUPP; struct buffer_head *dibh; struct gfs2_alloc *al = &ip->i_alloc; struct gfs2_dinode *di; + unsigned int from = pos & (PAGE_CACHE_SIZE - 1); + unsigned int to = from + len; + int ret; - if (gfs2_assert_withdraw(sdp, gfs2_glock_is_locked_by_me(ip->i_gl))) - goto fail_nounlock; + BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_endtrans; + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (unlikely(ret)) { + unlock_page(page); + page_cache_release(page); + goto failed; + } gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - if (gfs2_is_stuffed(ip)) { - u64 file_size; - void *kaddr; + if (gfs2_is_stuffed(ip)) + return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); - file_size = ((u64)page->index << PAGE_CACHE_SHIFT) + to; + if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || gfs2_is_jdata(ip)) + gfs2_page_add_databufs(ip, page, from, to); - kaddr = kmap_atomic(page, KM_USER0); - memcpy(dibh->b_data + sizeof(struct gfs2_dinode) + from, - kaddr + from, to - from); - kunmap_atomic(kaddr, KM_USER0); + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - SetPageUptodate(page); - - if (inode->i_size < file_size) { - i_size_write(inode, file_size); + if (likely(ret >= 0)) { + copied = ret; + if ((pos + copied) > inode->i_size) { + di = (struct gfs2_dinode *)dibh->b_data; + ip->i_di.di_size = inode->i_size; + di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } - } else { - if (sdp->sd_args.ar_data == GFS2_DATA_ORDERED || - gfs2_is_jdata(ip)) - gfs2_page_add_databufs(ip, page, from, to); - error = generic_commit_write(file, page, from, to); - if (error) - goto fail; - } - - if (ip->i_di.di_size < inode->i_size) { - ip->i_di.di_size = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); } brelse(dibh); gfs2_trans_end(sdp); +failed: if (al->al_requested) { gfs2_inplace_release(ip); gfs2_quota_unlock(ip); gfs2_alloc_put(ip); } - gfs2_glock_dq_m(1, &ip->i_gh); + gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); - return 0; - -fail: - brelse(dibh); -fail_endtrans: - gfs2_trans_end(sdp); - if (al->al_requested) { - gfs2_inplace_release(ip); - gfs2_quota_unlock(ip); - gfs2_alloc_put(ip); - } - gfs2_glock_dq_m(1, &ip->i_gh); - gfs2_holder_uninit(&ip->i_gh); -fail_nounlock: - ClearPageUptodate(page); - return error; + return ret; } /** @@ -793,8 +838,8 @@ const struct address_space_operations gf .readpage = gfs2_readpage, .readpages = gfs2_readpages, .sync_page = block_sync_page, - .prepare_write = gfs2_prepare_write, - .commit_write = gfs2_commit_write, + .write_begin = gfs2_write_begin, + .write_end = gfs2_write_end, .bmap = gfs2_bmap, .invalidatepage = gfs2_invalidatepage, .releasepage = gfs2_releasepage, Index: linux-2.6/fs/ecryptfs/mmap.c =================================================================== --- linux-2.6.orig/fs/ecryptfs/mmap.c +++ linux-2.6/fs/ecryptfs/mmap.c @@ -36,34 +36,6 @@ struct kmem_cache *ecryptfs_lower_page_cache; -/** - * ecryptfs_get1page - * - * Get one page from cache or lower f/s, return error 
otherwise. - * - * Returns unlocked and up-to-date page (if ok), with increased - * refcnt. - */ -static struct page *ecryptfs_get1page(struct file *file, int index) -{ - struct page *page; - struct dentry *dentry; - struct inode *inode; - struct address_space *mapping; - - dentry = file->f_path.dentry; - inode = dentry->d_inode; - mapping = inode->i_mapping; - page = read_cache_page(mapping, index, - (filler_t *)mapping->a_ops->readpage, - (void *)file); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); -out: - return page; -} - static int write_zeros(struct file *file, pgoff_t index, int start, int num_zeros); @@ -369,17 +341,14 @@ out: /** * Called with lower inode mutex held. */ -static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) +static int fill_zeros_to_end_of_page(struct page *page, loff_t new_isize) { - struct inode *inode = page->mapping->host; int end_byte_in_page; char *page_virt; - if ((i_size_read(inode) / PAGE_CACHE_SIZE) != page->index) + if ((new_isize >> PAGE_CACHE_SHIFT) != page->index) goto out; - end_byte_in_page = i_size_read(inode) % PAGE_CACHE_SIZE; - if (to > end_byte_in_page) - end_byte_in_page = to; + end_byte_in_page = new_isize % PAGE_CACHE_SIZE; page_virt = kmap_atomic(page, KM_USER0); memset((page_virt + end_byte_in_page), 0, (PAGE_CACHE_SIZE - end_byte_in_page)); @@ -389,16 +358,35 @@ out: return 0; } -static int ecryptfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_begin(struct file *file,struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { + struct page *page; + pgoff_t index; int rc = 0; - if (from == 0 && to == PAGE_CACHE_SIZE) - goto out; /* If we are writing a full page, it will be - up to date. 
*/ - if (!PageUptodate(page)) - rc = ecryptfs_do_readpage(file, page, page->index); + index = pos >> PAGE_CACHE_SHIFT; + page = __grab_cache_page(mapping, index); + if (!page) { + rc = -ENOMEM; + goto out; + } + *pagep = page; + + /* + * If we are writing a full page (with no possibility of a short + * write), it will be guaranteed to end up being uptodate at + * write_end-time + */ + if (flags & AOP_FLAG_UNINTERRUPTIBLE && len == PAGE_CACHE_SIZE) + goto out; + if (!PageUptodate(page)) { + rc = ecryptfs_do_readpage(file, page, index); + if (rc) { + unlock_page(page); + page_cache_release(page); + } + } out: return rc; } @@ -421,14 +409,6 @@ out: return rc; } -static -void ecryptfs_release_lower_page(struct page *lower_page, int page_locked) -{ - if (page_locked) - unlock_page(lower_page); - page_cache_release(lower_page); -} - /** * ecryptfs_write_inode_size_to_header * @@ -442,28 +422,17 @@ static int ecryptfs_write_inode_size_to_ { int rc = 0; struct page *header_page; + void *fsdata; char *header_virt; - const struct address_space_operations *lower_a_ops; + struct address_space *lower_mapping = lower_inode->i_mapping; u64 file_size; -retry: - header_page = grab_cache_page(lower_inode->i_mapping, 0); - if (!header_page) { - ecryptfs_printk(KERN_ERR, "grab_cache_page for " - "lower_page_index 0 failed\n"); - rc = -EINVAL; - goto out; - } - lower_a_ops = lower_inode->i_mapping->a_ops; - rc = lower_a_ops->prepare_write(lower_file, header_page, 0, 8); - if (rc) { - if (rc == AOP_TRUNCATED_PAGE) { - ecryptfs_release_lower_page(header_page, 0); - goto retry; - } else - ecryptfs_release_lower_page(header_page, 1); + rc = pagecache_write_begin(lower_file, lower_mapping, 0, sizeof(u64), + AOP_FLAG_UNINTERRUPTIBLE, + &header_page, &fsdata); + if (rc) goto out; - } + file_size = (u64)i_size_read(inode); ecryptfs_printk(KERN_DEBUG, "Writing size: [0x%.16x]\n", file_size); file_size = cpu_to_be64(file_size); @@ -471,17 +440,17 @@ retry: memcpy(header_virt, &file_size, sizeof(u64)); kunmap_atomic(header_virt, KM_USER0); flush_dcache_page(header_page); - rc = lower_a_ops->commit_write(lower_file, header_page, 0, 8); - if (rc < 0) + + rc = pagecache_write_end(lower_file, lower_mapping, 0, sizeof(u64), + sizeof(u64), header_page, fsdata); + if (rc != sizeof(u64)) { ecryptfs_printk(KERN_ERR, "Error committing header page " "write\n"); - if (rc == AOP_TRUNCATED_PAGE) { - ecryptfs_release_lower_page(header_page, 0); - goto retry; - } else - ecryptfs_release_lower_page(header_page, 1); + if (rc > 0) + rc = -EINVAL; /* XXX: can we do better? */ + } lower_inode->i_mtime = lower_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty_sync(inode); + mark_inode_dirty_sync(inode); /* XXX: lower_inode?
*/ out: return rc; } @@ -564,36 +533,21 @@ ecryptfs_write_inode_size_to_metadata(st int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, struct file *lower_file, unsigned long lower_page_index, int byte_offset, - int region_bytes) + int region_bytes, void **fsdata) { - int rc = 0; + int rc; + struct address_space *lower_mapping = lower_inode->i_mapping; + loff_t pos = (lower_page_index << PAGE_CACHE_SHIFT) + byte_offset; -retry: - *lower_page = grab_cache_page(lower_inode->i_mapping, lower_page_index); - if (!(*lower_page)) { - rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Error attempting to grab " - "lower page with index [0x%.16x]\n", - lower_page_index); - goto out; - } - rc = lower_inode->i_mapping->a_ops->prepare_write(lower_file, - (*lower_page), - byte_offset, - region_bytes); + rc = pagecache_write_begin(lower_file, lower_mapping, pos, region_bytes, + AOP_FLAG_UNINTERRUPTIBLE, /* XXX: ok? */ + lower_page, fsdata); if (rc) { - if (rc == AOP_TRUNCATED_PAGE) { - ecryptfs_release_lower_page(*lower_page, 0); - goto retry; - } else { - ecryptfs_printk(KERN_ERR, "prepare_write for " - "lower_page_index = [0x%.16x] failed; rc = " - "[%d]\n", lower_page_index, rc); - ecryptfs_release_lower_page(*lower_page, 1); - (*lower_page) = NULL; - } + ecryptfs_printk(KERN_ERR, "pagecache_write_begin for " + "lower_page_index = [0x%.16x] failed; rc = " + "[%d]\n", lower_page_index, rc); + (*lower_page) = NULL; } -out: return rc; } @@ -605,21 +559,21 @@ out: int ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode, struct file *lower_file, int byte_offset, - int region_size) + int region_size, void *fsdata) { - int page_locked = 1; - int rc = 0; + int rc; + struct address_space *lower_mapping = lower_inode->i_mapping; + loff_t pos = (lower_page->index << PAGE_CACHE_SHIFT) + byte_offset; - rc = lower_inode->i_mapping->a_ops->commit_write( - lower_file, lower_page, byte_offset, region_size); - if (rc == AOP_TRUNCATED_PAGE) - page_locked = 0; - if (rc < 0) { + rc = pagecache_write_end(lower_file, lower_mapping, pos, region_size, + region_size, lower_page, fsdata); + if (rc != region_size) { ecryptfs_printk(KERN_ERR, "Error committing write; rc = [%d]\n", rc); + if (rc > 0) + rc = -EINVAL; } else rc = 0; - ecryptfs_release_lower_page(lower_page, page_locked); return rc; } @@ -634,9 +588,10 @@ int ecryptfs_copy_page_to_lower(struct p { int rc = 0; struct page *lower_page; + void *fsdata; rc = ecryptfs_get_lower_page(&lower_page, lower_inode, lower_file, - page->index, 0, PAGE_CACHE_SIZE); + page->index, 0, PAGE_CACHE_SIZE, &fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error attempting to get page " "at index [0x%.16x]\n", page->index); @@ -646,7 +601,7 @@ int ecryptfs_copy_page_to_lower(struct p memcpy((char *)page_address(lower_page), page_address(page), PAGE_CACHE_SIZE); rc = ecryptfs_commit_lower_page(lower_page, lower_inode, lower_file, - 0, PAGE_CACHE_SIZE); + 0, PAGE_CACHE_SIZE, fsdata); if (rc) ecryptfs_printk(KERN_ERR, "Error attempting to commit page " "at index [0x%.16x]\n", page->index); @@ -657,31 +612,37 @@ out: struct kmem_cache *ecryptfs_xattr_cache; /** - * ecryptfs_commit_write + * ecryptfs_write_end * @file: The eCryptfs file object - * @page: The eCryptfs page - * @from: Ignored (we rotate the page IV on each write) - * @to: Ignored + * @mapping: The eCryptfs address_space + * @pos: The start of the write + * @len: The length passed to write_begin (unused) + * @copied: The actual amount copied + * @page: The eCryptfs page returned by 
write_begin + * @fsdata: Filesystem private data (unused) * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. */ -static int ecryptfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct ecryptfs_page_crypt_context ctx; - loff_t pos; + loff_t isize; struct inode *inode; struct inode *lower_inode; struct file *lower_file; struct ecryptfs_crypt_stat *crypt_stat; int rc; - inode = page->mapping->host; + inode = mapping->host; + isize = inode->i_size; /* i_mutex is held */ lower_inode = ecryptfs_inode_to_lower(inode); lower_file = ecryptfs_file_to_lower(file); - mutex_lock(&lower_inode->i_mutex); + mutex_lock(&lower_inode->i_mutex); /* XXX: put this in write_begin? */ crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode) ->crypt_stat; if (crypt_stat->flags & ECRYPTFS_NEW_FILE) { @@ -692,8 +653,8 @@ static int ecryptfs_commit_write(struct ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" "(page w/ index = [0x%.16x], to = [%d])\n", page->index, - to); - rc = fill_zeros_to_end_of_page(page, to); + max(isize, pos+copied)); + rc = fill_zeros_to_end_of_page(page, max(isize, pos+copied)); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " "zeros in page with index = [0x%.16x]\n", @@ -710,11 +671,10 @@ static int ecryptfs_commit_write(struct goto out; } inode->i_blocks = lower_inode->i_blocks; - pos = (page->index << PAGE_CACHE_SHIFT) + to; - if (pos > i_size_read(inode)) { - i_size_write(inode, pos); + if (pos + copied > isize) { + i_size_write(inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " - "[0x%.16x]\n", i_size_read(inode)); + "[0x%.16x]\n", pos + copied); } rc = ecryptfs_write_inode_size_to_metadata(lower_file, lower_inode, inode, file->f_dentry, @@ -730,6 +690,9 @@ out: else SetPageUptodate(page); mutex_unlock(&lower_inode->i_mutex); + unlock_page(page); + page_cache_release(page); + return rc; } @@ -750,32 +713,31 @@ int write_zeros(struct file *file, pgoff int rc = 0; struct page *tmp_page; char *tmp_page_virt; - - tmp_page = ecryptfs_get1page(file, index); - if (IS_ERR(tmp_page)) { - ecryptfs_printk(KERN_ERR, "Error getting page at index " - "[0x%.16x]\n", index); - rc = PTR_ERR(tmp_page); - goto out; - } - rc = ecryptfs_prepare_write(file, tmp_page, start, start + num_zeros); + void *fsdata; + struct address_space *mapping = file->f_path.dentry->d_inode->i_mapping; + loff_t pos = (index << PAGE_CACHE_SHIFT) + start; + + rc = pagecache_write_begin(file, mapping, pos, num_zeros, + AOP_FLAG_UNINTERRUPTIBLE, + &tmp_page, &fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error preparing to write zero's " "to remainder of page at index [0x%.16x]\n", index); - page_cache_release(tmp_page); goto out; } tmp_page_virt = kmap_atomic(tmp_page, KM_USER0); memset(((char *)tmp_page_virt + start), 0, num_zeros); kunmap_atomic(tmp_page_virt, KM_USER0); flush_dcache_page(tmp_page); - rc = ecryptfs_commit_write(file, tmp_page, start, start + num_zeros); - if (rc < 0) { + rc = pagecache_write_end(file, mapping, pos, num_zeros, num_zeros, + tmp_page, fsdata); + if (rc != num_zeros) { ecryptfs_printk(KERN_ERR, "Error attempting to write zero's " "to remainder of page at index 
[0x%.16x]\n", index); - page_cache_release(tmp_page); + if (rc > 0) + rc = -EINVAL; goto out; } rc = 0; @@ -823,8 +785,8 @@ static void ecryptfs_sync_page(struct pa struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, - .prepare_write = ecryptfs_prepare_write, - .commit_write = ecryptfs_commit_write, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, .bmap = ecryptfs_bmap, .sync_page = ecryptfs_sync_page, }; Index: linux-2.6/fs/nfs/file.c =================================================================== --- linux-2.6.orig/fs/nfs/file.c +++ linux-2.6/fs/nfs/file.c @@ -282,27 +282,50 @@ nfs_fsync(struct file *file, struct dent } /* - * This does the "real" work of the write. The generic routine has - * allocated the page, locked it, done all the page alignment stuff - * calculations etc. Now we should just copy the data from user - * space and write it back to the real medium.. + * This does the "real" work of the write. We must allocate and lock the + * page to be sent back to the generic routine, which then copies the + * data from user space. * * If the writer ends up delaying the write, the writer needs to * increment the page use counts until he is done with the page. */ -static int nfs_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) -{ +static int nfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index; + struct page *page; + index = pos >> PAGE_CACHE_SHIFT; + + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + return nfs_flush_incompatible(file, page); } -static int nfs_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) +static int nfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); long status; + /* + * XXX: is there a lock_kernel/lock_page inversion here (eg. against + * nfs_fsync)? Could be fixed by taking bkl in nfs_write_begin and + * releasing it here... assuming nfs_flush_incompatible can handle + * it. + */ lock_kernel(); - status = nfs_updatepage(file, page, offset, to-offset); + status = nfs_updatepage(file, page, offset, copied); unlock_kernel(); - return status; + + unlock_page(page); + page_cache_release(page); + + return status < 0 ? 
status : copied; } static void nfs_invalidate_page(struct page *page, unsigned long offset) @@ -330,8 +353,8 @@ const struct address_space_operations nf .set_page_dirty = nfs_set_page_dirty, .writepage = nfs_writepage, .writepages = nfs_writepages, - .prepare_write = nfs_prepare_write, - .commit_write = nfs_commit_write, + .write_begin = nfs_write_begin, + .write_end = nfs_write_end, .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, #ifdef CONFIG_NFS_DIRECTIO Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c @@ -1414,13 +1414,18 @@ xfs_vm_direct_IO( } STATIC int -xfs_vm_prepare_write( +xfs_vm_write_begin( struct file *file, - struct page *page, - unsigned int from, - unsigned int to) + struct address_space *mapping, + loff_t pos, + unsigned len, + unsigned flags, + struct page **pagep, + void **fsdata) { - return block_prepare_write(page, from, to, xfs_get_blocks); + *pagep = NULL; + return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + xfs_get_blocks); } STATIC sector_t @@ -1474,8 +1479,8 @@ const struct address_space_operations xf .sync_page = block_sync_page, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .prepare_write = xfs_vm_prepare_write, - .commit_write = generic_commit_write, + .write_begin = xfs_vm_write_begin, + .write_end = block_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, Index: linux-2.6/fs/xfs/linux-2.6/xfs_lrw.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_lrw.c +++ linux-2.6/fs/xfs/linux-2.6/xfs_lrw.c @@ -134,45 +134,37 @@ xfs_iozero( loff_t pos, /* offset in file */ size_t count) /* size of data to zero */ { - unsigned bytes; struct page *page; struct address_space *mapping; int status; mapping = ip->i_mapping; do { - unsigned long index, offset; + unsigned offset, bytes; + void *fsdata; offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ - index = pos >> PAGE_CACHE_SHIFT; bytes = PAGE_CACHE_SIZE - offset; if (bytes > count) bytes = count; - status = -ENOMEM; - page = grab_cache_page(mapping, index); - if (!page) + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) break; - status = mapping->a_ops->prepare_write(NULL, page, offset, - offset + bytes); - if (status) - goto unlock; memclear_highpage_flush(page, offset, bytes); - status = mapping->a_ops->commit_write(NULL, page, offset, - offset + bytes); - if (!status) { - pos += bytes; - count -= bytes; - } - -unlock: - unlock_page(page); - page_cache_release(page); - if (status) + status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, + page, fsdata); + if (status < 0) break; + bytes = status; + pos += bytes; + count -= bytes; + status = 0; } while (count); return (-status); Index: linux-2.6/fs/ecryptfs/crypto.c =================================================================== --- linux-2.6.orig/fs/ecryptfs/crypto.c +++ linux-2.6/fs/ecryptfs/crypto.c @@ -375,7 +375,8 @@ ecryptfs_extent_to_lwr_pg_idx_and_offset static int ecryptfs_write_out_page(struct ecryptfs_page_crypt_context *ctx, struct page *lower_page, struct inode *lower_inode, - int byte_offset_in_page, int bytes_to_write) + int byte_offset_in_page, int bytes_to_write, + void *fsdata) { int rc = 0; @@ -383,7 +384,7 @@ static int
ecryptfs_write_out_page(struc rc = ecryptfs_commit_lower_page(lower_page, lower_inode, ctx->param.lower_file, byte_offset_in_page, - bytes_to_write); + bytes_to_write, fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error calling lower " "commit; rc = [%d]\n", rc); @@ -407,7 +408,7 @@ static int ecryptfs_read_in_page(struct struct page **lower_page, struct inode *lower_inode, unsigned long lower_page_idx, - int byte_offset_in_page) + int byte_offset_in_page, void **fsdata) { int rc = 0; @@ -419,13 +420,12 @@ static int ecryptfs_read_in_page(struct lower_page_idx, byte_offset_in_page, (PAGE_CACHE_SIZE - - byte_offset_in_page)); + - byte_offset_in_page), fsdata); if (rc) { ecryptfs_printk( - KERN_ERR, "Error attempting to grab, map, " - "and prepare_write lower page with index " + KERN_ERR, "Error in ecryptfs_get_lower_page " + "lower page with index " "[0x%.16x]; rc = [%d]\n", lower_page_idx, rc); - goto out; } } else { *lower_page = grab_cache_page(lower_inode->i_mapping, @@ -436,10 +436,9 @@ static int ecryptfs_read_in_page(struct KERN_ERR, "Error attempting to grab and map " "lower page with index [0x%.16x]; rc = [%d]\n", lower_page_idx, rc); - goto out; } } -out: + return rc; } @@ -475,6 +474,8 @@ int ecryptfs_encrypt_page(struct ecryptf int lower_byte_offset = 0; int orig_byte_offset = 0; int num_extents_per_page; + void *fsdata; + #define ECRYPTFS_PAGE_STATE_UNREAD 0 #define ECRYPTFS_PAGE_STATE_READ 1 #define ECRYPTFS_PAGE_STATE_MODIFIED 2 @@ -503,10 +504,9 @@ int ecryptfs_encrypt_page(struct ecryptf if (prior_lower_page_idx != lower_page_idx && page_state == ECRYPTFS_PAGE_STATE_MODIFIED) { rc = ecryptfs_write_out_page(ctx, lower_page, - lower_inode, - orig_byte_offset, - (PAGE_CACHE_SIZE - - orig_byte_offset)); + lower_inode, orig_byte_offset, + (PAGE_CACHE_SIZE - orig_byte_offset), + fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error attempting " "to write out page; rc = [%d]" @@ -519,7 +519,7 @@ int ecryptfs_encrypt_page(struct ecryptf || page_state == ECRYPTFS_PAGE_STATE_WRITTEN) { rc = ecryptfs_read_in_page(ctx, &lower_page, lower_inode, lower_page_idx, - lower_byte_offset); + lower_byte_offset, &fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error attempting " "to read in lower page with " @@ -571,8 +571,8 @@ int ecryptfs_encrypt_page(struct ecryptf } BUG_ON(orig_byte_offset != 0); rc = ecryptfs_write_out_page(ctx, lower_page, lower_inode, 0, - (lower_byte_offset - + crypt_stat->extent_size)); + (lower_byte_offset + crypt_stat->extent_size), + fsdata); if (rc) { ecryptfs_printk(KERN_ERR, "Error attempting to write out " "page; rc = [%d]\n", rc); Index: linux-2.6/fs/ecryptfs/ecryptfs_kernel.h =================================================================== --- linux-2.6.orig/fs/ecryptfs/ecryptfs_kernel.h +++ linux-2.6/fs/ecryptfs/ecryptfs_kernel.h @@ -503,11 +503,11 @@ int ecryptfs_write_inode_size_to_metadat int ecryptfs_get_lower_page(struct page **lower_page, struct inode *lower_inode, struct file *lower_file, unsigned long lower_page_index, int byte_offset, - int region_bytes); + int region_bytes, void **fsdata); int ecryptfs_commit_lower_page(struct page *lower_page, struct inode *lower_inode, struct file *lower_file, int byte_offset, - int region_size); + int region_size, void *fsdata); int ecryptfs_copy_page_to_lower(struct page *page, struct inode *lower_inode, struct file *lower_file); int ecryptfs_do_readpage(struct file *file, struct page *page, Index: linux-2.6/fs/fuse/file.c =================================================================== --- 
linux-2.6.orig/fs/fuse/file.c +++ linux-2.6/fs/fuse/file.c @@ -443,50 +443,61 @@ static size_t fuse_send_write(struct fus return outarg.size; } -static int fuse_prepare_write(struct file *file, struct page *page, - unsigned offset, unsigned to) -{ - /* No op */ +static int fuse_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) +{ + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + + *pagep = __grab_cache_page(mapping, index); + if (!*pagep) + return -ENOMEM; return 0; } -static int fuse_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +static int fuse_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { int err; size_t nres; - unsigned count = to - offset; - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - loff_t pos = page_offset(page) + offset; struct fuse_req *req; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); - if (is_bad_inode(inode)) - return -EIO; + if (is_bad_inode(inode)) { + err = -EIO; + goto out; + } req = fuse_get_req(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->num_pages = 1; req->pages[0] = page; req->page_offset = offset; - nres = fuse_send_write(req, file, inode, pos, count); + nres = fuse_send_write(req, file, inode, pos, copied); err = req->out.h.error; fuse_put_request(fc, req); - if (!err && nres != count) - err = -EIO; if (!err) { - pos += count; + copied = nres; + pos += copied; spin_lock(&fc->lock); if (pos > inode->i_size) i_size_write(inode, pos); spin_unlock(&fc->lock); - if (offset == 0 && to == PAGE_CACHE_SIZE) + if (copied == PAGE_CACHE_SIZE) SetPageUptodate(page); } fuse_invalidate_attr(inode); +out: + unlock_page(page); + page_cache_release(page); - return err; + return err ? err : copied; } @@ -817,8 +828,8 @@ static const struct file_operations fuse static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, - .prepare_write = fuse_prepare_write, - .commit_write = fuse_commit_write, + .write_begin = fuse_write_begin, + .write_end = fuse_write_end, .readpages = fuse_readpages, .set_page_dirty = fuse_set_page_dirty, .bmap = fuse_bmap,
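
For reference: the caller-side conversions above (loop.c, namei.c, xfs_iozero, ecryptfs) all follow the same pattern against the new API. A minimal sketch of that pattern, assuming only the pagecache_write_begin()/pagecache_write_end() interfaces introduced by this series; my_write_kernel_buf() is a hypothetical helper used for illustration, not a function added by the patch:

#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>

/*
 * Write a kernel buffer into the pagecache at @pos. AOP_FLAG_UNINTERRUPTIBLE
 * is used because a kernel buffer cannot fault mid-copy, so the copy into
 * the page cannot be short.
 */
static int my_write_kernel_buf(struct address_space *mapping, loff_t pos,
			       const char *buf, size_t count)
{
	do {
		struct page *page;
		void *fsdata;
		char *kaddr;
		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
		unsigned bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, count);
		int status;

		status = pagecache_write_begin(NULL, mapping, pos, bytes,
					       AOP_FLAG_UNINTERRUPTIBLE,
					       &page, &fsdata);
		if (status)
			return status;

		/* page comes back locked; copy the data into it */
		kaddr = kmap_atomic(page, KM_USER0);
		memcpy(kaddr + offset, buf, bytes);
		kunmap_atomic(kaddr, KM_USER0);
		flush_dcache_page(page);

		/* write_end unlocks and releases the page for us */
		status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
					     page, fsdata);
		if (status < 0)
			return status;

		/* with AOP_FLAG_UNINTERRUPTIBLE, status == bytes here */
		pos += status;
		buf += status;
		count -= status;
	} while (count);

	return 0;
}

Note the design point this illustrates: write_begin/write_end move page allocation, locking and the final unlock/release into the filesystem, which is why the AOP_TRUNCATED_PAGE retry loops needed by the prepare_write/commit_write callers can simply be deleted in the hunks above.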