GIT 699ea966fa88dd1d6e544feb8fa63d2a50377a9d git+ssh://master.kernel.org/pub/scm/linux/kernel/git/mfasheh/ocfs2.git#ALL commit f90ee07782ead84f6269ea998e3b4294615af020 Author: Tiger Yang Date: Tue Oct 17 18:29:52 2006 -0700 ocfs2: Add splice support Add splice read/write support in ocfs2. ocfs2_file_splice_read/write are very similar to ocfs2_file_aio_read/write. Signed-off-by: Tiger Yang Signed-off-by: Mark Fasheh commit ba4a9ee62103d0715849b2ac013bfd8840308cd4 Author: Mark Fasheh Date: Tue Oct 17 17:06:53 2006 -0700 ocfs2: Remove ocfs2_write_should_remove_suid() Use should_remove_suid() instead. Signed-off-by: Mark Fasheh commit 0dfd2b71210b4e259cfe85160fcc63937c23a530 Author: Mark Fasheh Date: Tue Oct 17 17:05:18 2006 -0700 [PATCH] Export should_remove_suid() This helps us avoid replicating the same logic within file system drivers. Signed-off-by: Mark Fasheh commit 3bcc276af214fc384c059713f9b5ef7a55bdbfab Author: Mark Fasheh Date: Fri Oct 20 14:55:54 2006 -0700 configfs: mutex_lock_nested() fix configfs_unregister_subsystem() nests a pair of inode i_mutex acquisitions, and thus needs annotation via mutex_lock_nested(). Signed-off-by: Mark Fasheh commit e47575895914402e1e72f00b3db539f468b849e3 Author: Joel Becker Date: Fri Oct 6 17:33:23 2006 -0700 configfs: accessing item hierarchy during rmdir(2) Add a notification callback, ops->disconnect_notify(). It has the same prototype as ->drop_item(), but it will be called just before the item linkage is broken. This way, configfs users who want to do work while the object is still in the hierarchy have a chance. Client drivers will still need to config_item_put() in their ->drop_item(), if they implement it. They need do nothing in ->disconnect_notify(). They don't have to provide it if they don't care. But someone who wants to be notified before ci_parent is set to NULL can now be notified. 
Signed-off-by: Joel Becker commit 3fdf42fd63f1206fb235aa39b49ad0696fe8f6f6 Author: Mark Fasheh Date: Wed Jul 5 13:15:54 2006 -0700 ocfs2: Shared writeable mmap Implement cluster consistent shared writeable mappings using the ->page_mkwrite() callback. Signed-off-by: Mark Fasheh Documentation/filesystems/configfs/configfs.txt | 12 + fs/configfs/dir.c | 34 +++- fs/ocfs2/dlmglue.c | 10 + fs/ocfs2/file.c | 218 +++++++++++++++-------- fs/ocfs2/mmap.c | 100 +++++++++-- include/linux/configfs.h | 1 mm/filemap.c | 1 7 files changed, 280 insertions(+), 96 deletions(-) diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index c3a7afb..b0e94c8 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -238,6 +238,8 @@ config_item_type. struct config_group *(*make_group)(struct config_group *group, const char *name); int (*commit_item)(struct config_item *item); + void (*disconnect_notify)(struct config_group *group, + struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); }; @@ -268,6 +270,16 @@ the item in other threads, the memory is for the item to actually disappear from the subsystem's usage. But it is gone from configfs. +When drop_item() is called, the item's linkage has already been torn +down. It no longer has a reference on its parent and has no place in +the item hierarchy. If a client needs to do some cleanup before this +teardown happens, the subsystem can implement the +ct_group_ops->disconnect_notify() method. The method is called after +configfs has removed the item from the filesystem view but before the +item is removed from its parent group. Like drop_item(), +disconnect_notify() is void and cannot fail. Client subsystems should +not drop any references here, as they still must do it in drop_item(). + A config_group cannot be removed while it still has child items. 
This is implemented in the configfs rmdir(2) code. ->drop_item() will not be called, as the item has not been dropped. rmdir(2) will fail, as the diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8a3b6a1..91469e0 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -715,6 +715,28 @@ static void configfs_detach_group(struct } /* + * After the item has been detached from the filesystem view, we are + * ready to tear it out of the hierarchy. Notify the client before + * we do that so they can perform any cleanup that requires + * navigating the hierarchy. A client does not need to provide this + * callback. The subsystem semaphore MUST be held by the caller, and + * references must be valid for both items. It also assumes the + * caller has validated ci_type. + */ +static void client_disconnect_notify(struct config_item *parent_item, + struct config_item *item) +{ + struct config_item_type *type; + + type = parent_item->ci_type; + BUG_ON(!type); + + if (type->ct_group_ops && type->ct_group_ops->disconnect_notify) + type->ct_group_ops->disconnect_notify(to_config_group(parent_item), + item); +} + +/* * Drop the initial reference from make_item()/make_group() * This function assumes that reference is held on item * and that item holds a valid reference to the parent. 
Also, it @@ -734,7 +756,7 @@ static void client_drop_item(struct conf */ if (type->ct_group_ops && type->ct_group_ops->drop_item) type->ct_group_ops->drop_item(to_config_group(parent_item), - item); + item); else config_item_put(item); } @@ -843,11 +865,14 @@ out_unlink: if (ret) { /* Tear down everything we built up */ down(&subsys->su_sem); + + client_disconnect_notify(parent_item, item); if (group) unlink_group(group); else unlink_obj(item); client_drop_item(parent_item, item); + up(&subsys->su_sem); if (module_got) @@ -912,11 +937,13 @@ static int configfs_rmdir(struct inode * configfs_detach_group(item); down(&subsys->su_sem); + client_disconnect_notify(parent_item, item); unlink_group(to_config_group(item)); } else { configfs_detach_item(item); down(&subsys->su_sem); + client_disconnect_notify(parent_item, item); unlink_obj(item); } @@ -1176,8 +1203,9 @@ void configfs_unregister_subsystem(struc return; } - mutex_lock(&configfs_sb->s_root->d_inode->i_mutex); - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock_nested(&configfs_sb->s_root->d_inode->i_mutex, + I_MUTEX_PARENT); + mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); if (configfs_detach_prep(dentry)) { printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 8801e41..7691f8a 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2649,6 +2649,15 @@ static int ocfs2_data_convert_worker(str inode = ocfs2_lock_res_inode(lockres); mapping = inode->i_mapping; + /* + * We need this before the filemap_fdatawrite() so that it can + * transfer the dirty bit from the PTE to the + * page. Unfortunately this means that even for EX->PR + * downconverts, we'll lose our mappings and have to build + * them up again. 
+ */ + unmap_mapping_range(mapping, 0, 0, 0); + if (filemap_fdatawrite(mapping)) { mlog(ML_ERROR, "Could not sync inode %llu for downconvert!", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -2656,7 +2665,6 @@ static int ocfs2_data_convert_worker(str sync_mapping_buffers(mapping); if (blocking == LKM_EXMODE) { truncate_inode_pages(mapping, 0); - unmap_mapping_range(mapping, 0, 0, 0); } else { /* We only need to wait on the I/O if we're not also * truncating pages because truncate_inode_pages waits diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 1be74c4..17c0eac 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -31,6 +31,7 @@ #include #include #include #include +#include #define MLOG_MASK_PREFIX ML_INODE #include @@ -957,74 +958,27 @@ out: return ret; } -static inline int ocfs2_write_should_remove_suid(struct inode *inode) +static int ocfs2_prepare_inode_for_write(struct dentry *dentry, + loff_t *ppos, + size_t count, + int appending) { - mode_t mode = inode->i_mode; - - if (!capable(CAP_FSETID)) { - if (unlikely(mode & S_ISUID)) - return 1; - - if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) - return 1; - } - return 0; -} - -static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, - const struct iovec *iov, - unsigned long nr_segs, - loff_t pos) -{ - int ret, rw_level = -1, meta_level = -1, have_alloc_sem = 0; + int ret = 0, meta_level = appending; + struct inode *inode = dentry->d_inode; u32 clusters; - struct file *filp = iocb->ki_filp; - struct inode *inode = filp->f_dentry->d_inode; - loff_t newsize, saved_pos; + loff_t newsize, saved_pos; - mlog_entry("(0x%p, %u, '%.*s')\n", filp, - (unsigned int)nr_segs, - filp->f_dentry->d_name.len, - filp->f_dentry->d_name.name); - - /* happy write of zero bytes */ - if (iocb->ki_left == 0) - return 0; - - if (!inode) { - mlog(0, "bad inode\n"); - return -EIO; - } - - mutex_lock(&inode->i_mutex); - /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ - if (filp->f_flags & O_DIRECT) { - 
have_alloc_sem = 1; - down_read(&inode->i_alloc_sem); - } - - /* concurrent O_DIRECT writes are allowed */ - rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; - ret = ocfs2_rw_lock(inode, rw_level); - if (ret < 0) { - rw_level = -1; - mlog_errno(ret); - goto out; - } - - /* + /* * We sample i_size under a read level meta lock to see if our write * is extending the file, if it is we back off and get a write level * meta lock. */ - meta_level = (filp->f_flags & O_APPEND) ? 1 : 0; - for(;;) { - ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); - if (ret < 0) { - meta_level = -1; - mlog_errno(ret); - goto out; - } + for(;;) { + ret = ocfs2_meta_lock(inode, NULL, NULL, meta_level); + if (ret < 0) { + mlog_errno(ret); + goto out; + } /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because @@ -1035,7 +989,7 @@ static ssize_t ocfs2_file_aio_write(stru * inode. There's also the dinode i_size state which * can be lost via setattr during extending writes (we * set inode->i_size at the end of a write. */ - if (ocfs2_write_should_remove_suid(inode)) { + if (should_remove_suid(dentry)) { if (meta_level == 0) { ocfs2_meta_unlock(inode, meta_level); meta_level = 1; @@ -1045,19 +999,19 @@ static ssize_t ocfs2_file_aio_write(stru ret = ocfs2_write_remove_suid(inode); if (ret < 0) { mlog_errno(ret); - goto out; + goto out_unlock; } } /* work on a copy of ppos until we're sure that we won't have * to recalculate it due to relocking. 
*/ - if (filp->f_flags & O_APPEND) { + if (appending) { saved_pos = i_size_read(inode); mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); } else { - saved_pos = iocb->ki_pos; + saved_pos = *ppos; } - newsize = iocb->ki_left + saved_pos; + newsize = count + saved_pos; mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", (long long) saved_pos, (long long) newsize, @@ -1090,19 +1044,66 @@ static ssize_t ocfs2_file_aio_write(stru if (!clusters) break; - ret = ocfs2_extend_file(inode, NULL, newsize, iocb->ki_left); + ret = ocfs2_extend_file(inode, NULL, newsize, count); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto out; + goto out_unlock; } break; } - /* ok, we're done with i_size and alloc work */ - iocb->ki_pos = saved_pos; + if (appending) + *ppos = saved_pos; + +out_unlock: ocfs2_meta_unlock(inode, meta_level); - meta_level = -1; + +out: + return ret; +} + +static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, + const struct iovec *iov, + unsigned long nr_segs, + loff_t pos) +{ + int ret, rw_level, have_alloc_sem = 0; + struct file *filp = iocb->ki_filp; + struct inode *inode = filp->f_dentry->d_inode; + int appending = filp->f_flags & O_APPEND ? 1 : 0; + + mlog_entry("(0x%p, %u, '%.*s')\n", filp, + (unsigned int)nr_segs, + filp->f_dentry->d_name.len, + filp->f_dentry->d_name.name); + + /* happy write of zero bytes */ + if (iocb->ki_left == 0) + return 0; + + mutex_lock(&inode->i_mutex); + /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ + if (filp->f_flags & O_DIRECT) { + have_alloc_sem = 1; + down_read(&inode->i_alloc_sem); + } + + /* concurrent O_DIRECT writes are allowed */ + rw_level = (filp->f_flags & O_DIRECT) ? 
0 : 1; + ret = ocfs2_rw_lock(inode, rw_level); + if (ret < 0) { + rw_level = -1; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_inode_for_write(filp->f_dentry, &iocb->ki_pos, + iocb->ki_left, appending); + if (ret < 0) { + mlog_errno(ret); + goto out; + } /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb); @@ -1128,8 +1129,6 @@ static ssize_t ocfs2_file_aio_write(stru } out: - if (meta_level != -1) - ocfs2_meta_unlock(inode, meta_level); if (have_alloc_sem) up_read(&inode->i_alloc_sem); if (rw_level != -1) @@ -1140,6 +1139,77 @@ out: return ret; } +static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + int ret; + struct inode *inode = out->f_dentry->d_inode; + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, + (unsigned int)len, + out->f_dentry->d_name.len, + out->f_dentry->d_name.name); + + inode_double_lock(inode, pipe->inode); + + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_prepare_inode_for_write(out->f_dentry, ppos, len, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_unlock; + } + + /* ok, we're done with i_size and alloc work */ + ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); + +out_unlock: + ocfs2_rw_unlock(inode, 1); +out: + inode_double_unlock(inode, pipe->inode); + + mlog_exit(ret); + return ret; +} + +static ssize_t ocfs2_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + int ret = 0; + struct inode *inode = in->f_dentry->d_inode; + + mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, + (unsigned int)len, + in->f_dentry->d_name.len, + in->f_dentry->d_name.name); + + /* + * See the comment in ocfs2_file_aio_read() + */ + ret = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto bail; + } + ocfs2_meta_unlock(inode, 0); + + ret = 
generic_file_splice_read(in, ppos, pipe, len, flags); + +bail: + mlog_exit(ret); + return ret; +} + static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, @@ -1238,6 +1308,8 @@ const struct file_operations ocfs2_fops .aio_read = ocfs2_file_aio_read, .aio_write = ocfs2_file_aio_write, .ioctl = ocfs2_ioctl, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, }; const struct file_operations ocfs2_dops = { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 83934e3..fb5b18f 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -42,6 +42,23 @@ #include "file.h" #include "inode.h" #include "mmap.h" +static inline int ocfs2_vm_op_block_sigs(sigset_t *blocked, sigset_t *oldset) +{ + /* The best way to deal with signals in the vm path is + * to block them upfront, rather than allowing the + * locking paths to return -ERESTARTSYS. */ + sigfillset(blocked); + + /* We should technically never get a bad return value + * from sigprocmask */ + return sigprocmask(SIG_BLOCK, blocked, oldset); +} + +static inline int ocfs2_vm_op_unblock_sigs(sigset_t *oldset) +{ + return sigprocmask(SIG_SETMASK, oldset, NULL); +} + static struct page *ocfs2_nopage(struct vm_area_struct * area, unsigned long address, int *type) @@ -53,14 +70,7 @@ static struct page *ocfs2_nopage(struct mlog_entry("(area=%p, address=%lu, type=%p)\n", area, address, type); - /* The best way to deal with signals in this path is - * to block them upfront, rather than allowing the - * locking paths to return -ERESTARTSYS. 
*/ - sigfillset(&blocked); - - /* We should technically never get a bad ret return - * from sigprocmask */ - ret = sigprocmask(SIG_BLOCK, &blocked, &oldset); + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); if (ret < 0) { mlog_errno(ret); goto out; @@ -68,7 +78,7 @@ static struct page *ocfs2_nopage(struct page = filemap_nopage(area, address, type); - ret = sigprocmask(SIG_SETMASK, &oldset, NULL); + ret = ocfs2_vm_op_unblock_sigs(&oldset); if (ret < 0) mlog_errno(ret); out: @@ -76,21 +86,73 @@ out: return page; } +static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = vma->vm_file->f_dentry->d_inode; + sigset_t blocked, oldset; + int ret, ret2; + pgoff_t last_index; + + mlog_entry("(inode %llu, page index %lu)\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, page->index); + + ret = ocfs2_vm_op_block_sigs(&blocked, &oldset); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /* Take a meta data lock so that we can test the page location + * against the proper end of file. This particular check may + * be a little paranoid. */ + ret = ocfs2_meta_lock(inode, NULL, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto out_restore_signals; + } + + /* + * When we support holes, allocation should be handled here, + * as writepage() is too late to handle ENOSPC issues. + */ + last_index = i_size_read(inode) >> PAGE_CACHE_SHIFT; + if (page->index > last_index) { + ret = -EFBIG; + goto out_meta_unlock; + } + + /* + * Take and drop an exclusive data lock here. This will ensure + * that other nodes write out and invalidate their pages for + * this inode. Dlmglue handles caching of the exclusive lock, + * so the page can be safely marked writeable until another + * node notifies us of competing access. 
+ */ + ret = ocfs2_data_lock(inode, 1); + if (ret < 0) + mlog_errno(ret); + else + ocfs2_data_unlock(inode, 1); + +out_meta_unlock: + ocfs2_meta_unlock(inode, 0); + +out_restore_signals: + ret2 = ocfs2_vm_op_unblock_sigs(&oldset); + if (ret2 < 0) + mlog_errno(ret2); + +out: + return ret; +} + static struct vm_operations_struct ocfs2_file_vm_ops = { - .nopage = ocfs2_nopage, + .nopage = ocfs2_nopage, + .page_mkwrite = ocfs2_page_mkwrite, }; int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) { - /* We don't want to support shared writable mappings yet. */ - if (((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) - && ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) { - mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags); - /* This is -EINVAL because generic_file_readonly_mmap - * returns it in a similar situation. */ - return -EINVAL; - } - file_accessed(file); vma->vm_ops = &ocfs2_file_vm_ops; return 0; diff --git a/include/linux/configfs.h b/include/linux/configfs.h index a7f0150..11100ca 100644 --- a/include/linux/configfs.h +++ b/include/linux/configfs.h @@ -157,6 +157,7 @@ struct configfs_group_operations { struct config_item *(*make_item)(struct config_group *group, const char *name); struct config_group *(*make_group)(struct config_group *group, const char *name); int (*commit_item)(struct config_item *item); + void (*disconnect_notify)(struct config_group *group, struct config_item *item); void (*drop_item)(struct config_group *group, struct config_item *item); }; diff --git a/mm/filemap.c b/mm/filemap.c index 7b84dc8..13df01c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1893,6 +1893,7 @@ int should_remove_suid(struct dentry *de return 0; } +EXPORT_SYMBOL(should_remove_suid); int __remove_suid(struct dentry *dentry, int kill) {