[PATCH] revoke: core code From: Pekka Enberg The revokeat(2) system call ensures that after successful revocation you can only access an inode via a file descriptor that is obtained from subsequent open(2) calls. The open(2) system call can be blocked by the caller with chmod(2) and chown(2) prior to calling revokeat(2) to gain exclusive access to an inode. After a successful revocation, operations on file descriptors fail with the EBADF or ENXIO error code for regular and device files, respectively. Attempting to read from or write to a revoked mapping causes SIGBUS. What the revokeat(2) system call guarantees is that: (1) open file descriptors are revoked, (2) file descriptors created by fork(2) and dup(2) during the operation are revoked, (3) file descriptors obtained via an SCM_RIGHTS datagram during or after the revoke operation are revoked, (4) in-flight read(2) and write(2) operations are either completed or aborted before revokeat(2) returns successfully, (5) attempting to read from or write to a shared memory mapping raises SIGBUS, and (6) copy-on-write to a private memory mapping after a successful revokeat(2) call does not reveal any data written after the system call has returned. FIXME: - From the above list, (4) is not guaranteed at all and memory mappings are not guaranteed to be revoked for (2). Cc: Alan Cox Cc: Al Viro Cc: Christoph Hellwig Cc: Peter Zijlstra Signed-off-by: Pekka Enberg --- fs/revoke.c | 507 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 11 + include/linux/magic.h | 2 include/linux/mm.h | 1 mm/mmap.c | 11 + 5 files changed, 531 insertions(+), 1 deletion(-) Index: 2.6/fs/revoke.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ 2.6/fs/revoke.c 2007-10-29 00:01:19.000000000 +0200 @@ -0,0 +1,507 @@ +/* + * Invalidate all current open file descriptors of an inode.
+ *
+ * Copyright (C) 2006-2007 Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Mount of the internal revokefs; passed to ->revoke() for every file. */
+static struct vfsmount *revokefs_mnt;
+
+/*
+ * Call ->revoke() on every open file of @inode found on the superblock's
+ * file list, handing each one @new_dentry and the revokefs mount.
+ *
+ * The file list lock is dropped around each ->revoke() call, so the walk
+ * is restarted from the head of the list after every file.  Files that
+ * already sit on revokefs_mnt are skipped.
+ *
+ * Returns 0 on success, -EINVAL if the inode has no superblock, -EINTR if
+ * a signal is pending, or the first error returned by ->revoke().
+ */
+static int revoke_files(struct inode *inode, struct dentry *new_dentry)
+{
+	struct super_block *sb;
+	struct file *file;
+	int err = 0;
+
+	sb = inode->i_sb;
+	if (!sb)
+		return -EINVAL;
+
+restart:
+	file_list_lock();
+	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+		struct dentry *dentry = file->f_path.dentry;
+
+		if (dentry->d_inode != inode)
+			continue;
+
+		if (file->f_path.mnt == revokefs_mnt)
+			continue;
+
+		get_file(file);
+
+		/*
+		 * inode->i_mutex cannot be acquired under files_lock
+		 */
+		file_list_unlock();
+
+		err = file->f_op->revoke(file, new_dentry, revokefs_mnt);
+		fput(file);
+
+		if (err)
+			goto out;
+
+		if (signal_pending(current)) {
+			err = -EINTR;
+			goto out;
+		}
+		cond_resched();
+		goto restart;
+	}
+	file_list_unlock();
+out:
+	return err;
+}
+
+/*
+ * Returns true if @vma is backed by a file whose inode is @inode.
+ */
+static inline bool vma_matches(struct vm_area_struct *vma, struct inode *inode)
+{
+	struct file *file = vma->vm_file;
+
+	return file && file->f_path.dentry->d_inode == inode;
+}
+
+/*
+ * Count the threads that currently have an mm.
+ *
+ * LOCKING: read_lock(&tasklist_lock)
+ */
+static unsigned long nr_tasks_with_mm(void)
+{
+	struct task_struct *g, *p;
+	unsigned long ret = 0;	/* same type as the return value */
+
+	do_each_thread(g, p) {
+		if (p->mm)
+			ret++;
+	}
+	while_each_thread(g, p);
+	return ret;
+}
+
+/*
+ * For every private mapping of @inode in @tsk's address space, fault in
+ * all of its pages through get_user_pages() with the write and force
+ * arguments set, then detach the vma from its file.
+ *
+ * Returns 0 on success (or if the task has no mm), a negative error from
+ * get_user_pages(), or -ENOMEM if fewer pages than the vma spans were
+ * faulted in.
+ */
+static int task_break_cow(struct task_struct *tsk, struct inode *inode)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	mm = get_task_mm(tsk);
+	if (!mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		int err;
+
+		if (vma->vm_flags & VM_SHARED)
+			continue;
+
+		if (!vma_matches(vma, inode))
+			continue;
+
+		/*
+		 * Use the mm pinned by get_task_mm() above, the one whose
+		 * mmap_sem we hold, rather than re-reading tsk->mm.
+		 */
+		err = get_user_pages(tsk, mm, vma->vm_start,
+				     vma_pages(vma), 1, 1, NULL, NULL);
+		if (err < 0) {
+			ret = err;
+			break;
+		}
+		if (err != vma_pages(vma)) {
+			ret = -ENOMEM;
+			break;
+		}
+		unlink_file_vma(vma);
+		fput(vma->vm_file);
+		vma->vm_file = NULL;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return ret;
+}
+
+/*
+ * Break copy-on-write for the private mappings of @inode in every task.
+ *
+ * The tasks are first collected (with a reference each) into an array
+ * under tasklist_lock, because task_break_cow() takes ->mmap_sem and so
+ * cannot run under that lock.  The array is sized from a first count; if
+ * the count changed by the time the lock is re-taken, the whole thing is
+ * retried.
+ *
+ * Returns 0 on success, -ENOMEM if the array cannot be allocated, -EAGAIN
+ * if more tasks than counted turn up, or the first error returned by
+ * task_break_cow().
+ */
+static int revoke_break_cow(struct inode *inode)
+{
+	struct task_struct **tsk_array;
+	struct task_struct *g, *p;
+	unsigned long nr, nr_held, i;
+	int err = 0;
+
+restart:
+	read_lock(&tasklist_lock);
+	nr = nr_tasks_with_mm();
+	read_unlock(&tasklist_lock);
+
+	tsk_array = kcalloc(nr, sizeof(*tsk_array), GFP_KERNEL);
+	if (!tsk_array)
+		return -ENOMEM;
+
+	read_lock(&tasklist_lock);
+
+	if (nr != nr_tasks_with_mm()) {
+		read_unlock(&tasklist_lock);
+		kfree(tsk_array);
+		cond_resched();
+		goto restart;
+	}
+
+	nr_held = 0;
+	do_each_thread(g, p) {
+		/*
+		 * Skip mm-less threads before the capacity check, so that
+		 * threads following the last counted one do not trigger a
+		 * bogus -EAGAIN.
+		 */
+		if (!p->mm)
+			continue;
+
+		if (nr_held >= nr) {
+			err = -EAGAIN;
+			goto unlock;
+		}
+
+		get_task_struct(p);
+		tsk_array[nr_held++] = p;
+	}
+	while_each_thread(g, p);
+unlock:
+	read_unlock(&tasklist_lock);
+
+	for (i = 0; !err && i < nr_held; i++)
+		err = task_break_cow(tsk_array[i], inode);
+
+	/* Drop exactly the references taken above, on every exit path. */
+	for (i = 0; i < nr_held; i++)
+		put_task_struct(tsk_array[i]);
+
+	kfree(tsk_array);
+	return err;
+}
+
+/*
+ * Zap all pages of @vma and mark it VM_REVOKED.
+ *
+ * Returns 0 once the whole range has been zapped.  If a reschedule or a
+ * lock break is needed, the i_mmap_lock is dropped and re-taken around
+ * cond_resched() and -EINTR is returned.
+ *
+ * LOCKING: down_write(&mm->mmap_sem)
+ *	      -> spin_lock(&mapping->i_mmap_lock)
+ */
+static int revoke_vma(struct vm_area_struct *vma, struct zap_details *details)
+{
+	unsigned long restart_addr, start_addr, end_addr;
+	int need_break;
+
+	start_addr = vma->vm_start;
+	end_addr = vma->vm_end;
+
+again:
+	restart_addr = zap_page_range(vma, start_addr, end_addr - start_addr,
+				      details);
+
+	need_break = need_resched() || need_lockbreak(details->i_mmap_lock);
+	if (need_break)
+		goto out_need_break;
+
+	if (restart_addr < end_addr) {
+		start_addr = restart_addr;
+		goto again;
+	}
+	vma->vm_flags |= VM_REVOKED;
+	return 0;
+
+out_need_break:
+	spin_unlock(details->i_mmap_lock);
+	cond_resched();
+	spin_lock(details->i_mmap_lock);
+	return -EINTR;
+}
+
+/*
+ * A vma is a candidate for revocation if it is a shared mapping that has
+ * not been marked VM_REVOKED yet.
+ */
+static inline bool vma_is_revocable(struct vm_area_struct *vma)
+{
+	return (vma->vm_flags & VM_SHARED) && !(vma->vm_flags & VM_REVOKED);
+}
+
+/*
+ * Revoke every revocable vma in @mm that maps @mapping's host inode: zap
+ * its pages, unlink it from the mapping and drop its file reference.
+ *
+ * Returns 0 on success, -EAGAIN if ->mmap_sem could not be taken without
+ * blocking (nothing has been modified in that case), or -EINTR if
+ * revoke_vma() had to drop the i_mmap_lock.
+ *
+ * LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static int revoke_mm(struct mm_struct *mm, struct address_space *mapping)
+{
+	/*
+	 * Initialise the whole structure: only ->i_mmap_lock is set here,
+	 * and the remaining members must not be stack garbage when the
+	 * structure is handed to zap_page_range().
+	 */
+	struct zap_details details = {
+		.i_mmap_lock	= &mapping->i_mmap_lock,
+	};
+	struct vm_area_struct *vma;
+	int err = 0;
+
+	/*
+	 * If ->mmap_sem is under contention, we continue scanning other
+	 * mms and try again later.
+	 */
+	if (!down_write_trylock(&mm->mmap_sem))
+		return -EAGAIN;
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_vma(vma, &details);
+		if (err)
+			break;
+
+		__unlink_file_vma(vma);
+		fput(vma->vm_file);
+		vma->vm_file = NULL;
+	}
+	up_write(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * Revoke all revocable vmas linked into the mapping's prio tree.
+ *
+ * The walk restarts from scratch whenever revoke_mm() may have changed
+ * the tree or dropped the lock.  An mm whose ->mmap_sem was contended is
+ * skipped and retried once the rest of the tree has been scanned.
+ *
+ * LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static void revoke_mapping_tree(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	struct prio_tree_iter iter;
+	int try_again;
+
+restart:
+	try_again = 0;
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) {
+		int err;
+
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_mm(vma->vm_mm, mapping);
+		if (err == -EAGAIN) {
+			/*
+			 * Nothing was touched, so the iterator is still
+			 * valid: remember to come back and keep scanning.
+			 */
+			try_again = 1;
+			continue;
+		}
+		goto restart;
+	}
+	if (try_again) {
+		/*
+		 * Let the ->mmap_sem holder make progress.  The spinlock
+		 * has to be dropped before we may schedule.
+		 */
+		spin_unlock(&mapping->i_mmap_lock);
+		cond_resched();
+		spin_lock(&mapping->i_mmap_lock);
+		goto restart;
+	}
+}
+
+/*
+ * Revoke all revocable vmas on the mapping's non-linear list.  Follows
+ * the same restart and retry rules as revoke_mapping_tree().
+ *
+ * LOCKING: spin_lock(&mapping->i_mmap_lock)
+ */
+static void revoke_mapping_list(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+	int try_again;
+
+restart:
+	try_again = 0;
+
+	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) {
+		int err;
+
+		if (!vma_is_revocable(vma))
+			continue;
+
+		if (!vma_matches(vma, mapping->host))
+			continue;
+
+		err = revoke_mm(vma->vm_mm, mapping);
+		if (err == -EAGAIN) {
+			try_again = 1;
+			continue;
+		}
+		/*
+		 * On success revoke_mm() has unlinked vmas from this
+		 * mapping, and on -EINTR the lock was dropped.  Either
+		 * way the list cursor can no longer be trusted.
+		 */
+		goto restart;
+	}
+	if (try_again) {
+		/* Drop the spinlock before scheduling. */
+		spin_unlock(&mapping->i_mmap_lock);
+		cond_resched();
+		spin_lock(&mapping->i_mmap_lock);
+		goto restart;
+	}
+}
+
+/*
+ * Revoke all shared mappings of @mapping, both the prio tree and the
+ * non-linear list, under the mapping's i_mmap_lock.
+ */
+static void revoke_mapping(struct address_space *mapping)
+{
+	spin_lock(&mapping->i_mmap_lock);
+	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
+		revoke_mapping_tree(mapping);
+	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
+		revoke_mapping_list(mapping);
+	spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * Clear S_REVOKE_LOCK on @inode under its i_mutex.
+ */
+static inline void revoke_unlock(struct inode *inode)
+{
+	mutex_lock(&inode->i_mutex);
+	inode->i_flags &= ~S_REVOKE_LOCK;
+	mutex_unlock(&inode->i_mutex);
+}
+
+/*
+ * Returns true if revoke lock was acquired
+ */
+static inline bool revoke_trylock(struct inode *inode)
+{
+	bool ret = false;
+
+	mutex_lock(&inode->i_mutex);
+	if (!IS_REVOKE_LOCKED(inode)) {
+		inode->i_flags |= S_REVOKE_LOCK;
+		ret = true;
+	}
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+/*
+ * Revoke all access to @inode.
+ *
+ * A replacement inode and dentry are created on revokefs.  Shared
+ * mappings are then revoked, copy-on-write is broken for private
+ * mappings, every open file is passed to ->revoke(), and finally the
+ * inode's page cache is invalidated.
+ *
+ * Returns 0 on success, or:
+ *   -EPERM      caller neither owns the inode nor has CAP_FOWNER
+ *   -EOPNOTSUPP superblock has no s_bdev, or i_fop has no ->revoke
+ *   -EBUSY      another revoke of this inode is in progress
+ *   -ENOMEM     the replacement inode or dentry could not be allocated
+ *   or the error returned by one of the steps above.
+ */
+static int do_revoke(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct dentry *revoke_dentry;
+	struct inode *revoke_inode;
+	struct qstr name;
+	int err = 0;
+
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (!inode->i_sb->s_bdev || !inode->i_fop->revoke)
+		return -EOPNOTSUPP;
+
+	/*
+	 * Take the S_REVOKE_LOCK to avoid concurrent revoke operations on the
+	 * same inode.
+	 */
+	if (!revoke_trylock(inode))
+		return -EBUSY;
+
+	revoke_inode = new_inode(revokefs_mnt->mnt_sb);
+	if (!revoke_inode) {
+		err = -ENOMEM;
+		goto failed_unlock;
+	}
+
+	revoke_inode->i_mode = inode->i_mode;
+	make_revoked_inode(revoke_inode);
+
+	name.name = "revoked_file";
+	name.len = strlen(name.name);
+	revoke_dentry = d_alloc(revokefs_mnt->mnt_sb->s_root, &name);
+	if (!revoke_dentry) {
+		iput(revoke_inode);
+		err = -ENOMEM;
+		goto failed_unlock;
+	}
+	d_instantiate(revoke_dentry, revoke_inode);
+
+	revoke_mapping(mapping);
+
+	err = revoke_break_cow(inode);
+	if (err)
+		goto failed_dput;
+
+	err = revoke_files(inode, revoke_dentry);
+	if (err)
+		goto failed_dput;
+
+	/*
+	 * Make pending reads fail.
+	 */
+	err = invalidate_inode_pages2(inode->i_mapping);
+
+failed_dput:
+	dput(revoke_dentry);
+failed_unlock:
+	revoke_unlock(inode);
+	return err;
+}
+
+/*
+ * revokeat(2): look up @filename relative to @dfd and revoke the inode it
+ * names.  Returns 0 on success or a negative error from the lookup or
+ * from do_revoke().
+ */
+asmlinkage long sys_revokeat(int dfd, const char __user *filename)
+{
+	struct nameidata nd;
+	int err;
+
+	err = __user_walk_fd(dfd, filename, 0, &nd);
+	if (!err) {
+		err = do_revoke(nd.dentry->d_inode);
+		path_release(&nd);
+	}
+	return err;
+}
+
+/*
+ * Generic ->revoke() implementation: sync @file, then re-point it at
+ * @new_dentry's inode (mapping, dentry and file operations) on @mnt and
+ * reset the file position.
+ *
+ * Returns 0 on success or the error from do_fsync(), in which case the
+ * file is left untouched.
+ *
+ * NOTE(review): the file's previous dentry, vfsmount and f_op references
+ * are overwritten here without being dropped; confirm whether they are
+ * released elsewhere or leak.
+ */
+int generic_file_revoke(struct file *file, struct dentry *new_dentry,
+			struct vfsmount *mnt)
+{
+	struct inode *inode = new_dentry->d_inode;
+	int err;
+
+	err = do_fsync(file, 1);
+	if (err)
+		goto out;
+
+	file->f_mapping = inode->i_mapping;
+	file->f_dentry = dget(new_dentry);
+	file->f_vfsmnt = mntget(mnt);
+	file->f_op = fops_get(inode->i_fop);
+	file->f_pos = 0;
+out:
+	return err;
+}
+EXPORT_SYMBOL(generic_file_revoke);
+
+/*
+ * Filesystem for revoked files.
+ */
+
+/* Super operations for revokefs; inodes are dropped with generic_delete_inode. */
+static struct super_operations revokefs_super_ops = {
+	.drop_inode	= generic_delete_inode,
+};
+
+/*
+ * ->get_sb() for revokefs: set it up as a pseudo filesystem named
+ * "revoke:" with the REVOKEFS_MAGIC magic number.
+ */
+static int revokefs_get_sb(struct file_system_type *fs_type,
+			   int flags, const char *dev_name, void *data,
+			   struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "revoke:", &revokefs_super_ops,
+			     REVOKEFS_MAGIC, mnt);
+}
+
+static struct file_system_type revokefs_fs_type = {
+	.name		= "revokefs",
+	.get_sb		= revokefs_get_sb,
+	.kill_sb	= kill_anon_super
+};
+
+/*
+ * Register revokefs and create its kernel-internal mount.
+ *
+ * Returns 0 on success or the error from register_filesystem() or
+ * kern_mount().  The filesystem is unregistered again if mounting fails.
+ *
+ * NOTE(review): on a kern_mount() failure revokefs_mnt is left holding
+ * the ERR_PTR value.
+ */
+static int __init revokefs_init(void)
+{
+	int err;
+
+	err = register_filesystem(&revokefs_fs_type);
+	if (err)
+		return err;
+
+	revokefs_mnt = kern_mount(&revokefs_fs_type);
+	if (IS_ERR(revokefs_mnt)) {
+		err = PTR_ERR(revokefs_mnt);
+		unregister_filesystem(&revokefs_fs_type);
+	}
+	return err;
+}
+
+late_initcall(revokefs_init);
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h	2007-10-28 23:53:41.000000000 +0200
+++ 2.6/include/linux/fs.h	2007-10-28 23:54:16.000000000 +0200
@@ -150,6 +150,7 @@ #define MS_MGC_MSK 0xffff0000
 #define S_NOCMTIME	128	/* Do not update file c/mtime */
 #define S_SWAPFILE	256	/* Do not truncate: swapon got its bmaps */
 #define S_PRIVATE	512	/* Inode is fs-internal */
+#define S_REVOKE_LOCK	1024	/* Inode is being revoked */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -183,6 +184,7 @@ #define MS_MGC_MSK 0xffff0000
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
 #define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
 #define IS_PRIVATE(inode)	((inode)->i_flags & S_PRIVATE)
+#define IS_REVOKE_LOCKED(inode)	((inode)->i_flags & S_REVOKE_LOCK)
 
 /* the read-only stuff doesn't really belong here, but any other place is
    probably as bad and I don't want to create yet another include file. */
@@ -1188,6 +1190,7 @@ struct file_operations {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
 	int (*setlease)(struct file *, long, struct file_lock **);
+	int (*revoke) (struct file *, struct dentry *, struct vfsmount *);
 };
 
 struct inode_operations {
@@ -1821,6 +1824,14 @@ extern ssize_t generic_splice_sendpage(s
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		size_t len, unsigned int flags);
 
+/* fs/revoke.c */
+#ifdef CONFIG_MMU
+extern void make_revoked_inode(struct inode *);
+extern int generic_file_revoke(struct file *, struct dentry *, struct vfsmount *);
+#else
+#define generic_file_revoke NULL
+#endif
+
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
Index: 2.6/include/linux/mm.h
===================================================================
--- 2.6.orig/include/linux/mm.h	2007-10-28 23:53:36.000000000 +0200
+++ 2.6/include/linux/mm.h	2007-10-28 23:54:16.000000000 +0200
@@ -970,6 +970,7 @@ extern int split_vma(struct mm_struct *,
 extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern void __unlink_file_vma(struct vm_area_struct *);
 extern void unlink_file_vma(struct vm_area_struct *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
 	unsigned long addr, unsigned long len, pgoff_t pgoff);
Index: 2.6/mm/mmap.c
===================================================================
--- 2.6.orig/mm/mmap.c	2007-10-28 23:53:36.000000000 +0200
+++ 2.6/mm/mmap.c	2007-10-28 23:54:16.000000000 +0200
@@ -201,6 +201,17 @@ static void __remove_shared_vm_struct(st
 }
 
 /*
+ * Requires inode->i_mapping->i_mmap_lock
+ */
+void __unlink_file_vma(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+
+	__remove_shared_vm_struct(vma, file, mapping);
+}
+
+/*
  * Unlink a file-based vm structure from its prio_tree, to hide
  * vma from rmap and vmtruncate before freeing its page tables.
  */
Index: 2.6/include/linux/magic.h
===================================================================
--- 2.6.orig/include/linux/magic.h	2007-10-28 23:53:36.000000000 +0200
+++ 2.6/include/linux/magic.h	2007-10-28 23:54:16.000000000 +0200
@@ -34,7 +34,7 @@ #define REISERFS_SUPER_MAGIC 0x52654973
 #define REISERFS_SUPER_MAGIC_STRING	"ReIsErFs"
 #define REISER2FS_SUPER_MAGIC_STRING	"ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING	"ReIsEr3Fs"
-
+#define REVOKEFS_MAGIC		0x5245564B	/* REVK */
 #define SMB_SUPER_MAGIC		0x517B
 #define USBDEVICE_SUPER_MAGIC	0x9fa2
 #define CGROUP_SUPER_MAGIC	0x27e0eb