Subject: [RFC/PATCH] revokeat/frevoke system calls V3
From: Pekka Enberg

This patch implements the revokeat(2) and frevoke(2) system calls, which
can be used to invalidate open file descriptors to an inode. revokeat(2)
invalidates all open descriptors, whereas frevoke(2) invalidates all of
them except the open file the caller passed in, so the calling process
keeps its own descriptor. After successful revocation, operations on a
revoked file descriptor fail with the EBADF error code. Accesses to a
revoked shared mapping raise SIGSEGV.

The revocation is done in two passes: first we replace all struct file
pointers that point to the inode with NULL in the fd tables, to avoid
live-locking with a malicious process doing fork/dup while we are
scanning the open files. After that, in a second pass, we take down
shared mappings, invoke f_op->revoke, and finally close the file. The
file descriptor is not freed, to ensure it is not reused by open(2). If
either the mmap takedown or f_op->revoke fails, we restore the fds to
point to the file.

To ensure do_revoke does not race with users of fget_light/fput_light,
we delay closing of the files until fput_light is called. These bits
were taken from the forced unmount patch by Tigran Aivazian.

This patch also adds a generic_file_revoke function that can be used by
filesystems to support revokeat(2) and frevoke(2) for regular files. The
function synchronizes the file's in-core state with the disk to ensure no
I/O operations are in flight, and invalidates the inode's pages to ensure
pending reads fail. The ext2 filesystem is patched to support revoke in
this patch.

Thanks to Andrew Morton, Alan Cox, Ulrich Drepper and Edgar Toernig for
review comments!

Open issues:

  - Accessing a revoked shared mapping causes SIGSEGV although SIGBUS
    would be more appropriate
  - Needs to give ENXIO for devices as per BSD
  - Revoked files must support close(2)
  - Revoked shared mappings must support munmap(2)

Changes from V2 to V3:

  - Add ->revoke hook to file_operations so that we can support revoke
    for devices
  - Convert sys_revoke to sys_revokeat
  - Do not BUG_ON if someone expands the fd tables after we have
    allocated the revoke table.
  - Document the fact that accesses to revoked shared mappings raise
    SIGSEGV.

Changes from V1 to V2:

  - No kmalloc under tasklist_lock
  - Keep fget_light/fput_light locking in sys_read and sys_write

Signed-off-by: Pekka Enberg
---

Review notes (not part of the original posting):

  - do_revoke: reset to_close after kfree() on the retry path.  A pending
    signal at the retry label jumps to out, which used to free the same
    table a second time.
  - revoke_mmap: check the result of get_task_mm() before using it.
  - restore_files: only set close-on-exec again if it was set before the
    fd was revoked (recorded in struct revoked_file by revoke_fds); drop
    the always-false NULL test on an array element.
  - cleanup_files: if closing an entry fails, do not pass that entry to
    restore_files; its task reference has already been dropped and the
    file has already been closed.
  - The header names of the #include lines (fs/revoke.c and
    include/linux/file.h) were missing from the copy this was taken from.
    The names below are inferred from the symbols used and must be
    confirmed against the original posting, as must the whitespace of
    the context lines.

 arch/i386/kernel/syscall_table.S |    2 
 fs/Makefile                      |    2 
 fs/ext2/file.c                   |    1 
 fs/file_table.c                  |    1 
 fs/revoke.c                      |  413 +++++++++++++++++++++++++++++++++++++++
 include/asm-i386/unistd.h        |    4 
 include/linux/file.h             |   14 +
 include/linux/fs.h               |    6 
 include/linux/syscalls.h         |    3 
 9 files changed, 444 insertions(+), 2 deletions(-)

Index: 2.6/arch/i386/kernel/syscall_table.S
===================================================================
--- 2.6.orig/arch/i386/kernel/syscall_table.S
+++ 2.6/arch/i386/kernel/syscall_table.S
@@ -317,3 +317,5 @@ ENTRY(sys_call_table)
 	.long sys_tee			/* 315 */
 	.long sys_vmsplice
 	.long sys_move_pages
+	.long sys_revokeat
+	.long sys_frevoke
Index: 2.6/fs/Makefile
===================================================================
--- 2.6.orig/fs/Makefile
+++ 2.6/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
-		ioprio.o pnode.o drop_caches.o splice.o sync.o
+		ioprio.o pnode.o drop_caches.o splice.o sync.o revoke.o
 
 obj-$(CONFIG_INOTIFY)		+= inotify.o
 obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
Index: 2.6/fs/revoke.c
===================================================================
--- /dev/null
+++ 2.6/fs/revoke.c
@@ -0,0 +1,413 @@
+/*
+ * fs/revoke.c - Invalidate all current open file descriptors of an inode.
+ *
+ * Copyright (C) 2006 Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+/*
+ * Auxiliary struct for keeping track of revoked files.
+ */
+struct revoked_file {
+	unsigned int fd;
+	struct file *file;
+	struct task_struct *owner;
+	int close_on_exec;	/* flag state before the fd was revoked */
+};
+
+/*
+ * Returns non-zero if file refers to inode and is not the excluded file.
+ */
+static inline int inode_matches(struct file *file, struct inode *inode,
+				struct file *to_exclude)
+{
+	return file && file != to_exclude && file->f_dentry->d_inode == inode;
+}
+
+/*
+ * Detach every fd of owner that refers to inode and record it in to_close,
+ * starting at index *nr_fds; *nr_fds is advanced past the recorded entries.
+ * Returns -EOPNOTSUPP if a matching file has no ->revoke and -ENOMEM if
+ * to_close already holds max_fds entries.
+ *
+ * LOCKING: task_lock(owner)
+ */
+static int revoke_fds(struct task_struct *owner,
+		      struct inode *inode,
+		      struct file *to_exclude,
+		      struct revoked_file *to_close,
+		      unsigned long *nr_fds,
+		      unsigned long max_fds)
+{
+	int err = 0;
+	unsigned long offset;
+	struct files_struct *files;
+	struct fdtable *fdt;
+	unsigned int fd;
+
+	files = get_files_struct(owner);
+	if (!files)
+		goto out;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+
+	offset = *nr_fds;
+
+	for (fd = 0; fd < fdt->max_fds; fd++) {
+		struct file *file;
+		struct revoked_file *revoked;
+
+		file = fcheck_files(files, fd);
+		if (!inode_matches(file, inode, to_exclude))
+			continue;
+
+		if (!file->f_op->revoke) {
+			err = -EOPNOTSUPP;
+			goto failed;
+		}
+
+		if (offset >= max_fds) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		revoked = &to_close[offset++];
+		revoked->fd = fd;
+		revoked->file = file;
+		revoked->owner = owner;
+		/* Saved so that restore_files can put the flag back. */
+		revoked->close_on_exec = FD_ISSET(fd, fdt->close_on_exec) != 0;
+
+		/*
+		 * Leak the fd so it is not reused. After this point, we don't
+		 * need to worry about racing with sys_close or sys_dup.
+		 */
+		rcu_assign_pointer(fdt->fd[fd], NULL);
+		FD_CLR(fd, fdt->close_on_exec);
+
+		/*
+		 * Hold on to task until we can take down the file and its
+		 * mmap.
+		 */
+		get_task_struct(owner);
+	}
+ failed:
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+	*nr_fds = offset;
+ out:
+	return err;
+}
+
+/*
+ * Unmap every shared mapping of the revoked file from the owner's mm.
+ */
+static int revoke_mmap(struct revoked_file *revoked)
+{
+	int err = 0;
+	struct mm_struct *mm;
+	struct vm_area_struct *this, *next;
+
+	mm = get_task_mm(revoked->owner);
+	/*
+	 * get_task_mm returns NULL for tasks without an address space; there
+	 * is nothing to unmap then.
+	 */
+	if (!mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Be careful, do_munmap removes the unmapped vma from mm->mmap list.
+	 */
+	this = mm->mmap;
+	while (this) {
+		next = this->vm_next;
+		if (this->vm_flags & VM_SHARED && this->vm_file == revoked->file) {
+			err = do_munmap(mm, this->vm_start,
+					this->vm_end - this->vm_start);
+			if (err)
+				break;
+		}
+		this = next;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return err;
+}
+
+/*
+ * Close the revoked file on behalf of its owner.
+ */
+static int close_files(struct revoked_file *revoked)
+{
+	int err = 0;
+	struct files_struct *files;
+
+	files = get_files_struct(revoked->owner);
+	if (files) {
+		err = filp_close(revoked->file, files);
+		put_files_struct(files);
+	}
+	return err;
+}
+
+/*
+ * Undo revoke_fds for nr_fds entries: reinstall the file pointers (and the
+ * close-on-exec flags) in the owners' fd tables and drop the task references.
+ */
+static void restore_files(struct revoked_file *to_restore, unsigned long nr_fds)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_fds; i++) {
+		struct revoked_file *this;
+		struct files_struct *files;
+
+		this = &to_restore[i];
+		files = get_files_struct(this->owner);
+		if (files) {
+			struct fdtable *fdt;
+
+			spin_lock(&files->file_lock);
+			fdt = files_fdtable(files);
+			rcu_assign_pointer(fdt->fd[this->fd], this->file);
+			if (this->close_on_exec)
+				FD_SET(this->fd, fdt->close_on_exec);
+			spin_unlock(&files->file_lock);
+			put_files_struct(files);
+		}
+
+		put_task_struct(this->owner);
+	}
+}
+
+/*
+ * Second pass: for each entry take down the shared mappings, call ->revoke
+ * and close the file.  On failure the entries not yet closed are restored.
+ */
+static int cleanup_files(struct revoked_file *to_cleanup, unsigned long nr_fds)
+{
+	int err = 0;
+	unsigned long i;
+
+	for (i = 0; i < nr_fds; i++) {
+		struct revoked_file *this;
+		struct file *file;
+
+		this = &to_cleanup[i];
+
+		err = revoke_mmap(this);
+		if (err)
+			break;
+
+		file = this->file;
+
+		err = file->f_op->revoke(file);
+		if (err)
+			break;
+
+		err = close_files(this);
+
+		put_task_struct(this->owner);
+		if (err) {
+			/*
+			 * The file reference and the task reference of this
+			 * entry are gone already, so only the remaining
+			 * entries may be handed to restore_files.
+			 */
+			i++;
+			break;
+		}
+	}
+	if (err)
+		restore_files(&to_cleanup[i], nr_fds-i);
+
+	return err;
+}
+
+/*
+ * Returns the maximum number of fds pointing to inode.
+ *
+ * LOCKING: read_lock(&tasklist_lock)
+ */
+static unsigned long inode_fds(struct inode *inode, struct file *to_exclude)
+{
+	struct task_struct *g, *p;
+	unsigned long nr_fds = 0;
+
+	do_each_thread(g, p) {
+		struct files_struct *files;
+		struct fdtable *fdt;
+		unsigned int fd;
+
+		files = get_files_struct(p);
+		if (!files)
+			continue;
+
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+		for (fd = 0; fd < fdt->max_fds; fd++) {
+			struct file *file;
+
+			file = fcheck_files(files, fd);
+			if (inode_matches(file, inode, to_exclude)) {
+				nr_fds += fdt->max_fds;
+				break;
+			}
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	} while_each_thread(g, p);
+	return nr_fds;
+}
+
+/*
+ * Only allocate memory for those threads that actually have an fd
+ * pointing to the inode.
+ */
+static struct revoked_file *alloc_revoke_table(struct inode *inode,
+					       struct file *to_exclude,
+					       unsigned long *nr_fds)
+{
+	read_lock(&tasklist_lock);
+	*nr_fds = inode_fds(inode, to_exclude);
+	read_unlock(&tasklist_lock);
+
+	return kcalloc(*nr_fds, sizeof(struct revoked_file), GFP_KERNEL);
+}
+
+/*
+ * Revoke all open files of inode except to_exclude.  The caller must own the
+ * inode or have CAP_FOWNER.
+ */
+static int do_revoke(struct inode *inode, struct file *to_exclude)
+{
+	int err = 0;
+	unsigned long nr_fds, max_fds;
+	struct revoked_file *to_close = NULL;
+	struct task_struct *g, *p;
+
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
+		err = -EPERM;
+		goto out;
+	}
+
+ retry:
+	if (signal_pending(current)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	to_close = alloc_revoke_table(inode, to_exclude, &max_fds);
+	if (!to_close) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	read_lock(&tasklist_lock);
+
+	/*
+	 * If someone forked while we were allocating memory, try again.
+	 */
+	if (inode_fds(inode, to_exclude) > max_fds) {
+		read_unlock(&tasklist_lock);
+		kfree(to_close);
+		/* Do not let the out path free it a second time. */
+		to_close = NULL;
+		goto retry;
+	}
+
+	/*
+	 * First revoke the fds. After we are done, no one can start new
+	 * operations on them.
+	 */
+	nr_fds = 0;
+	do_each_thread(g, p) {
+		err = revoke_fds(p, inode, to_exclude, to_close,
+				 &nr_fds, max_fds);
+		if (err)
+			goto exit_loop;
+	} while_each_thread(g, p);
+ exit_loop:
+	read_unlock(&tasklist_lock);
+
+	if (err) {
+		restore_files(to_close, nr_fds);
+		goto out;
+	}
+
+	/*
+	 * Now, take down the mmaps and close the files for good.
+	 */
+	err = cleanup_files(to_close, nr_fds);
+ out:
+	kfree(to_close);
+	return err;
+}
+
+/*
+ * Revoke all open file descriptors of the file named by dfd and filename.
+ */
+asmlinkage int sys_revokeat(int dfd, const char __user *filename)
+{
+	int err;
+	struct nameidata nd;
+
+	err = __user_walk_fd(dfd, filename, 0, &nd);
+	if (!err) {
+		err = do_revoke(nd.dentry->d_inode, NULL);
+		path_release(&nd);
+	}
+	return err;
+}
+
+/*
+ * Revoke all open file descriptors of the inode behind fd, except for the
+ * open file that fd itself refers to.
+ */
+asmlinkage int sys_frevoke(unsigned int fd)
+{
+	struct file *file = fget(fd);
+	int err = -EBADF;
+
+	if (file) {
+		err = do_revoke(file->f_dentry->d_inode, file);
+		fput(file);
+	}
+	return err;
+}
+
+/*
+ * ->revoke implementation for regular files: sync the file and invalidate
+ * the pages of its mapping.
+ */
+int generic_file_revoke(struct file *file)
+{
+	int err;
+
+	/*
+	 * Flush pending writes.
+	 */
+	err = do_fsync(file, 1);
+	if (err)
+		goto out;
+
+	/*
+	 * Make pending reads fail.
+	 */
+	err = invalidate_inode_pages2(file->f_mapping);
+
+ out:
+	return err;
+}
+EXPORT_SYMBOL(generic_file_revoke);
Index: 2.6/include/asm-i386/unistd.h
===================================================================
--- 2.6.orig/include/asm-i386/unistd.h
+++ 2.6/include/asm-i386/unistd.h
@@ -323,10 +323,12 @@
 #define __NR_tee		315
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
+#define __NR_revokeat		318
+#define __NR_frevoke		319
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 318
+#define NR_syscalls 320
 
 /*
  * user-visible error numbers are in the range -1 - -128: see
Index: 2.6/include/linux/syscalls.h
===================================================================
--- 2.6.orig/include/linux/syscalls.h
+++ 2.6/include/linux/syscalls.h
@@ -597,4 +597,7 @@ asmlinkage long sys_get_robust_list(int 
 asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
 				    size_t len);
 
+asmlinkage int sys_revokeat(int dfd, const char __user *filename);
+asmlinkage int sys_frevoke(unsigned int fd);
+
 #endif
Index: 2.6/fs/file_table.c
===================================================================
--- 2.6.orig/fs/file_table.c
+++ 2.6/fs/file_table.c
@@ -218,6 +218,7 @@ struct file fastcall *fget_light(unsigne
 	*fput_needed = 0;
 	if (likely((atomic_read(&files->count) == 1))) {
 		file = fcheck_files(files, fd);
+		set_f_light(file);
 	} else {
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
Index: 2.6/include/linux/file.h
===================================================================
--- 2.6.orig/include/linux/file.h
+++ 2.6/include/linux/file.h
@@ -6,6 +6,7 @@
 #define __LINUX_FILE_H
 
 #include <asm/atomic.h>
+#include <linux/fs.h>
 #include <linux/posix_types.h>
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
@@ -67,10 +68,23 @@ struct files_struct {
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 
+static inline void clear_f_light(struct file *file)
+{
+	file->f_light = 0;
+}
+
+static inline void set_f_light(struct file *file)
+{
+	if (file)
+		file->f_light = 1;
+}
+
 static inline void fput_light(struct file *file, int fput_needed)
 {
 	if (unlikely(fput_needed))
 		fput(file);
+	else
+		clear_f_light(file);
 }
 
 extern struct file * FASTCALL(fget(unsigned int fd));
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h
+++ 2.6/include/linux/fs.h
@@ -698,6 +698,8 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+	/* This instance is being used without holding a reference. */
+	int f_light;
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
@@ -1080,6 +1082,7 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*revoke)(struct file *);
 };
 
 struct inode_operations {
@@ -1670,6 +1673,9 @@ extern ssize_t generic_splice_sendpage(s
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		size_t len, unsigned int flags);
 
+/* fs/revoke.c */
+extern int generic_file_revoke(struct file *);
+
 extern void file_ra_state_init(struct file_ra_state *ra,
 			struct address_space *mapping);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
Index: 2.6/fs/ext2/file.c
===================================================================
--- 2.6.orig/fs/ext2/file.c
+++ 2.6/fs/ext2/file.c
@@ -55,6 +55,7 @@ const struct file_operations ext2_file_o
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.revoke		= generic_file_revoke,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP