Subject: [RFC/PATCH] revokeat/frevoke system calls V4

From: Pekka Enberg

Add revokeat(2) and frevoke(2) system calls that invalidate open file
descriptors of an inode. The former call invalidates all open descriptors,
whereas the latter keeps the descriptors of the current process intact. After
a successful revocation, operations on the revoked fds fail with the EBADF
error code, except for close(2), which succeeds. Accesses to a revoked shared
mapping raise SIGSEGV.

The implementation is heavily influenced by Tigran Aivazian's "forced unmount"
patches at:

  http://developer.osdl.org/dev/fumount/kernel2/patches/2.6.12/1/forced-unmount-2.6.12-1.patch

Open issues:

  - Accessing a revoked shared mapping causes SIGSEGV although SIGBUS would be
    more appropriate
  - Needs to give ENXIO for devices as per BSD
  - Revoked shared mappings must support munmap(2)

Changes from V3 to V4:

  - Replace revoked descriptors with a pointer to a file in a pseudo-filesystem
  - You can now close(2) revoked descriptors

Changes from V2 to V3:

  - Add ->revoke hook to file_operations so that we can support revoke for
    devices
  - Convert sys_revoke to sys_revokeat
  - Do not BUG_ON if someone expands the fd tables after we have allocated the
    revoke table.
  - Document the fact that accesses to revoked shared mappings raise SIGSEGV.

Changes from V1 to V2:

  - No kmalloc under tasklist_lock
  - Keep fget_light/fput_light locking in sys_read and sys_write

Thanks to Andrew Morton, Alan Cox, Ulrich Drepper and Edgar Toernig for their
review comments.
Signed-off-by: Pekka Enberg
---

 arch/i386/kernel/syscall_table.S |    3
 fs/Makefile                      |    2
 fs/ext2/file.c                   |    1
 fs/file_table.c                  |    1
 fs/revoke.c                      |  756 +++++++++++++++++++++++++++++++++++++++
 include/asm-i386/unistd.h        |    5
 include/linux/file.h             |   14
 include/linux/fs.h               |    6
 include/linux/syscalls.h         |    3
 9 files changed, 789 insertions(+), 2 deletions(-)

Index: 2.6/arch/i386/kernel/syscall_table.S
===================================================================
--- 2.6.orig/arch/i386/kernel/syscall_table.S
+++ 2.6/arch/i386/kernel/syscall_table.S
@@ -318,3 +318,6 @@ ENTRY(sys_call_table)
 	.long sys_vmsplice
 	.long sys_move_pages
 	.long sys_getcpu
+	.long sys_revokeat
+	.long sys_frevoke		/* 320 */
+
Index: 2.6/fs/Makefile
===================================================================
--- 2.6.orig/fs/Makefile
+++ 2.6/fs/Makefile
@@ -10,7 +10,7 @@ obj-y :=	open.o read_write.o file_table.
 		ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
-		pnode.o drop_caches.o splice.o sync.o utimes.o
+		pnode.o drop_caches.o splice.o sync.o utimes.o revoke.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
Index: 2.6/fs/revoke.c
===================================================================
--- /dev/null
+++ 2.6/fs/revoke.c
@@ -0,0 +1,756 @@
+/*
+ * fs/revoke.c - Invalidate all current open file descriptors of an inode.
+ *
+ * Copyright (C) 2006 Pekka Enberg
+ *
+ * This file is released under the GPLv2.
+ */
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/sched.h>
+
+/*
+ * Used for pre-allocating struct files so that we don't need to do kmalloc()
+ * under tasklist_lock.
+ *
+ * files[0 .. nr_revoked) have been handed over to fd tables for good,
+ * files[nr_revoked .. nr_used) are installed in fd tables but can still be
+ * rolled back, and files[nr_used .. nr_fds) are unused spares.
+ */
+struct revoke_table {
+	unsigned long nr_revoked;	/* Number of revoked files */
+	unsigned long nr_used;		/* Number of used files from the
+					   pre-allocated set */
+	unsigned long nr_fds;		/* Number of pre-allocated files */
+	struct file **files;
+};
+
+struct kmem_cache *revokefs_inode_cache;
+
+/*
+ * Revoked file descriptors point to files in the revokefs filesystem.
+ */
+static struct vfsmount *revokefs_mnt;
+
+struct revokefs_inode_info {
+	struct task_struct *owner;	/* Task whose fd was replaced */
+	struct file *file;		/* Original file behind the fd */
+	unsigned int fd;		/* Slot in the owner's fd table */
+	struct inode vfs_inode;
+};
+
+static inline struct revokefs_inode_info *REVOKEFS_I(struct inode *inode)
+{
+	return container_of(inode, struct revokefs_inode_info, vfs_inode);
+}
+
+static int return_EBADF(void)
+{
+	return -EBADF;
+}
+
+#define EBADF_ERROR ((void *) (return_EBADF))
+
+static int return_ZERO(void)
+{
+	return 0;
+}
+
+#define SUCCESS ((void *) (return_ZERO))
+
+static const struct file_operations revoked_file_ops = {
+	.llseek		= EBADF_ERROR,
+	.aio_read	= EBADF_ERROR,
+	.read		= EBADF_ERROR,
+	.write		= EBADF_ERROR,
+	.aio_write	= EBADF_ERROR,
+	.readdir	= EBADF_ERROR,
+	.poll		= EBADF_ERROR,
+	.ioctl		= EBADF_ERROR,
+	.mmap		= EBADF_ERROR,
+	.open		= EBADF_ERROR,
+	.flush		= SUCCESS,	/* sys_close must succeed */
+	.release	= EBADF_ERROR,
+	.fsync		= EBADF_ERROR,
+	.aio_fsync	= EBADF_ERROR,
+	.fasync		= EBADF_ERROR,
+	.lock		= EBADF_ERROR,
+	.sendfile	= EBADF_ERROR,
+	.sendpage	= EBADF_ERROR,
+	.get_unmapped_area = EBADF_ERROR,
+};
+
+static struct inode_operations revoked_inode_ops = {
+	.create		= EBADF_ERROR,
+	.lookup		= EBADF_ERROR,
+	.link		= EBADF_ERROR,
+	.unlink		= EBADF_ERROR,
+	.symlink	= EBADF_ERROR,
+	.mkdir		= EBADF_ERROR,
+	.rmdir		= EBADF_ERROR,
+	.mknod		= EBADF_ERROR,
+	.rename		= EBADF_ERROR,
+	.readlink	= EBADF_ERROR,
+	/* follow_link must be no-op, otherwise unmounting this inode
+	   won't work */
+	.truncate	= EBADF_ERROR,
+	.permission	= EBADF_ERROR,
+	.getattr	= EBADF_ERROR,
+	.setattr	= EBADF_ERROR,
+	.setxattr	= EBADF_ERROR,
+	.getxattr	= EBADF_ERROR,
+	.listxattr	= EBADF_ERROR,
+	.removexattr	= EBADF_ERROR,
+};
+
+static void make_revoked_inode(struct inode *inode)
+{
+	remove_inode_hash(inode);
+
+	inode->i_mode = S_IFREG;
+	inode->i_atime = inode->i_mtime = inode->i_ctime =
+		current_fs_time(inode->i_sb);
+	inode->i_op = &revoked_inode_ops;
+	inode->i_fop = &revoked_file_ops;
+}
+
+static struct inode *revoked_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		make_revoked_inode(inode);
+		inode->i_mode |= mode;	/* keep the S_IFREG type bits */
+	}
+	return inode;
+}
+
+/*
+ * Allocate a struct file that lives in revokefs.  Every file operation on it
+ * fails with EBADF, except for flush so that close(2) succeeds.
+ *
+ * Returns NULL if any of the allocations fail.
+ */
+static struct file *get_revoked_file(void)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *filp;
+	struct qstr name;
+
+	filp = get_empty_filp();
+	if (!filp)
+		goto err;
+
+	inode = revoked_get_inode(revokefs_mnt->mnt_sb, 0755);
+	if (!inode)
+		goto err_inode;
+
+	name.name = "revoked_file";
+	name.len = strlen(name.name);
+	name.hash = 0;
+	dentry = d_alloc(revokefs_mnt->mnt_sb->s_root, &name);
+	if (!dentry)
+		goto err_dentry;
+
+	d_instantiate(dentry, inode);
+
+	/*
+	 * The file takes over the reference returned by d_alloc(); an
+	 * extra dget() here would never be dropped.
+	 */
+	filp->f_mapping = inode->i_mapping;
+	filp->f_dentry = dentry;
+	filp->f_vfsmnt = mntget(revokefs_mnt);
+	filp->f_op = fops_get(inode->i_fop);
+	filp->f_pos = 0;
+
+	return filp;
+
+ err_dentry:
+	iput(inode);
+ err_inode:
+	/* No dentry attached yet, so fput() must not be used. */
+	put_filp(filp);
+ err:
+	return NULL;
+}
+
+static inline int inode_matches(struct file *file, struct inode *inode,
+				struct file *to_exclude)
+{
+	return file && file != to_exclude && file->f_dentry->d_inode == inode;
+}
+
+static inline bool revoke_table_is_full(struct revoke_table *table)
+{
+	return table->nr_used == table->nr_fds;
+}
+
+static inline struct file *revoke_table_get(struct revoke_table *table)
+{
+	return table->files[table->nr_used++];
+}
+
+/*
+ * Replace every fd of @owner that refers to @inode (other than @to_exclude)
+ * with a pre-allocated revoked file from @table.  The original file and the
+ * fd slot are remembered in the revoked file's inode so that the change can
+ * be completed by cleanup_files() or undone by restore_files().
+ *
+ * Returns -EOPNOTSUPP if a matching file has no ->revoke operation and
+ * -ENOMEM if @table runs out of pre-allocated files.
+ *
+ * LOCKING: read_lock(&tasklist_lock)
+ */
+static int revoke_fds(struct task_struct *owner,
+		      struct inode *inode,
+		      struct file *to_exclude,
+		      struct revoke_table *table)
+{
+	struct files_struct *files;
+	struct fdtable *fdt;
+	unsigned int fd;
+	int err = 0;
+
+	files = get_files_struct(owner);
+	if (!files)
+		goto out;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+
+	for (fd = 0; fd < fdt->max_fds; fd++) {
+		struct revokefs_inode_info *info;
+		struct file *filp, *new_filp;
+
+		filp = fcheck_files(files, fd);
+		if (!inode_matches(filp, inode, to_exclude))
+			continue;
+
+		if (!filp->f_op || !filp->f_op->revoke) {
+			err = -EOPNOTSUPP;
+			goto failed;
+		}
+
+		if (revoke_table_is_full(table)) {
+			err = -ENOMEM;
+			goto failed;
+		}
+
+		new_filp = revoke_table_get(table);
+
+		/*
+		 * The fd table gets a reference of its own; the revoke table
+		 * keeps the initial one until free_revoke_table().
+		 */
+		get_file(new_filp);
+
+		/*
+		 * Replace original struct file pointer with a pointer to
+		 * a 'revoked file.'  After this point, we don't need to worry
+		 * about racing with sys_close or sys_dup.
+		 */
+		rcu_assign_pointer(fdt->fd[fd], new_filp);
+
+		/*
+		 * Hold on to task until we can take down the file and its
+		 * mmap.
+		 */
+		get_task_struct(owner);
+
+		info = REVOKEFS_I(new_filp->f_dentry->d_inode);
+		info->fd = fd;
+		info->file = filp;
+		info->owner = owner;
+	}
+ failed:
+	spin_unlock(&files->file_lock);
+	put_files_struct(files);
+ out:
+	return err;
+}
+
+/*
+ * Unmap every shared mapping of the revoked file from the owner's address
+ * space.  A task without an mm has nothing to unmap.
+ */
+static int revoke_mmap(struct revokefs_inode_info *revoked)
+{
+	struct vm_area_struct *this, *next;
+	struct mm_struct *mm;
+	int err = 0;
+
+	mm = get_task_mm(revoked->owner);
+	if (!mm)
+		return 0;
+
+	down_write(&mm->mmap_sem);
+
+	/*
+	 * Be careful, do_munmap removes the unmapped vma from mm->mmap list.
+	 */
+	this = mm->mmap;
+	while (this) {
+		next = this->vm_next;
+		if ((this->vm_flags & VM_SHARED) &&
+		    this->vm_file == revoked->file) {
+			err = do_munmap(mm, this->vm_start,
+					this->vm_end - this->vm_start);
+			if (err)
+				break;
+		}
+		this = next;
+	}
+	up_write(&mm->mmap_sem);
+	mmput(mm);
+	return err;
+}
+
+/*
+ * Close the original file on behalf of its owner once no fget_light() user
+ * is in flight any more.  filp_close() drops the file even when it returns
+ * an error.
+ */
+static int close_files(struct revokefs_inode_info *info)
+{
+	struct files_struct *files;
+	int err;
+
+	files = get_files_struct(info->owner);
+	if (!files) {
+		/*
+		 * The owner has no fd table left, so nobody else is going
+		 * to drop the reference its fd slot used to hold.
+		 */
+		fput(info->file);
+		return 0;
+	}
+
+	while (info->file->f_light)
+		schedule();
+	err = filp_close(info->file, files);
+	put_files_struct(files);
+	return err;
+}
+
+/*
+ * Undo revoke_fds() for one fd.  If the revoked file is still installed in
+ * the owner's fd table, the original file is put back in its place;
+ * otherwise the owner got rid of the fd in the meantime and the original
+ * file is released on its behalf.  Also drops the task reference taken by
+ * revoke_fds().
+ */
+static void restore_file(struct file *to_restore)
+{
+	struct revokefs_inode_info *info;
+	struct files_struct *files;
+	int restored = 0;
+
+	info = REVOKEFS_I(to_restore->f_dentry->d_inode);
+	files = get_files_struct(info->owner);
+	if (files) {
+		struct fdtable *fdt;
+
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+
+		if (fdt->fd[info->fd] == to_restore) {
+			rcu_assign_pointer(fdt->fd[info->fd], info->file);
+			/* NOTE(review): forces close-on-exec on the fd. */
+			FD_SET(info->fd, fdt->close_on_exec);
+			restored = 1;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+
+	/*
+	 * fput() may sleep, so it must not be called under file_lock.  The
+	 * revoke table still holds its own reference to @to_restore, which
+	 * keeps @info valid here.
+	 */
+	if (restored)
+		fput(to_restore);
+	else
+		fput(info->file);
+	put_task_struct(info->owner);
+}
+
+/*
+ * Roll back the fds that were replaced by revoke_fds() but whose original
+ * files have not been closed yet.
+ */
+static void restore_files(struct revoke_table *table)
+{
+	unsigned long i;
+
+	/*
+	 * Only restore those that have not been permanently revoked.  Files
+	 * past nr_used were never installed and carry no owner information.
+	 */
+	for (i = table->nr_revoked; i < table->nr_used; i++)
+		restore_file(table->files[i]);
+}
+
+/*
+ * Second phase of revocation: for each replaced fd, unmap the shared
+ * mappings of the original file, call its ->revoke operation and close it.
+ * If a step fails, the fds that have not been dealt with are restored.
+ */
+static int cleanup_files(struct revoke_table *table)
+{
+	unsigned long i;
+	int err = 0;
+
+	for (i = 0; i < table->nr_used; i++) {
+		struct revokefs_inode_info *info;
+		struct file *this, *file;
+
+		this = table->files[i];
+		info = REVOKEFS_I(this->f_dentry->d_inode);
+
+		err = revoke_mmap(info);
+		if (err)
+			break;
+
+		file = info->file;
+
+		err = file->f_op->revoke(file);
+		if (err)
+			break;
+
+		err = close_files(info);
+		put_task_struct(info->owner);
+
+		/*
+		 * The original file is gone even if closing it reported an
+		 * error, so it must never be restored.
+		 */
+		table->nr_revoked++;
+		if (err)
+			break;
+	}
+	if (err)
+		restore_files(table);
+
+	return err;
+}
+
+/*
+ * Returns the maximum number of fds pointing to inode.
+ *
+ * LOCKING: read_lock(&tasklist_lock)
+ */
+static unsigned long inode_fds(struct inode *inode, struct file *to_exclude)
+{
+	struct task_struct *g, *p;
+	unsigned long nr_fds = 0;
+
+	do_each_thread(g, p) {
+		struct files_struct *files;
+		struct fdtable *fdt;
+		unsigned int fd;
+
+		files = get_files_struct(p);
+		if (!files)
+			continue;
+
+		spin_lock(&files->file_lock);
+		fdt = files_fdtable(files);
+		for (fd = 0; fd < fdt->max_fds; fd++) {
+			struct file *file;
+
+			file = fcheck_files(files, fd);
+			if (inode_matches(file, inode, to_exclude)) {
+				nr_fds += fdt->max_fds;
+				break;
+			}
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	} while_each_thread(g, p);
+	return nr_fds;
+}
+
+/*
+ * Release the table's reference to each pre-allocated file and free the
+ * table itself.  Accepts NULL.
+ */
+static void free_revoke_table(struct revoke_table *table)
+{
+	unsigned long i;
+
+	if (!table)
+		return;
+
+	/* The files come from get_empty_filp(), so fput() them, not kfree(). */
+	for (i = 0; i < table->nr_fds; i++) {
+		if (table->files[i])
+			fput(table->files[i]);
+	}
+	kfree(table->files);
+	kfree(table);
+}
+
+static inline void revoke_table_put(struct revoke_table *table,
+				    struct file *filp, unsigned long idx)
+{
+	table->files[idx] = filp;
+}
+
+/*
+ * Allocate a revoke table with @nr_fds pre-allocated revoked files.
+ * Returns NULL on allocation failure.
+ */
+static struct revoke_table *__alloc_revoke_table(unsigned long nr_fds)
+{
+	struct revoke_table *table;
+	unsigned long i;
+
+	table = kzalloc(sizeof *table, GFP_KERNEL);
+	if (!table)
+		return NULL;
+
+	table->nr_fds = nr_fds;
+
+	table->files = kcalloc(nr_fds, sizeof(struct file *), GFP_KERNEL);
+	if (!table->files) {
+		kfree(table);
+		return NULL;
+	}
+
+	for (i = 0; i < table->nr_fds; i++) {
+		struct file *filp;
+
+		filp = get_revoked_file();
+		if (!filp)
+			goto err;
+
+		revoke_table_put(table, filp, i);
+	}
+	return table;
+ err:
+	free_revoke_table(table);
+	return NULL;
+}
+
+/*
+ * Only allocate memory for those threads that actually have an fd
+ * pointing to the inode.
+ */
+static struct revoke_table *alloc_revoke_table(struct inode *inode,
+					       struct file *to_exclude)
+{
+	unsigned long nr_fds;
+
+	read_lock(&tasklist_lock);
+	nr_fds = inode_fds(inode, to_exclude);
+	read_unlock(&tasklist_lock);
+
+	return __alloc_revoke_table(nr_fds);
+}
+
+/*
+ * Revoke every open file of @inode except @to_exclude (which may be NULL).
+ * Only the owner of the inode or a CAP_FOWNER capable task may do this.
+ */
+static int do_revoke(struct inode *inode, struct file *to_exclude)
+{
+	struct revoke_table *table = NULL;
+	struct task_struct *g, *p;
+	int err = 0;
+
+	if (current->fsuid != inode->i_uid && !capable(CAP_FOWNER)) {
+		err = -EPERM;
+		goto out;
+	}
+
+ retry:
+	if (signal_pending(current)) {
+		err = -ERESTARTSYS;
+		goto out;
+	}
+
+	table = alloc_revoke_table(inode, to_exclude);
+	if (!table) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	read_lock(&tasklist_lock);
+
+	/*
+	 * If someone forked while we were allocating memory, try again.
+	 */
+	if (inode_fds(inode, to_exclude) > table->nr_fds) {
+		read_unlock(&tasklist_lock);
+		free_revoke_table(table);
+		table = NULL;
+		goto retry;
+	}
+
+	/*
+	 * First revoke the fds.  After we are done, no one can start new
+	 * operations on them.
+	 */
+	do_each_thread(g, p) {
+		err = revoke_fds(p, inode, to_exclude, table);
+		if (err)
+			goto exit_loop;
+	} while_each_thread(g, p);
+ exit_loop:
+	read_unlock(&tasklist_lock);
+
+	if (err) {
+		restore_files(table);
+		goto out;
+	}
+
+	/*
+	 * Now, take down the mmaps and close the files for good.
+	 */
+	err = cleanup_files(table);
+ out:
+	free_revoke_table(table);
+	return err;
+}
+
+asmlinkage int sys_revokeat(int dfd, const char __user *filename)
+{
+	struct nameidata nd;
+	int err;
+
+	err = __user_walk_fd(dfd, filename, 0, &nd);
+	if (!err) {
+		err = do_revoke(nd.dentry->d_inode, NULL);
+		path_release(&nd);
+	}
+	return err;
+}
+
+asmlinkage int sys_frevoke(unsigned int fd)
+{
+	struct file *file = fget(fd);
+	int err = -EBADF;
+
+	if (file) {
+		err = do_revoke(file->f_dentry->d_inode, file);
+		fput(file);
+	}
+	return err;
+}
+
+int generic_file_revoke(struct file *file)
+{
+	int err;
+
+	/*
+	 * Flush pending writes.
+	 */
+	err = do_fsync(file, 1);
+	if (err)
+		goto out;
+
+	/*
+	 * Make pending reads fail.
+	 */
+	err = invalidate_inode_pages2(file->f_mapping);
+
+ out:
+	return err;
+}
+EXPORT_SYMBOL(generic_file_revoke);
+
+/*
+ * Filesystem for revoked files.
+ */
+
+static struct inode *revokefs_alloc_inode(struct super_block *sb)
+{
+	struct revokefs_inode_info *ri;
+
+	ri = kmem_cache_alloc(revokefs_inode_cache, GFP_NOFS);
+	if (!ri)
+		return NULL;
+
+	/* Slab objects are recycled; never expose stale revoke state. */
+	ri->owner = NULL;
+	ri->file = NULL;
+	ri->fd = 0;
+
+	return &ri->vfs_inode;
+}
+
+static void revokefs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(revokefs_inode_cache, REVOKEFS_I(inode));
+}
+
+#define REVOKEFS_MAGIC 0x5245564B	/* REVK */
+
+static struct super_operations revokefs_super_ops = {
+	.alloc_inode	= revokefs_alloc_inode,
+	.destroy_inode	= revokefs_destroy_inode,
+	.drop_inode	= generic_delete_inode,
+};
+
+static int revokefs_get_sb(struct file_system_type *fs_type,
+			   int flags, const char *dev_name, void *data,
+			   struct vfsmount *mnt)
+{
+	return get_sb_pseudo(fs_type, "revoke:", &revokefs_super_ops,
+			     REVOKEFS_MAGIC, mnt);
+}
+
+struct file_system_type revokefs_fs_type = {
+	.name		= "revokefs",
+	.get_sb		= revokefs_get_sb,
+	.kill_sb	= kill_anon_super
+};
+
+static void init_once(void *obj, struct kmem_cache *cache, unsigned long flags)
+{
+	struct revokefs_inode_info *ri = obj;
+
+	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+	    SLAB_CTOR_CONSTRUCTOR) {
+		inode_init_once(&ri->vfs_inode);
+	}
+}
+
+static int __init revokefs_init(void)
+{
+	int err = -ENOMEM;
+
+	revokefs_inode_cache =
+		kmem_cache_create("revokefs_inode_cache",
+				  sizeof(struct revokefs_inode_info),
+				  0,
+				  (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+				  init_once, NULL);
+	if (!revokefs_inode_cache)
+		goto out;
+
+	err = register_filesystem(&revokefs_fs_type);
+	if (err)
+		goto err_register;
+
+	revokefs_mnt = kern_mount(&revokefs_fs_type);
+	if (IS_ERR(revokefs_mnt)) {
+		err = PTR_ERR(revokefs_mnt);
+		goto err_mnt;
+	}
+ out:
+	return err;
+ err_mnt:
+	unregister_filesystem(&revokefs_fs_type);
+ err_register:
+	kmem_cache_destroy(revokefs_inode_cache);
+	return err;
+}
+late_initcall(revokefs_init);
Index: 2.6/include/asm-i386/unistd.h
===================================================================
--- 2.6.orig/include/asm-i386/unistd.h
+++ 2.6/include/asm-i386/unistd.h
@@ -324,10 +324,13 @@
 #define __NR_vmsplice		316
 #define __NR_move_pages		317
 #define __NR_getcpu		318
+#define __NR_revokeat		319
+#define __NR_frevoke		320
+
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 319
+#define NR_syscalls 321
 #include <linux/err.h>
 
 /*
Index: 2.6/include/linux/syscalls.h
===================================================================
--- 2.6.orig/include/linux/syscalls.h
+++ 2.6/include/linux/syscalls.h
@@ -601,4 +601,7 @@ asmlinkage long sys_getcpu(unsigned __us
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage int sys_revokeat(int dfd, const char __user *filename);
+asmlinkage int sys_frevoke(unsigned int fd);
+
 #endif
Index: 2.6/fs/file_table.c
===================================================================
--- 2.6.orig/fs/file_table.c
+++ 2.6/fs/file_table.c
@@ -219,6 +219,7 @@ struct file fastcall *fget_light(unsigne
 	*fput_needed = 0;
 	if (likely((atomic_read(&files->count) == 1))) {
 		file = fcheck_files(files, fd);
+		set_f_light(file);
 	} else {
 		rcu_read_lock();
 		file = fcheck_files(files, fd);
Index: 2.6/include/linux/file.h
===================================================================
--- 2.6.orig/include/linux/file.h
+++ 2.6/include/linux/file.h
@@ -6,6 +6,7 @@
 #define __LINUX_FILE_H
 
 #include <asm/atomic.h>
+#include <linux/fs.h>
 #include <linux/posix_types.h>
 #include <linux/compiler.h>
 #include <linux/spinlock.h>
@@ -67,10 +68,23 @@ struct files_struct {
 extern void FASTCALL(__fput(struct file *));
 extern void FASTCALL(fput(struct file *));
 
+static inline void clear_f_light(struct file *file)
+{
+	file->f_light = 0;
+}
+
+static inline void set_f_light(struct file *file)
+{
+	if (file)
+		file->f_light = 1;
+}
+
 static inline void fput_light(struct file *file, int fput_needed)
 {
 	if (unlikely(fput_needed))
 		fput(file);
+	else
+		clear_f_light(file);
 }
 
 extern struct file * FASTCALL(fget(unsigned int fd));
Index: 2.6/include/linux/fs.h
===================================================================
--- 2.6.orig/include/linux/fs.h
+++ 2.6/include/linux/fs.h
@@ -742,6 +742,8 @@ struct file {
 	struct list_head	f_ep_links;
 	spinlock_t		f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
+	/* This instance is being used without holding a reference.  */
+	int			f_light;
 	struct address_space	*f_mapping;
 };
 extern spinlock_t files_lock;
@@ -1126,6 +1128,7 @@ struct file_operations {
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*revoke)(struct file *);
 };
 
 struct inode_operations {
@@ -1756,6 +1759,9 @@ extern ssize_t generic_splice_sendpage(s
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 		size_t len, unsigned int flags);
 
+/* fs/revoke.c */
+extern int generic_file_revoke(struct file *);
+
 extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
Index: 2.6/fs/ext2/file.c
===================================================================
--- 2.6.orig/fs/ext2/file.c
+++ 2.6/fs/ext2/file.c
@@ -56,6 +56,7 @@ const struct file_operations ext2_file_o
 	.sendfile	= generic_file_sendfile,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
+	.revoke		= generic_file_revoke,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP