From: Andrea Arcangeli This changes the seccomp API from /proc/<pid>/seccomp to a prctl (this will produce a smaller kernel) and it adds a TIF_NOTSC that seccomp sets. Only the current task can call disable_TSC (obviously, because it takes no task_t parameter). This includes Chuck's patch to give zero runtime cost to the notsc feature. After applying this patch, seccomp will keep working fine on all other archs that currently support it too. Signed-off-by: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton --- arch/i386/kernel/process.c | 122 ++++++++++++++++--------------- fs/proc/base.c | 79 -------------------- include/asm-i386/processor.h | 4 + include/asm-i386/thread_info.h | 5 + include/linux/prctl.h | 4 + include/linux/seccomp.h | 19 ++-- kernel/seccomp.c | 29 +++++++ kernel/sys.c | 8 ++ 8 files changed, 124 insertions(+), 146 deletions(-) diff -puN arch/i386/kernel/process.c~x86-tif_notsc-and-seccomp-prctl arch/i386/kernel/process.c --- a/arch/i386/kernel/process.c~x86-tif_notsc-and-seccomp-prctl +++ a/arch/i386/kernel/process.c @@ -523,8 +523,31 @@ int dump_task_regs(struct task_struct *t return 1; } -static noinline void __switch_to_xtra(struct task_struct *next_p, - struct tss_struct *tss) +#ifdef CONFIG_SECCOMP +void hard_disable_TSC(void) +{ + write_cr4(read_cr4() | X86_CR4_TSD); +} +void disable_TSC(void) +{ + preempt_disable(); + if (!test_and_set_thread_flag(TIF_NOTSC)) + /* + * Must flip the CPU state synchronously with + * TIF_NOTSC in the current running context. 
+ */ + hard_disable_TSC(); + preempt_enable(); +} +void hard_enable_TSC(void) +{ + write_cr4(read_cr4() & ~X86_CR4_TSD); +} +#endif /* CONFIG_SECCOMP */ + +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) { struct thread_struct *next; @@ -540,60 +563,47 @@ static noinline void __switch_to_xtra(st set_debugreg(next->debugreg[7], 7); } - if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { - /* - * Disable the bitmap via an invalid offset. We still cache - * the previous bitmap owner and the IO bitmap contents: - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; - return; +#ifdef CONFIG_SECCOMP + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ + test_tsk_thread_flag(next_p, TIF_NOTSC)) { + /* prev and next are different */ + if (test_tsk_thread_flag(next_p, TIF_NOTSC)) + hard_disable_TSC(); + else + hard_enable_TSC(); } +#endif - if (likely(next == tss->io_bitmap_owner)) { + if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP) || + test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + /* + * Disable the bitmap via an invalid offset. We still cache + * the previous bitmap owner and the IO bitmap contents: + */ + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + return; + } + + if (likely(next == tss->io_bitmap_owner)) { + /* + * Previous owner of the bitmap (hence the bitmap content) + * matches the next task, we dont have to do anything but + * to set a valid offset in the TSS: + */ + tss->io_bitmap_base = IO_BITMAP_OFFSET; + return; + } /* - * Previous owner of the bitmap (hence the bitmap content) - * matches the next task, we dont have to do anything but - * to set a valid offset in the TSS: + * Lazy TSS's I/O bitmap copy. We set an invalid offset here + * and we let the task to get a GPF in case an I/O instruction + * is performed. 
The handler of the GPF will verify that the + faulting task has a valid I/O bitmap and, if true, does the + real copy and restart the instruction. This will save us + redundant copies when the currently switched task does not + perform any I/O during its timeslice. */ - tss->io_bitmap_base = IO_BITMAP_OFFSET; - return; - } - /* - * Lazy TSS's I/O bitmap copy. We set an invalid offset here - * and we let the task to get a GPF in case an I/O instruction - * is performed. The handler of the GPF will verify that the - * faulting task has a valid I/O bitmap and, it true, does the - * real copy and restart the instruction. This will save us - * redundant copies when the currently switched task does not - * perform any I/O during its timeslice. - */ - tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; -} - -/* - * This function selects if the context switch from prev to next - * has to tweak the TSC disable bit in the cr4. - */ -static inline void disable_tsc(struct task_struct *prev_p, - struct task_struct *next_p) -{ - struct thread_info *prev, *next; - - /* - * gcc should eliminate the ->thread_info dereference if - * has_secure_computing returns 0 at compile time (SECCOMP=n). 
- */ - prev = task_thread_info(prev_p); - next = task_thread_info(next_p); - - if (has_secure_computing(prev) || has_secure_computing(next)) { - /* slow path here */ - if (has_secure_computing(prev) && - !has_secure_computing(next)) { - write_cr4(read_cr4() & ~X86_CR4_TSD); - } else if (!has_secure_computing(prev) && - has_secure_computing(next)) - write_cr4(read_cr4() | X86_CR4_TSD); + tss->io_bitmap_base = INVALID_IO_BITMAP_OFFSET_LAZY; } } @@ -681,11 +691,9 @@ struct task_struct fastcall * __switch_t /* * Now maybe handle debug registers and/or IO bitmaps */ - if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW) - || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))) - __switch_to_xtra(next_p, tss); - - disable_tsc(prev_p, next_p); + if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV || + task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT)) + __switch_to_xtra(prev_p, next_p, tss); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff -puN fs/proc/base.c~x86-tif_notsc-and-seccomp-prctl fs/proc/base.c --- a/fs/proc/base.c~x86-tif_notsc-and-seccomp-prctl +++ a/fs/proc/base.c @@ -67,7 +67,6 @@ #include #include #include -#include #include #include #include @@ -778,78 +777,6 @@ static struct file_operations proc_login }; #endif -#ifdef CONFIG_SECCOMP -static ssize_t seccomp_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); - char __buf[20]; - loff_t __ppos = *ppos; - size_t len; - - if (!tsk) - return -ESRCH; - /* no need to print the trailing zero, so use only len */ - len = sprintf(__buf, "%u\n", tsk->seccomp.mode); - put_task_struct(tsk); - if (__ppos >= len) - return 0; - if (count > len - __ppos) - count = len - __ppos; - if (copy_to_user(buf, __buf + __ppos, count)) - return -EFAULT; - *ppos = __ppos + count; - return count; -} - -static ssize_t 
seccomp_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); - char __buf[20], *end; - unsigned int seccomp_mode; - ssize_t result; - - result = -ESRCH; - if (!tsk) - goto out_no_task; - - /* can set it only once to be even more secure */ - result = -EPERM; - if (unlikely(tsk->seccomp.mode)) - goto out; - - result = -EFAULT; - memset(__buf, 0, sizeof(__buf)); - count = min(count, sizeof(__buf) - 1); - if (copy_from_user(__buf, buf, count)) - goto out; - - seccomp_mode = simple_strtoul(__buf, &end, 0); - if (*end == '\n') - end++; - result = -EINVAL; - if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { - tsk->seccomp.mode = seccomp_mode; - set_tsk_thread_flag(tsk, TIF_SECCOMP); - } else - goto out; - result = -EIO; - if (unlikely(!(end - __buf))) - goto out; - result = end - __buf; -out: - put_task_struct(tsk); -out_no_task: - return result; -} - -static struct file_operations proc_seccomp_operations = { - .read = seccomp_read, - .write = seccomp_write, -}; -#endif /* CONFIG_SECCOMP */ - static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -1762,9 +1689,6 @@ static struct pid_entry tgid_base_stuff[ REG("numa_maps", S_IRUGO, numa_maps), #endif REG("mem", S_IRUSR|S_IWUSR, mem), -#ifdef CONFIG_SECCOMP - REG("seccomp", S_IRUSR|S_IWUSR, seccomp), -#endif LNK("cwd", cwd), LNK("root", root), LNK("exe", exe), @@ -2037,9 +1961,6 @@ static struct pid_entry tid_base_stuff[] REG("numa_maps", S_IRUGO, numa_maps), #endif REG("mem", S_IRUSR|S_IWUSR, mem), -#ifdef CONFIG_SECCOMP - REG("seccomp", S_IRUSR|S_IWUSR, seccomp), -#endif LNK("cwd", cwd), LNK("root", root), LNK("exe", exe), diff -puN include/asm-i386/processor.h~x86-tif_notsc-and-seccomp-prctl include/asm-i386/processor.h --- a/include/asm-i386/processor.h~x86-tif_notsc-and-seccomp-prctl +++ a/include/asm-i386/processor.h @@ -252,6 +252,10 @@ static 
inline void clear_in_cr4 (unsigne write_cr4(cr4); } +extern void hard_disable_TSC(void); +extern void disable_TSC(void); +extern void hard_enable_TSC(void); + /* * NSC/Cyrix CPU configuration register indexes */ diff -puN include/asm-i386/thread_info.h~x86-tif_notsc-and-seccomp-prctl include/asm-i386/thread_info.h --- a/include/asm-i386/thread_info.h~x86-tif_notsc-and-seccomp-prctl +++ a/include/asm-i386/thread_info.h @@ -142,6 +142,7 @@ static inline struct thread_info *curren #define TIF_MEMDIE 16 #define TIF_DEBUG 17 /* uses debug registers */ #define TIF_IO_BITMAP 18 /* uses I/O bitmap */ +#define TIF_NOTSC 19 /* TSC is not accessible in userland */ #define _TIF_SYSCALL_TRACE (1< #include @@ -18,20 +16,23 @@ static inline void secure_computing(int __secure_computing(this_syscall); } -static inline int has_secure_computing(struct thread_info *ti) -{ - return unlikely(test_ti_thread_flag(ti, TIF_SECCOMP)); -} +extern long prctl_get_seccomp(void); +extern long prctl_set_seccomp(unsigned long); #else /* CONFIG_SECCOMP */ typedef struct { } seccomp_t; #define secure_computing(x) do { } while (0) -/* static inline to preserve typechecking */ -static inline int has_secure_computing(struct thread_info *ti) + +static inline long prctl_get_seccomp(void) +{ + return -EINVAL; +} + +static inline long prctl_set_seccomp(unsigned long arg2) { - return 0; + return -EINVAL; } #endif /* CONFIG_SECCOMP */ diff -puN kernel/seccomp.c~x86-tif_notsc-and-seccomp-prctl kernel/seccomp.c --- a/kernel/seccomp.c~x86-tif_notsc-and-seccomp-prctl +++ a/kernel/seccomp.c @@ -10,6 +10,7 @@ #include /* #define SECCOMP_DEBUG 1 */ +#define NR_SECCOMP_MODES 1 /* * Secure computing mode 1 allows only read/write/exit/sigreturn. 
@@ -54,3 +55,31 @@ void __secure_computing(int this_syscall #endif do_exit(SIGKILL); } + +long prctl_get_seccomp(void) +{ + return current->seccomp.mode; +} + +long prctl_set_seccomp(unsigned long seccomp_mode) +{ + long ret; + + /* can set it only once to be even more secure */ + ret = -EPERM; + if (unlikely(current->seccomp.mode)) + goto out; + + ret = -EINVAL; + if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { + current->seccomp.mode = seccomp_mode; + set_thread_flag(TIF_SECCOMP); +#ifdef TIF_NOTSC + disable_TSC(); +#endif + ret = 0; + } + + out: + return ret; +} diff -puN kernel/sys.c~x86-tif_notsc-and-seccomp-prctl kernel/sys.c --- a/kernel/sys.c~x86-tif_notsc-and-seccomp-prctl +++ a/kernel/sys.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -2159,6 +2160,13 @@ asmlinkage long sys_prctl(int option, un error = SET_ENDIAN(current, arg2); break; + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + default: error = -EINVAL; break; _