Move int 3 handler to debug stack and allow to increase it. From: Jan Beulich This - switches the INT3 handler to run on an IST stack (to cope with breakpoints set by a kernel debugger on places where the kernel's %gs base hasn't been set up, yet); the IST stack used is shared with the INT1 handler's - allows nesting of INT1/INT3 handlers so that one can, with a kernel debugger, debug (at least) the user-mode portions of the INT1/INT3 handling; the nesting isn't actively enabled here since a kernel- debugger-free kernel doesn't need it Signed-Off-By: Jan Beulich Signed-off-by: Andi Kleen Index: linux/arch/x86_64/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/x86_64/kernel/asm-offsets.c +++ linux/arch/x86_64/kernel/asm-offsets.c @@ -64,5 +64,9 @@ int main(void) DEFINE(pbe_address, offsetof(struct pbe, address)); DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); DEFINE(pbe_next, offsetof(struct pbe, next)); + BLANK(); +#if DEBUG_STKSZ > EXCEPTION_STKSZ + DEFINE(DEBUG_IST, DEBUG_STACK); +#endif return 0; } Index: linux/arch/x86_64/kernel/entry.S =================================================================== --- linux.orig/arch/x86_64/kernel/entry.S +++ linux/arch/x86_64/kernel/entry.S @@ -673,7 +673,10 @@ ENTRY(spurious_interrupt) /* error code is on the stack already */ /* handle NMI like exceptions that can happen everywhere */ - .macro paranoidentry sym +#ifndef DEBUG_IST +# define DEBUG_IST 0 +#endif + .macro paranoidentry sym, ist=0 SAVE_ALL cld movl $1,%ebx @@ -683,10 +686,20 @@ ENTRY(spurious_interrupt) js 1f swapgs xorl %ebx,%ebx -1: movq %rsp,%rdi +1: + .if \ist + movq %gs:pda_data_offset, %rbp + .endif + movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) + .if \ist + subq $EXCEPTION_STACK_SIZE, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif call \sym + .if \ist + addq $EXCEPTION_STACK_SIZE, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp) + .endif cli .endm @@ -904,7 +917,7 @@ KPROBE_ENTRY(debug) INTR_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 - paranoidentry do_debug + paranoidentry do_debug, DEBUG_IST jmp paranoid_exit CFI_ENDPROC .previous .text @@ -959,7 +972,12 @@ paranoid_schedule: CFI_ENDPROC KPROBE_ENTRY(int3) - zeroentry do_int3 + INTR_FRAME + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + paranoidentry do_int3, DEBUG_IST + jmp paranoid_exit + CFI_ENDPROC .previous .text ENTRY(overflow) Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -145,7 +145,7 @@ void pda_init(int cpu) pda->irqstackptr += IRQSTACKSIZE-64; } -char boot_exception_stacks[N_EXCEPTION_STACKS * EXCEPTION_STKSZ] +char boot_exception_stacks[(N_EXCEPTION_STACKS - 2) * EXCEPTION_STKSZ + DEBUG_STKSZ] __attribute__((section(".bss.page_aligned"))); /* May not be marked __init: used by software suspend */ @@ -236,13 +236,27 @@ void __cpuinit cpu_init (void) */ for (v = 0; v < N_EXCEPTION_STACKS; v++) { if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, - EXCEPTION_STACK_ORDER); + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); if (!estacks) panic("Cannot allocate exception stack %ld %d\n", v, cpu); } - estacks += EXCEPTION_STKSZ; + switch (v + 1) { +#if DEBUG_STKSZ > EXCEPTION_STKSZ + case DEBUG_STACK: + cpu_pda[cpu].debugstack = (unsigned long)estacks; + estacks += DEBUG_STKSZ; + break; +#endif + default: + estacks += EXCEPTION_STKSZ; + break; + } t->ist[v] = (unsigned long)estacks; } Index: linux/arch/x86_64/kernel/traps.c =================================================================== --- linux.orig/arch/x86_64/kernel/traps.c +++ linux/arch/x86_64/kernel/traps.c @@ -121,19 +121,31 @@ int printk_address(unsigned long address static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, unsigned *usedp, const char **idp) { - static const char ids[N_EXCEPTION_STACKS][8] = { + static char ids[][8] = { [DEBUG_STACK - 1] = "#DB", [NMI_STACK - 1] = "NMI", [DOUBLEFAULT_STACK - 1] = "#DF", [STACKFAULT_STACK - 1] = "#SS", [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif }; unsigned k; for (k = 0; k < N_EXCEPTION_STACKS; k++) { unsigned long end; - end = per_cpu(init_tss, cpu).ist[k]; + switch (k + 1) { +#if DEBUG_STKSZ > EXCEPTION_STKSZ + case DEBUG_STACK: + end = cpu_pda[cpu].debugstack + DEBUG_STKSZ; + break; +#endif + default: + end = per_cpu(init_tss, cpu).ist[k]; + break; + } if (stack >= end) continue; if (stack >= end - EXCEPTION_STKSZ) { @@ -143,6 +155,22 @@ static unsigned long *in_exception_stack *idp = ids[k]; return (unsigned long *)end; } +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif } return NULL; } @@ -613,6 +641,7 @@ asmlinkage void default_do_nmi(struct pt io_check_error(reason, regs); } +/* runs on IST stack. */ asmlinkage void __kprobes do_int3(struct pt_regs * regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) { @@ -894,7 +923,7 @@ void __init trap_init(void) set_intr_gate(0,÷_error); set_intr_gate_ist(1,&debug,DEBUG_STACK); set_intr_gate_ist(2,&nmi,NMI_STACK); - set_system_gate(3,&int3); + set_system_gate_ist(3,&int3,DEBUG_STACK); /* int3 can be called from all */ set_system_gate(4,&overflow); /* int4 can be called from all */ set_intr_gate(5,&bounds); set_intr_gate(6,&invalid_op); Index: linux/include/asm-x86_64/desc.h =================================================================== --- linux.orig/include/asm-x86_64/desc.h +++ linux/include/asm-x86_64/desc.h @@ -114,6 +114,11 @@ static inline void set_system_gate(int n _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0); } +static inline void set_system_gate_ist(int nr, void *func, unsigned ist) +{ + _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist); +} + static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type, unsigned size) { Index: linux/include/asm-x86_64/page.h =================================================================== --- linux.orig/include/asm-x86_64/page.h +++ linux/include/asm-x86_64/page.h @@ -14,13 +14,18 @@ #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) #define THREAD_ORDER 1 -#ifdef __ASSEMBLY__ -#define THREAD_SIZE (1 << (PAGE_SHIFT + THREAD_ORDER)) -#else -#define THREAD_SIZE (1UL << (PAGE_SHIFT + THREAD_ORDER)) -#endif +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) #define CURRENT_MASK (~(THREAD_SIZE-1)) +#define EXCEPTION_STACK_ORDER 0 +#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) + +#define DEBUG_STACK_ORDER EXCEPTION_STACK_ORDER +#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) + +#define IRQSTACK_ORDER 2 +#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) + #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) Index: linux/include/asm-x86_64/pda.h =================================================================== --- linux.orig/include/asm-x86_64/pda.h +++ linux/include/asm-x86_64/pda.h @@ -5,6 +5,7 @@ #include #include #include +#include /* Per processor datastructure. %gs points to it while the kernel runs */ struct x8664_pda { @@ -12,6 +13,9 @@ struct x8664_pda { unsigned long data_offset; /* Per cpu data offset from linker address */ unsigned long kernelstack; /* top of kernel stack for current */ unsigned long oldrsp; /* user rsp for system call */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + unsigned long debugstack; /* #DB/#BP stack. */ +#endif int irqcount; /* Irq nesting counter. Starts with -1 */ int cpunumber; /* Logical CPU number */ char *irqstackptr; /* top of irqstack */ @@ -23,10 +27,6 @@ struct x8664_pda { unsigned apic_timer_irqs; } ____cacheline_aligned_in_smp; - -#define IRQSTACK_ORDER 2 -#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER) - extern struct x8664_pda cpu_pda[]; /* Index: linux/include/asm-x86_64/processor.h =================================================================== --- linux.orig/include/asm-x86_64/processor.h +++ linux/include/asm-x86_64/processor.h @@ -273,8 +273,6 @@ struct thread_struct { #define DEBUG_STACK 4 #define MCE_STACK 5 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ -#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define EXCEPTION_STACK_ORDER 0 #define start_thread(regs,new_rip,new_rsp) do { \ asm volatile("movl %0,%%fs; movl %0,%%es; movl %0,%%ds": :"r" (0)); \