Avoid overwriting the current pgd (V4, x86_64) From: Magnus Damm kexec: Avoid overwriting the current pgd (V4, x86_64) This patch upgrades the x86_64-specific kexec code to avoid overwriting the current pgd. Overwriting the current pgd is bad when CONFIG_CRASH_DUMP is used to start a secondary kernel that dumps the memory of the previous kernel. The code introduces a new set of page tables. These tables are used to provide an executable identity mapping without overwriting the current pgd. Signed-off-by: Magnus Damm Signed-off-by: Andi Kleen --- Applies on top of 2.6.18-rc4. arch/x86_64/kernel/machine_kexec.c | 67 +++++++------ arch/x86_64/kernel/relocate_kernel.S | 171 +++++++++++++++++++++++++++++++---- include/asm-x86_64/kexec.h | 29 +++++ 3 files changed, 216 insertions(+), 51 deletions(-) Index: linux/arch/x86_64/kernel/machine_kexec.c =================================================================== --- linux.orig/arch/x86_64/kernel/machine_kexec.c +++ linux/arch/x86_64/kernel/machine_kexec.c @@ -15,6 +15,15 @@ #include #include +#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) +static u64 kexec_pgd[512] PAGE_ALIGNED; +static u64 kexec_pud0[512] PAGE_ALIGNED; +static u64 kexec_pmd0[512] PAGE_ALIGNED; +static u64 kexec_pte0[512] PAGE_ALIGNED; +static u64 kexec_pud1[512] PAGE_ALIGNED; +static u64 kexec_pmd1[512] PAGE_ALIGNED; +static u64 kexec_pte1[512] PAGE_ALIGNED; + static void init_level2_page(pmd_t *level2p, unsigned long addr) { unsigned long end_addr; @@ -144,32 +153,19 @@ static void load_segments(void) ); } -typedef NORET_TYPE void (*relocate_new_kernel_t)(unsigned long indirection_page, - unsigned long control_code_buffer, - unsigned long start_address, - unsigned long pgtable) ATTRIB_NORET; - -extern const unsigned char relocate_new_kernel[]; -extern const unsigned long relocate_new_kernel_size; - int machine_kexec_prepare(struct kimage *image) { - unsigned long start_pgtable, control_code_buffer; + unsigned long start_pgtable; int result; /* Calculate the offsets */ start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; /* Setup the identity mapped 64bit page table */ result = init_pgtable(image, start_pgtable); if (result) return result; - /* Place the code in the reboot code buffer */ - memcpy(__va(control_code_buffer), relocate_new_kernel, - relocate_new_kernel_size); - return 0; } @@ -184,28 +180,34 @@ void machine_kexec_cleanup(struct kimage */ NORET_TYPE void machine_kexec(struct kimage *image) { - unsigned long page_list; - unsigned long control_code_buffer; - unsigned long start_pgtable; - relocate_new_kernel_t rnk; + unsigned long page_list[PAGES_NR]; + void *control_page; /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); - /* Calculate the offsets */ - page_list = image->head; - start_pgtable = page_to_pfn(image->control_code_page) << PAGE_SHIFT; - control_code_buffer = start_pgtable + PAGE_SIZE; + control_page = page_address(image->control_code_page) + PAGE_SIZE; + memcpy(control_page, relocate_kernel, PAGE_SIZE); - /* Set the low half of the page table to my identity mapped - * page table for kexec. Leave the high half pointing at the - * kernel pages. Don't bother to flush the global pages - * as that will happen when I fully switch to my identity mapped - * page table anyway. - */ - memcpy(__va(read_cr3()), __va(start_pgtable), PAGE_SIZE/2); - __flush_tlb(); + page_list[PA_CONTROL_PAGE] = __pa(control_page); + page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[PA_PGD] = __pa(kexec_pgd); + page_list[VA_PGD] = (unsigned long)kexec_pgd; + page_list[PA_PUD_0] = __pa(kexec_pud0); + page_list[VA_PUD_0] = (unsigned long)kexec_pud0; + page_list[PA_PMD_0] = __pa(kexec_pmd0); + page_list[VA_PMD_0] = (unsigned long)kexec_pmd0; + page_list[PA_PTE_0] = __pa(kexec_pte0); + page_list[VA_PTE_0] = (unsigned long)kexec_pte0; + page_list[PA_PUD_1] = __pa(kexec_pud1); + page_list[VA_PUD_1] = (unsigned long)kexec_pud1; + page_list[PA_PMD_1] = __pa(kexec_pmd1); + page_list[VA_PMD_1] = (unsigned long)kexec_pmd1; + page_list[PA_PTE_1] = __pa(kexec_pte1); + page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_TABLE_PAGE] = + (unsigned long)__pa(page_address(image->control_code_page)); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -222,9 +224,10 @@ NORET_TYPE void machine_kexec(struct kim */ set_gdt(phys_to_virt(0),0); set_idt(phys_to_virt(0),0); + /* now call it */ - rnk = (relocate_new_kernel_t) control_code_buffer; - (*rnk)(page_list, control_code_buffer, image->start, start_pgtable); + relocate_kernel((unsigned long)image->head, (unsigned long)page_list, + image->start); } /* crashkernel=size@addr specifies the location to reserve for Index: linux/arch/x86_64/kernel/relocate_kernel.S =================================================================== --- linux.orig/arch/x86_64/kernel/relocate_kernel.S +++ linux/arch/x86_64/kernel/relocate_kernel.S @@ -7,31 +7,169 @@ */ #include +#include +#include - /* - * Must be relocatable PIC code callable as a C function, that once - * it starts can not use the previous processes stack. - */ - .globl relocate_new_kernel +/* + * Must be relocatable PIC code callable as a C function + */ + +#define PTR(x) (x << 3) +#define PAGE_ALIGNED (1 << PAGE_SHIFT) +#define PAGE_ATTR 0x63 /* _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY */ + + .text + .align PAGE_ALIGNED .code64 + .globl relocate_kernel +relocate_kernel: + /* %rdi indirection_page + * %rsi page_list + * %rdx start address + */ + + /* map the control page at its virtual address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_0)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_0)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + /* identity map the control page at its physical address */ + + movq $0x0000ff8000000000, %r10 /* mask */ + mov $(39 - 3), %cl /* bits to shift */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */ + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PGD)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PUD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PUD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PMD_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PMD_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_PTE_1)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + + shrq $9, %r10 + sub $9, %cl + + movq %r11, %r9 + andq %r10, %r9 + shrq %cl, %r9 + + movq PTR(VA_PTE_1)(%rsi), %r8 + addq %r8, %r9 + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + orq $PAGE_ATTR, %r8 + movq %r8, (%r9) + relocate_new_kernel: - /* %rdi page_list - * %rsi reboot_code_buffer + /* %rdi indirection_page + * %rsi page_list * %rdx start address - * %rcx page_table - * %r8 arg5 - * %r9 arg6 */ /* zero out flags, and disable interrupts */ pushq $0 popfq - /* set a new stack at the bottom of our page... */ - lea 4096(%rsi), %rsp + /* get physical address of control page now */ + /* this is impossible after page table switch */ + movq PTR(PA_CONTROL_PAGE)(%rsi), %r8 + + /* get physical address of page table now too */ + movq PTR(PA_TABLE_PAGE)(%rsi), %rcx + + /* switch to new set of page tables */ + movq PTR(PA_PGD)(%rsi), %r9 + movq %r9, %cr3 + + /* setup a new stack at the end of the physical control page */ + lea 4096(%r8), %rsp + + /* jump to identity mapped page */ + addq $(identity_mapped - relocate_kernel), %r8 + pushq %r8 + ret - /* store the parameters back on the stack */ - pushq %rdx /* store the start address */ +identity_mapped: + /* store the start address on the stack */ + pushq %rdx /* Set cr0 to a known state: * 31 1 == Paging enabled @@ -136,8 +274,3 @@ relocate_new_kernel: xorq %r15, %r15 ret -relocate_new_kernel_end: - - .globl relocate_new_kernel_size -relocate_new_kernel_size: - .quad relocate_new_kernel_end - relocate_new_kernel Index: linux/include/asm-x86_64/kexec.h =================================================================== --- linux.orig/include/asm-x86_64/kexec.h +++ linux/include/asm-x86_64/kexec.h @@ -1,6 +1,27 @@ #ifndef _X86_64_KEXEC_H #define _X86_64_KEXEC_H +#define PA_CONTROL_PAGE 0 +#define VA_CONTROL_PAGE 1 +#define PA_PGD 2 +#define VA_PGD 3 +#define PA_PUD_0 4 +#define VA_PUD_0 5 +#define PA_PMD_0 6 +#define VA_PMD_0 7 +#define PA_PTE_0 8 +#define VA_PTE_0 9 +#define PA_PUD_1 10 +#define VA_PUD_1 11 +#define PA_PMD_1 12 +#define VA_PMD_1 13 +#define PA_PTE_1 14 +#define VA_PTE_1 15 +#define PA_TABLE_PAGE 16 +#define PAGES_NR 17 + +#ifndef __ASSEMBLY__ + #include #include @@ -64,4 +85,12 @@ static inline void crash_setup_regs(stru newregs->rip = (unsigned long)current_text_addr(); } } + +NORET_TYPE void +relocate_kernel(unsigned long indirection_page, + unsigned long page_list, + unsigned long start_address) ATTRIB_NORET; + +#endif /* __ASSEMBLY__ */ + #endif /* _X86_64_KEXEC_H */