GIT ec6c0653f50d7752af57f7fd9f69595f53368d26 git+ssh://master.kernel.org/pub/scm/linux/kernel/git/aegl/linux-2.6.git#test commit b373e385743597f576b67c423807bbdfe3b862e7 Author: Khalid Aziz Date: Tue May 9 15:53:22 2006 -0700 [IA64] kexec for ia64 Enable kexec for ia64. Signed-off-by: Khalid Aziz Signed-off-by: Nanhai Zou Signed-off-by: Tony Luck commit 32e62c636a728cb39c0b3bd191286f2ca65d4028 Author: Bjorn Helgaas Date: Fri May 5 17:19:50 2006 -0600 [IA64] rework memory attribute aliasing This closes a couple holes in our attribute aliasing avoidance scheme: - The current kernel fails mmaps of some /dev/mem MMIO regions because they don't appear in the EFI memory map. This keeps X from working on the Intel Tiger box. - The current kernel allows UC mmap of the 0-1MB region of /sys/.../legacy_mem even when the chipset doesn't support UC access. This causes an MCA when starting X on HP rx7620 and rx8620 boxes in the default configuration. There's more detail in the Documentation/ia64/aliasing.txt file this adds, but the general idea is that if a region might be covered by a granule-sized kernel identity mapping, any access via /dev/mem or mmap must use the same attribute as the identity mapping. Otherwise, we fall back to using an attribute that is supported according to the EFI memory map, or to using UC if the EFI memory map doesn't mention the region. Signed-off-by: Bjorn Helgaas Signed-off-by: Tony Luck --- diff --git a/Documentation/ia64/aliasing.txt b/Documentation/ia64/aliasing.txt new file mode 100644 index 0000000..38f9a52 --- /dev/null +++ b/Documentation/ia64/aliasing.txt @@ -0,0 +1,208 @@ + MEMORY ATTRIBUTE ALIASING ON IA-64 + + Bjorn Helgaas + + May 4, 2006 + + +MEMORY ATTRIBUTES + + Itanium supports several attributes for virtual memory references. + The attribute is part of the virtual translation, i.e., it is + contained in the TLB entry. The ones of most interest to the Linux + kernel are: + + WB Write-back (cacheable) + UC Uncacheable + WC Write-coalescing + + System memory typically uses the WB attribute. The UC attribute is + used for memory-mapped I/O devices. The WC attribute is uncacheable + like UC is, but writes may be delayed and combined to increase + performance for things like frame buffers. + + The Itanium architecture requires that we avoid accessing the same + page with both a cacheable mapping and an uncacheable mapping[1]. + + The design of the chipset determines which attributes are supported + on which regions of the address space. For example, some chipsets + support either WB or UC access to main memory, while others support + only WB access. + +MEMORY MAP + + Platform firmware describes the physical memory map and the + supported attributes for each region. At boot-time, the kernel uses + the EFI GetMemoryMap() interface. ACPI can also describe memory + devices and the attributes they support, but Linux/ia64 currently + doesn't use this information. + + The kernel uses the efi_memmap table returned from GetMemoryMap() to + learn the attributes supported by each region of physical address + space. Unfortunately, this table does not completely describe the + address space because some machines omit some or all of the MMIO + regions from the map. + + The kernel maintains another table, kern_memmap, which describes the + memory Linux is actually using and the attribute for each region. + This contains only system memory; it does not contain MMIO space. + + The kern_memmap table typically contains only a subset of the system + memory described by the efi_memmap. Linux/ia64 can't use all memory + in the system because of constraints imposed by the identity mapping + scheme. + + The efi_memmap table is preserved unmodified because the original + boot-time information is required for kexec. + +KERNEL IDENTITY MAPPINGS + + Linux/ia64 identity mappings are done with large pages, currently + either 16MB or 64MB, referred to as "granules." Cacheable mappings + are speculative[2], so the processor can read any location in the + page at any time, independent of the programmer's intentions. This + means that to avoid attribute aliasing, Linux can create a cacheable + identity mapping only when the entire granule supports cacheable + access. + + Therefore, kern_memmap contains only full granule-sized regions that + can referenced safely by an identity mapping. + + Uncacheable mappings are not speculative, so the processor will + generate UC accesses only to locations explicitly referenced by + software. This allows UC identity mappings to cover granules that + are only partially populated, or populated with a combination of UC + and WB regions. + +USER MAPPINGS + + User mappings are typically done with 16K or 64K pages. The smaller + page size allows more flexibility because only 16K or 64K has to be + homogeneous with respect to memory attributes. + +POTENTIAL ATTRIBUTE ALIASING CASES + + There are several ways the kernel creates new mappings: + + mmap of /dev/mem + + This uses remap_pfn_range(), which creates user mappings. These + mappings may be either WB or UC. If the region being mapped + happens to be in kern_memmap, meaning that it may also be mapped + by a kernel identity mapping, the user mapping must use the same + attribute as the kernel mapping. + + If the region is not in kern_memmap, the user mapping should use + an attribute reported as being supported in the EFI memory map. + + Since the EFI memory map does not describe MMIO on some + machines, this should use an uncacheable mapping as a fallback. + + mmap of /sys/class/pci_bus/.../legacy_mem + + This is very similar to mmap of /dev/mem, except that legacy_mem + only allows mmap of the one megabyte "legacy MMIO" area for a + specific PCI bus. Typically this is the first megabyte of + physical address space, but it may be different on machines with + several VGA devices. + + "X" uses this to access VGA frame buffers. Using legacy_mem + rather than /dev/mem allows multiple instances of X to talk to + different VGA cards. + + The /dev/mem mmap constraints apply. + + However, since this is for mapping legacy MMIO space, WB access + does not make sense. This matters on machines without legacy + VGA support: these machines may have WB memory for the entire + first megabyte (or even the entire first granule). + + On these machines, we could mmap legacy_mem as WB, which would + be safe in terms of attribute aliasing, but X has no way of + knowing that it is accessing regular memory, not a frame buffer, + so the kernel should fail the mmap rather than doing it with WB. + + read/write of /dev/mem + + This uses copy_from_user(), which implicitly uses a kernel + identity mapping. This is obviously safe for things in + kern_memmap. + + There may be corner cases of things that are not in kern_memmap, + but could be accessed this way. For example, registers in MMIO + space are not in kern_memmap, but could be accessed with a UC + mapping. This would not cause attribute aliasing. But + registers typically can be accessed only with four-byte or + eight-byte accesses, and the copy_from_user() path doesn't allow + any control over the access size, so this would be dangerous. + + ioremap() + + This returns a kernel identity mapping for use inside the + kernel. + + If the region is in kern_memmap, we should use the attribute + specified there. Otherwise, if the EFI memory map reports that + the entire granule supports WB, we should use that (granules + that are partially reserved or occupied by firmware do not appear + in kern_memmap). Otherwise, we should use a UC mapping. + +PAST PROBLEM CASES + + mmap of various MMIO regions from /dev/mem by "X" on Intel platforms + + The EFI memory map may not report these MMIO regions. + + These must be allowed so that X will work. This means that + when the EFI memory map is incomplete, every /dev/mem mmap must + succeed. It may create either WB or UC user mappings, depending + on whether the region is in kern_memmap or the EFI memory map. + + mmap of 0x0-0xA0000 /dev/mem by "hwinfo" on HP sx1000 with VGA enabled + + See https://bugzilla.novell.com/show_bug.cgi?id=140858. + + The EFI memory map reports the following attributes: + 0x00000-0x9FFFF WB only + 0xA0000-0xBFFFF UC only (VGA frame buffer) + 0xC0000-0xFFFFF WB only + + This mmap is done with user pages, not kernel identity mappings, + so it is safe to use WB mappings. + + The kernel VGA driver may ioremap the VGA frame buffer at 0xA0000, + which will use a granule-sized UC mapping covering 0-0xFFFFF. This + granule covers some WB-only memory, but since UC is non-speculative, + the processor will never generate an uncacheable reference to the + WB-only areas unless the driver explicitly touches them. + + mmap of 0x0-0xFFFFF legacy_mem by "X" + + If the EFI memory map reports this entire range as WB, there + is no VGA MMIO hole, and the mmap should fail or be done with + a WB mapping. + + There's no easy way for X to determine whether the 0xA0000-0xBFFFF + region is a frame buffer or just memory, so I think it's best to + just fail this mmap request rather than using a WB mapping. As + far as I know, there's no need to map legacy_mem with WB + mappings. + + Otherwise, a UC mapping of the entire region is probably safe. + The VGA hole means the region will not be in kern_memmap. The + HP sx1000 chipset doesn't support UC access to the memory surrounding + the VGA hole, but X doesn't need that area anyway and should not + reference it. + + mmap of 0xA0000-0xBFFFF legacy_mem by "X" on HP sx1000 with VGA disabled + + The EFI memory map reports the following attributes: + 0x00000-0xFFFFF WB only (no VGA MMIO hole) + + This is a special case of the previous case, and the mmap should + fail for the same reason as above. + +NOTES + + [1] SDM rev 2.2, vol 2, sec 4.4.1. + [2] SDM rev 2.2, vol 2, sec 4.4.6. diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 0f3076a..4bab62d 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -413,6 +413,23 @@ config IA64_PALINFO config SGI_SN def_bool y if (IA64_SGI_SN2 || IA64_GENERIC) +config KEXEC + bool "kexec system call (EXPERIMENTAL)" + depends on EXPERIMENTAL + help + kexec is a system call that implements the ability to shutdown your + current kernel, and to start another kernel. It is like a reboot + but it is indepedent of the system firmware. And like a reboot + you can start any kernel with it, not just Linux. + + The name comes from the similiarity to the exec system call. + + It is an ongoing process to be certain the hardware in a machine + is properly shutdown, so do not be surprised if this code does not + initially work for you. It may help to enable device hotplugging + support. As of this writing the exact hardware interface is + strongly in flux, so no good recommendation can be made. + source "drivers/sn/Kconfig" source "drivers/firmware/Kconfig" diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index bdccd0b..6f90d62 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1624,6 +1624,28 @@ #endif READ_REG(ioc->ioc_hpa + IOC_IBASE); } +#ifdef CONFIG_KEXEC +void +ioc_iova_disable(void) +{ + struct ioc *ioc; + + ioc = ioc_list; + + while (ioc != NULL) { + /* Disable IOVA translation */ + WRITE_REG(ioc->ibase & 0xfffffffffffffffe, ioc->ioc_hpa + IOC_IBASE); + READ_REG(ioc->ioc_hpa + IOC_IBASE); + + /* Clear I/O TLB of any possible entries */ + WRITE_REG(ioc->ibase | (get_iovp_order(ioc->iov_size) + iovp_shift), ioc->ioc_hpa + IOC_PCOM); + READ_REG(ioc->ioc_hpa + IOC_PCOM); + + ioc = ioc->next; + } +} +#endif + static void __init ioc_resource_init(struct ioc *ioc) { diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 09a0dbc..b433b58 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o obj-$(CONFIG_KPROBES) += kprobes.o jprobes.o +obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o mca_recovery-y += mca_drv.o mca_drv_asm.o diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c new file mode 100644 index 0000000..a0e49b7 --- /dev/null +++ b/arch/ia64/kernel/crash.c @@ -0,0 +1,43 @@ +/* + * arch/ia64/kernel/crash.c + * + * Architecture specific (ia64) functions for kexec based crash dumps. + * + * Created by: Khalid Aziz + * + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void +machine_crash_shutdown(struct pt_regs *pt) +{ + /* This function is only called after the system + * has paniced or is otherwise in a critical state. + * The minimum amount of code to allow a kexec'd kernel + * to run successfully needs to happen here. + * + * In practice this means shooting down the other cpus in + * an SMP system. + */ + if (in_interrupt()) + ia64_eoi(); +#ifdef CONFIG_SMP + smp_send_stop(); +#endif +#ifdef CONFIG_IA64_HP_ZX1 + ioc_iova_disable(); +#endif +} diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 12cfedc..c33d0ba 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -8,6 +8,8 @@ * Copyright (C) 1999-2003 Hewlett-Packard Co. * David Mosberger-Tang * Stephane Eranian + * (c) Copyright 2006 Hewlett-Packard Development Company, L.P. + * Bjorn Helgaas * * All EFI Runtime Services are not implemented yet as EFI only * supports physical mode addressing on SoftSDV. This is to be fixed @@ -622,28 +624,20 @@ efi_get_iobase (void) return 0; } -static efi_memory_desc_t * -efi_memory_descriptor (unsigned long phys_addr) +static struct kern_memdesc * +kern_memory_descriptor (unsigned long phys_addr) { - void *efi_map_start, *efi_map_end, *p; - efi_memory_desc_t *md; - u64 efi_desc_size; - - efi_map_start = __va(ia64_boot_param->efi_memmap); - efi_map_end = efi_map_start + ia64_boot_param->efi_memmap_size; - efi_desc_size = ia64_boot_param->efi_memdesc_size; + struct kern_memdesc *md; - for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { - md = p; - - if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) + for (md = kern_memmap; md->start != ~0UL; md++) { + if (phys_addr - md->start < (md->num_pages << EFI_PAGE_SHIFT)) return md; } return 0; } -static int -efi_memmap_has_mmio (void) +static efi_memory_desc_t * +efi_memory_descriptor (unsigned long phys_addr) { void *efi_map_start, *efi_map_end, *p; efi_memory_desc_t *md; @@ -656,8 +650,8 @@ efi_memmap_has_mmio (void) for (p = efi_map_start; p < efi_map_end; p += efi_desc_size) { md = p; - if (md->type == EFI_MEMORY_MAPPED_IO) - return 1; + if (phys_addr - md->phys_addr < (md->num_pages << EFI_PAGE_SHIFT)) + return md; } return 0; } @@ -683,71 +677,125 @@ efi_mem_attributes (unsigned long phys_a } EXPORT_SYMBOL(efi_mem_attributes); -/* - * Determines whether the memory at phys_addr supports the desired - * attribute (WB, UC, etc). If this returns 1, the caller can safely - * access size bytes at phys_addr with the specified attribute. - */ -int -efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, u64 attr) +u64 +efi_mem_attribute (unsigned long phys_addr, unsigned long size) { unsigned long end = phys_addr + size; efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); + u64 attr; + + if (!md) + return 0; + + /* + * EFI_MEMORY_RUNTIME is not a memory attribute; it just tells + * the kernel that firmware needs this region mapped. + */ + attr = md->attribute & ~EFI_MEMORY_RUNTIME; + do { + unsigned long md_end = efi_md_end(md); + + if (end <= md_end) + return attr; + + md = efi_memory_descriptor(md_end); + if (!md || (md->attribute & ~EFI_MEMORY_RUNTIME) != attr) + return 0; + } while (md); + return 0; +} + +u64 +kern_mem_attribute (unsigned long phys_addr, unsigned long size) +{ + unsigned long end = phys_addr + size; + struct kern_memdesc *md; + u64 attr; /* - * Some firmware doesn't report MMIO regions in the EFI memory - * map. The Intel BigSur (a.k.a. HP i2000) has this problem. - * On those platforms, we have to assume UC is valid everywhere. + * This is a hack for ioremap calls before we set up kern_memmap. + * Maybe we should do efi_memmap_init() earlier instead. */ - if (!md || (md->attribute & attr) != attr) { - if (attr == EFI_MEMORY_UC && !efi_memmap_has_mmio()) - return 1; + if (!kern_memmap) { + attr = efi_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return EFI_MEMORY_WB; return 0; } + md = kern_memory_descriptor(phys_addr); + if (!md) + return 0; + + attr = md->attribute; do { - unsigned long md_end = efi_md_end(md); + unsigned long md_end = kmd_end(md); if (end <= md_end) - return 1; + return attr; - md = efi_memory_descriptor(md_end); - if (!md || (md->attribute & attr) != attr) + md = kern_memory_descriptor(md_end); + if (!md || md->attribute != attr) return 0; } while (md); return 0; } +EXPORT_SYMBOL(kern_mem_attribute); -/* - * For /dev/mem, we only allow read & write system calls to access - * write-back memory, because read & write don't allow the user to - * control access size. - */ int valid_phys_addr_range (unsigned long phys_addr, unsigned long size) { - return efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB); + u64 attr; + + /* + * /dev/mem reads and writes use copy_to_user(), which implicitly + * uses a granule-sized kernel identity mapping. It's really + * only safe to do this for regions in kern_memmap. For more + * details, see Documentation/ia64/aliasing.txt. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB || attr & EFI_MEMORY_UC) + return 1; + return 0; } -/* - * We allow mmap of anything in the EFI memory map that supports - * either write-back or uncacheable access. For uncacheable regions, - * the supported access sizes are system-dependent, and the user is - * responsible for using the correct size. - * - * Note that this doesn't currently allow access to hot-added memory, - * because that doesn't appear in the boot-time EFI memory map. - */ int valid_mmap_phys_addr_range (unsigned long phys_addr, unsigned long size) { - if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_WB)) - return 1; + /* + * MMIO regions are often missing from the EFI memory map. + * We must allow mmap of them for programs like X, so we + * currently can't do any useful validation. + */ + return 1; +} - if (efi_mem_attribute_range(phys_addr, size, EFI_MEMORY_UC)) - return 1; +pgprot_t +phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, + pgprot_t vma_prot) +{ + unsigned long phys_addr = pfn << PAGE_SHIFT; + u64 attr; - return 0; + /* + * For /dev/mem mmap, we use user mappings, but if the region is + * in kern_memmap (and hence may be covered by a kernel mapping), + * we must use the same attribute as the kernel mapping. + */ + attr = kern_mem_attribute(phys_addr, size); + if (attr & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + else if (attr & EFI_MEMORY_UC) + return pgprot_noncached(vma_prot); + + /* + * Some chipsets don't support UC access to memory. If + * WB is supported, we prefer that. + */ + if (efi_mem_attribute(phys_addr, size) & EFI_MEMORY_WB) + return pgprot_cacheable(vma_prot); + + return pgprot_noncached(vma_prot); } int __init diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index bcb80ca..54031c4 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1576,7 +1576,7 @@ sys_call_table: data8 sys_mq_timedreceive // 1265 data8 sys_mq_notify data8 sys_mq_getsetattr - data8 sys_ni_syscall // reserved for kexec_load + data8 sys_kexec_load data8 sys_ni_syscall // reserved for vserver data8 sys_waitid // 1270 data8 sys_add_key diff --git a/arch/ia64/kernel/machine_kexec.c b/arch/ia64/kernel/machine_kexec.c new file mode 100644 index 0000000..3b46143 --- /dev/null +++ b/arch/ia64/kernel/machine_kexec.c @@ -0,0 +1,140 @@ +/* + * arch/ia64/kernel/machine_kexec.c + * + * Handle transition of Linux booting another kernel + * Copyright (C) 2005 Hewlett-Packard Development Comapny, L.P. + * Copyright (C) 2005 Khalid Aziz + * Copyright (C) 2006 Intel Corp, Zou Nan hai + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern unsigned long ia64_iobase; + +typedef void (*relocate_new_kernel_t)( unsigned long, unsigned long, + struct ia64_boot_param *, unsigned long); + +/* + * Do what every setup is needed on image and the + * reboot code buffer to allow us to avoid allocations + * later. + */ +int machine_kexec_prepare(struct kimage *image) +{ + void *control_code_buffer; + const unsigned long *func; + + func = (unsigned long *)&relocate_new_kernel; + /* Pre-load control code buffer to minimize work in kexec path */ + control_code_buffer = page_address(image->control_code_page); + memcpy((void *)control_code_buffer, (const void *)func[0], + relocate_new_kernel_size); + flush_icache_range((unsigned long)control_code_buffer, + (unsigned long)control_code_buffer + relocate_new_kernel_size); + + return 0; +} + +void machine_kexec_cleanup(struct kimage *image) +{ +} + +void machine_shutdown(void) +{ +#ifdef CONFIG_PCI + struct pci_dev *dev = NULL; + irq_desc_t *idesc; + cpumask_t mask = CPU_MASK_NONE; + + /* Disable all PCI devices */ + while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { + if (!(dev->is_enabled)) + continue; + idesc = irq_descp(dev->irq); + if (!idesc) + continue; + cpu_set(0, mask); + disable_irq_nosync(dev->irq); + idesc->handler->end(dev->irq); + idesc->handler->set_affinity(dev->irq, mask); + idesc->action = NULL; + pci_disable_device(dev); + } +#endif + +#ifdef CONFIG_HOTPLUG_CPU + { + int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + cpu_down(cpu); + } + } +#elif defined(CONFIG_SMP) + smp_call_function(kexec_stop_this_cpu, (void *)image->start, 0, 0); +#endif + + ia64_set_itv(1<<16); + +#ifdef CONFIG_IA64_HP_ZX1 + ioc_iova_disable(); +#endif +} + +/* + * Do not allocate memory (or fail in any way) in machine_kexec(). + * We are past the point of no return, committed to rebooting now. + */ +void machine_kexec(struct kimage *image) +{ + unsigned long indirection_page; + relocate_new_kernel_t rnk; + unsigned long pta, impl_va_bits; + void *pal_addr = efi_get_pal_addr(); + unsigned long code_addr = (unsigned long)page_address(image->control_code_page); + + /* Interrupts aren't acceptable while we reboot */ + local_irq_disable(); + + /* Disable VHPT */ + impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61))); + pta = POW2(61) - POW2(vmlpt_bits); + ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | 0); + + /* now execute the control code. + * We will start by executing the control code linked into the + * kernel as opposed to the code we copied in control code buffer * page. When this code switches to physical mode, we will start + * executing the code in control code buffer page. Reason for + * doing this is we start code execution in virtual address space. + * If we were to try to execute the newly copied code in virtual + * address space, we will need to make an ITLB entry to avoid ITLB + * miss. By executing the code linked into kernel, we take advantage + * of the ITLB entry already in place for kernel and avoid making + * a new entry. + */ + indirection_page = image->head & PAGE_MASK; + + rnk = (relocate_new_kernel_t)&code_addr; + (*rnk)(indirection_page, image->start, ia64_boot_param, + GRANULEROUNDDOWN((unsigned long) pal_addr)); + BUG(); + for (;;) + ; +} diff --git a/arch/ia64/kernel/relocate_kernel.S b/arch/ia64/kernel/relocate_kernel.S new file mode 100644 index 0000000..d3e20ad --- /dev/null +++ b/arch/ia64/kernel/relocate_kernel.S @@ -0,0 +1,359 @@ +/* + * arch/ia64/kernel/relocate_kernel.S + * + * Relocate kexec'able kernel and start it + * + * Copyright (C) 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005 Khalid Aziz + * Copyright (C) 2005 Intel Corp, Zou Nan hai + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ +#include +#include +#include +#include +#include +#include + + /* Must be relocatable PIC code callable as a C function, that once + * it starts can not use the previous processes stack. + * + */ +GLOBAL_ENTRY(relocate_new_kernel) + .prologue + alloc r31=ar.pfs,4,0,0,0 + .body +.reloc_entry: +{ + rsm psr.i| psr.ic + mov r2=ip +} + ;; +{ + flushrs // must be first insn in group + srlz.i +} + ;; + + //first switch to physical mode + add r3=1f-.reloc_entry, r2 + movl r16 = IA64_PSR_AC|IA64_PSR_BN|IA64_PSR_IC|IA64_PSR_MFL + mov ar.rsc=0 // put RSE in enforced lazy mode + ;; + add r2=(memory_stack-.reloc_entry), r2 + ;; + add sp=(memory_stack_end - .reloc_entry),r2 + add r8=(register_stack - .reloc_entry),r2 + ;; + tpa sp=sp + tpa r3=r3 + ;; + loadrs + ;; + mov r18=ar.rnat + mov ar.bspstore=r8 + ;; + mov cr.ipsr=r16 + mov cr.iip=r3 + mov cr.ifs=r0 + srlz.i + ;; + mov ar.rnat=r18 + rfi + ;; +1: + //physical mode code begin + mov b6=in1 + tpa r28=in2 // tpa must before TLB purge + + // purge all TC entries +#define O(member) IA64_CPUINFO_##member##_OFFSET + GET_THIS_PADDR(r2, cpu_info) // load phys addr of cpu_info into r2 + ;; + addl r17=O(PTCE_STRIDE),r2 + addl r2=O(PTCE_BASE),r2 + ;; + ld8 r18=[r2],(O(PTCE_COUNT)-O(PTCE_BASE));; // r18=ptce_base + ld4 r19=[r2],4 // r19=ptce_count[0] + ld4 r21=[r17],4 // r21=ptce_stride[0] + ;; + ld4 r20=[r2] // r20=ptce_count[1] + ld4 r22=[r17] // r22=ptce_stride[1] + mov r24=r0 + ;; + adds r20=-1,r20 + ;; +#undef O +2: + cmp.ltu p6,p7=r24,r19 +(p7) br.cond.dpnt.few 4f + mov ar.lc=r20 +3: + ptc.e r18 + ;; + add r18=r22,r18 + br.cloop.sptk.few 3b + ;; + add r18=r21,r18 + add r24=1,r24 + ;; + br.sptk.few 2b +4: + srlz.i + ;; + //purge TR entry for kernel text and data + movl r16=KERNEL_START + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + ;; + ptr.i r16, r18 + ptr.d r16, r18 + ;; + srlz.i + ;; + + // purge TR entry for percpu data + movl r16=PERCPU_ADDR + mov r18=PERCPU_PAGE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.d + ;; + + // purge TR entry for pal code + mov r16=in3 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.i r16,r18 + ;; + srlz.i + ;; + + // purge TR entry for stack + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r16=r19,r16 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.i + ;; + + // copy kexec kernel segments + movl r16=PAGE_MASK + ld8 r30=[in0],8;; // in0 is page_list + br.sptk.few .dest_page + ;; +.loop: + ld8 r30=[in0], 8;; +.dest_page: + tbit.z p0, p6=r30, 0;; // 0x1 dest page +(p6) and r17=r30, r16 +(p6) br.cond.sptk.few .loop;; + + tbit.z p0, p6=r30, 1;; // 0x2 indirect page +(p6) and in0=r30, r16 +(p6) br.cond.sptk.few .loop;; + + tbit.z p0, p6=r30, 2;; // 0x4 end flag +(p6) br.cond.sptk.few .end_loop;; + + tbit.z p6, p0=r30, 3;; // 0x8 source page +(p6) br.cond.sptk.few .loop + + and r18=r30, r16 + + // simple copy page, may optimize later + movl r14=PAGE_SIZE/8 - 1;; + mov ar.lc=r14;; +1: + ld8 r14=[r18], 8;; + st8 [r17]=r14, 8;; + fc.i r17 + br.ctop.sptk.few 1b + br.sptk.few .loop + ;; + +.end_loop: + sync.i // for fc.i + ;; + srlz.i + ;; + srlz.d + ;; + br.call.sptk.many b0=b6;; +memory_stack: + .fill 8192, 1, 0 +memory_stack_end: +register_stack: + .fill 8192, 1, 0 +register_stack_end: +relocate_new_kernel_end: +END(relocate_new_kernel) + +GLOBAL_ENTRY(kexec_fake_sal_rendez) + .prologue + alloc r31=ar.pfs,3,0,0,0 + .body +.rendez_entry: + rsm psr.i | psr.ic + mov r25=ip + ;; + { + flushrs + srlz.i + } + ;; + /* See where I am running, and compute gp */ + { + mov ar.rsc = 0 /* Put RSE in enforce lacy, LE mode */ + mov gp = ip /* gp == relocate_new_kernel */ + } + + movl r8=0x00000100000000 + ;; + mov cr.iva=r8 + /* Transition from virtual to physical mode */ + srlz.i + ;; + add r17=5f-.rendez_entry, r25 + movl r16=(IA64_PSR_AC | IA64_PSR_BN | IA64_PSR_IC | IA64_PSR_MFL) + ;; + tpa r17=r17 + mov cr.ipsr=r16 + ;; + mov cr.iip=r17 + mov cr.ifs=r0 + ;; + rfi + ;; +5: + mov b6=in0 /* _start addr */ + mov r8=in1 /* ap_wakeup_vector */ + mov r26=in2 /* PAL addr */ + ;; + /* Purge kernel TRs */ + movl r16=KERNEL_START + mov r18=KERNEL_TR_PAGE_SHIFT<<2 + ;; + ptr.i r16,r18 + ptr.d r16,r18 + ;; + srlz.i + ;; + srlz.d + ;; + /* Purge percpu TR */ + movl r16=PERCPU_ADDR + mov r18=PERCPU_PAGE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.d + ;; + /* Purge PAL TR */ + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.i r26,r18 + ;; + srlz.i + ;; + /* Purge stack TR */ + mov r16=IA64_KR(CURRENT_STACK) + ;; + shl r16=r16,IA64_GRANULE_SHIFT + movl r19=PAGE_OFFSET + ;; + add r16=r19,r16 + mov r18=IA64_GRANULE_SHIFT<<2 + ;; + ptr.d r16,r18 + ;; + srlz.i + ;; + + /* Ensure we can read and clear external interrupts */ + mov cr.tpr=r0 + srlz.d + + shr.u r9=r8,6 /* which irr */ + ;; + and r8=63,r8 /* bit offset into irr */ + ;; + mov r10=1;; + ;; + shl r10=r10,r8 /* bit mask off irr we want */ + cmp.eq p6,p0=0,r9 + ;; +(p6) br.cond.sptk.few check_irr0 + cmp.eq p7,p0=1,r9 + ;; +(p7) br.cond.sptk.few check_irr1 + cmp.eq p8,p0=2,r9 + ;; +(p8) br.cond.sptk.few check_irr2 + cmp.eq p9,p0=3,r9 + ;; +(p9) br.cond.sptk.few check_irr3 + +check_irr0: + mov r8=cr.irr0 + ;; + and r8=r8,r10 + ;; + cmp.eq p6,p0=0,r8 +(p6) br.cond.sptk.few check_irr0 + br.few call_start + +check_irr1: + mov r8=cr.irr1 + ;; + and r8=r8,r10 + ;; + cmp.eq p6,p0=0,r8 +(p6) br.cond.sptk.few check_irr1 + br.few call_start + +check_irr2: + mov r8=cr.irr2 + ;; + and r8=r8,r10 + ;; + cmp.eq p6,p0=0,r8 +(p6) br.cond.sptk.few check_irr2 + br.few call_start + +check_irr3: + mov r8=cr.irr3 + ;; + and r8=r8,r10 + ;; + cmp.eq p6,p0=0,r8 +(p6) br.cond.sptk.few check_irr3 + br.few call_start + +call_start: + mov cr.eoi=r0 + ;; + srlz.d + ;; + mov r8=cr.ivr + ;; + srlz.d + ;; + cmp.eq p0,p6=15,r8 +(p6) br.cond.sptk.few call_start + br.sptk.few b6 +kexec_fake_sal_rendez_end: +END(kexec_fake_sal_rendez) + + .global relocate_new_kernel_size +relocate_new_kernel_size: + data8 kexec_fake_sal_rendez_end - relocate_new_kernel + diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 657ac99..6337278 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -30,6 +30,7 @@ #include #include #include #include +#include #include #include @@ -84,6 +85,34 @@ unlock_ipi_calllock(void) spin_unlock_irq(&call_lock); } +#ifdef CONFIG_KEXEC +/* + * Stop the CPU and put it in fake SAL rendezvous. This allows CPU to wake + * up with IPI from boot processor + */ +void +kexec_stop_this_cpu (void *func) +{ + unsigned long pta, impl_va_bits, pal_base; + + /* + * Remove this CPU by putting it into fake SAL rendezvous + */ + cpu_clear(smp_processor_id(), cpu_online_map); + max_xtp(); + ia64_eoi(); + + /* Disable VHPT */ + impl_va_bits = ffz(~(local_cpu_data->unimpl_va_mask | (7UL << 61))); + pta = POW2(61) - POW2(vmlpt_bits); + ia64_set_pta(pta | (0 << 8) | (vmlpt_bits << 2) | 0); + + local_irq_disable(); + pal_base = __get_cpu_var(ia64_mca_pal_base); + kexec_fake_sal_rendez(func, ap_wakeup_vector, pal_base); +} +#endif + static void stop_this_cpu (void) { diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 643ccc6..07bd02b 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -11,6 +11,7 @@ #include #include #include #include +#include static inline void __iomem * __ioremap (unsigned long offset, unsigned long size) @@ -21,16 +22,29 @@ __ioremap (unsigned long offset, unsigne void __iomem * ioremap (unsigned long offset, unsigned long size) { - if (efi_mem_attribute_range(offset, size, EFI_MEMORY_WB)) - return phys_to_virt(offset); + u64 attr; + unsigned long gran_base, gran_size; - if (efi_mem_attribute_range(offset, size, EFI_MEMORY_UC)) + /* + * For things in kern_memmap, we must use the same attribute + * as the rest of the kernel. For more details, see + * Documentation/ia64/aliasing.txt. + */ + attr = kern_mem_attribute(offset, size); + if (attr & EFI_MEMORY_WB) + return phys_to_virt(offset); + else if (attr & EFI_MEMORY_UC) return __ioremap(offset, size); /* - * Someday this should check ACPI resources so we - * can do the right thing for hot-plugged regions. + * Some chipsets don't support UC access to memory. If + * WB is supported for the whole granule, we prefer that. */ + gran_base = GRANULEROUNDDOWN(offset); + gran_size = GRANULEROUNDUP(offset + size) - gran_base; + if (efi_mem_attribute(gran_base, gran_size) & EFI_MEMORY_WB) + return phys_to_virt(offset); + return __ioremap(offset, size); } EXPORT_SYMBOL(ioremap); @@ -38,6 +52,9 @@ EXPORT_SYMBOL(ioremap); void __iomem * ioremap_nocache (unsigned long offset, unsigned long size) { + if (kern_mem_attribute(offset, size) & EFI_MEMORY_WB) + return 0; + return __ioremap(offset, size); } EXPORT_SYMBOL(ioremap_nocache); diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c index ab829a2..30d148f 100644 --- a/arch/ia64/pci/pci.c +++ b/arch/ia64/pci/pci.c @@ -645,18 +645,31 @@ char *ia64_pci_get_legacy_mem(struct pci int pci_mmap_legacy_page_range(struct pci_bus *bus, struct vm_area_struct *vma) { + unsigned long size = vma->vm_end - vma->vm_start; + pgprot_t prot; char *addr; + /* + * Avoid attribute aliasing. See Documentation/ia64/aliasing.txt + * for more details. + */ + if (!valid_mmap_phys_addr_range(vma->vm_pgoff << PAGE_SHIFT, size)) + return -EINVAL; + prot = phys_mem_access_prot(NULL, vma->vm_pgoff, size, + vma->vm_page_prot); + if (pgprot_val(prot) != pgprot_val(pgprot_noncached(vma->vm_page_prot))) + return -EINVAL; + addr = pci_get_legacy_mem(bus); if (IS_ERR(addr)) return PTR_ERR(addr); vma->vm_pgoff += (unsigned long)addr >> PAGE_SHIFT; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + vma->vm_page_prot = prot; vma->vm_flags |= (VM_SHM | VM_RESERVED | VM_IO); if (remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, vma->vm_page_prot)) + size, vma->vm_page_prot)) return -EAGAIN; return 0; diff --git a/include/asm-ia64/io.h b/include/asm-ia64/io.h index c2e3742..781ee2c 100644 --- a/include/asm-ia64/io.h +++ b/include/asm-ia64/io.h @@ -88,6 +88,7 @@ phys_to_virt (unsigned long address) } #define ARCH_HAS_VALID_PHYS_ADDR_RANGE +extern u64 kern_mem_attribute (unsigned long phys_addr, unsigned long size); extern int valid_phys_addr_range (unsigned long addr, size_t count); /* efi.c */ extern int valid_mmap_phys_addr_range (unsigned long addr, size_t count); diff --git a/include/asm-ia64/kexec.h b/include/asm-ia64/kexec.h new file mode 100644 index 0000000..e9f0b7a --- /dev/null +++ b/include/asm-ia64/kexec.h @@ -0,0 +1,36 @@ +#ifndef _ASM_IA64_KEXEC_H +#define _ASM_IA64_KEXEC_H + + +/* Maximum physical address we can use pages from */ +#define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) +/* Maximum address we can reach in physical address mode */ +#define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL) +/* Maximum address we can use for the control code buffer */ +#define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE + +#define KEXEC_CONTROL_CODE_SIZE (8192 + 8192 + 4096) + +/* The native architecture */ +#define KEXEC_ARCH KEXEC_ARCH_IA_64 + +#define MAX_NOTE_BYTES 1024 + +#define pte_bits 3 +#define vmlpt_bits (impl_va_bits - PAGE_SHIFT + pte_bits) +#define POW2(n) (1ULL << (n)) + +DECLARE_PER_CPU(u64, ia64_mca_pal_base); + +const extern unsigned int relocate_new_kernel_size; +volatile extern long kexec_rendez; +extern void relocate_new_kernel(unsigned long, unsigned long, + struct ia64_boot_param *, unsigned long); +extern void kexec_fake_sal_rendez(void *start, unsigned long wake_up, + unsigned long pal_base); + +static inline void +crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) +{ +} +#endif /* _ASM_IA64_KEXEC_H */ diff --git a/include/asm-ia64/machvec_hpzx1.h b/include/asm-ia64/machvec_hpzx1.h index e90daf9..2ae54c3 100644 --- a/include/asm-ia64/machvec_hpzx1.h +++ b/include/asm-ia64/machvec_hpzx1.h @@ -34,4 +34,6 @@ #define platform_dma_sync_sg_for_device #define platform_dma_supported sba_dma_supported #define platform_dma_mapping_error sba_dma_mapping_error +extern void ioc_iova_disable(void); + #endif /* _ASM_IA64_MACHVEC_HPZX1_h */ diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h index c0f8144..90f3a23 100644 --- a/include/asm-ia64/pgtable.h +++ b/include/asm-ia64/pgtable.h @@ -317,22 +317,20 @@ #define pte_mkdirty(pte) (__pte(pte_val( #define pte_mkhuge(pte) (__pte(pte_val(pte))) /* - * Macro to a page protection value as "uncacheable". Note that "protection" is really a - * misnomer here as the protection value contains the memory attribute bits, dirty bits, - * and various other bits as well. + * Make page protection values cacheable, uncacheable, or write- + * combining. Note that "protection" is really a misnomer here as the + * protection value contains the memory attribute bits, dirty bits, and + * various other bits as well. */ +#define pgprot_cacheable(prot) __pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WB) #define pgprot_noncached(prot) __pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_UC) - -/* - * Macro to make mark a page protection value as "write-combining". - * Note that "protection" is really a misnomer here as the protection - * value contains the memory attribute bits, dirty bits, and various - * other bits as well. Accesses through a write-combining translation - * works bypasses the caches, but does allow for consecutive writes to - * be combined into single (but larger) write transactions. - */ #define pgprot_writecombine(prot) __pgprot((pgprot_val(prot) & ~_PAGE_MA_MASK) | _PAGE_MA_WC) +struct file; +extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +#define __HAVE_PHYS_MEM_ACCESS_PROT + static inline unsigned long pgd_index (unsigned long address) { diff --git a/include/asm-ia64/smp.h b/include/asm-ia64/smp.h index a391435..1d0760f 100644 --- a/include/asm-ia64/smp.h +++ b/include/asm-ia64/smp.h @@ -129,6 +129,9 @@ extern void smp_send_reschedule (int cpu extern void lock_ipi_calllock(void); extern void unlock_ipi_calllock(void); extern void identify_siblings (struct cpuinfo_ia64 *); +#ifdef CONFIG_KEXEC +extern void kexec_stop_this_cpu(void *); +#endif #else diff --git a/include/linux/efi.h b/include/linux/efi.h index e203613..66d621d 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -294,6 +294,7 @@ extern void efi_enter_virtual_mode (void extern u64 efi_get_iobase (void); extern u32 efi_mem_type (unsigned long phys_addr); extern u64 efi_mem_attributes (unsigned long phys_addr); +extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); extern int efi_mem_attribute_range (unsigned long phys_addr, unsigned long size, u64 attr); extern int __init efi_uart_console_only (void);