Index: linux/Documentation/gpio.txt =================================================================== --- linux.orig/Documentation/gpio.txt +++ linux/Documentation/gpio.txt @@ -105,12 +105,15 @@ setting up a platform_device using the G /* set as input or output, returning 0 or negative errno */ int gpio_direction_input(unsigned gpio); - int gpio_direction_output(unsigned gpio); + int gpio_direction_output(unsigned gpio, int value); The return value is zero for success, else a negative errno. It should be checked, since the get/set calls don't have error returns and since misconfiguration is possible. (These calls could sleep.) +For output GPIOs, the value provided becomes the initial output value. +This helps avoid signal glitching during system startup. + Setting the direction can fail if the GPIO number is invalid, or when that particular GPIO can't be used in that mode. It's generally a bad idea to rely on boot firmware to have set the direction correctly, since Index: linux/Documentation/kernel-parameters.txt =================================================================== --- linux.orig/Documentation/kernel-parameters.txt +++ linux/Documentation/kernel-parameters.txt @@ -165,6 +165,10 @@ and is between 256 and 4096 characters. acpi_osi= [HW,ACPI] empty param disables _OSI + acpi_simulate_suspend_to_ram + [KNL] Do not call into BIOS to do suspend-to-RAM + Format: <0/1> + acpi_serialize [HW,ACPI] force serialization of AML methods acpi_skip_timer_override [HW,ACPI] @@ -1117,6 +1121,8 @@ and is between 256 and 4096 characters. nolapic [IA-32,APIC] Do not enable or use the local APIC. + nolapic_timer [IA-32,APIC] Do not use the local APIC timer. + noltlbs [PPC] Do not use large page/tlb entries for kernel lowmem mapping on PPC40x. Index: linux/Documentation/sound/alsa/ALSA-Configuration.txt =================================================================== --- linux.orig/Documentation/sound/alsa/ALSA-Configuration.txt +++ linux/Documentation/sound/alsa/ALSA-Configuration.txt @@ -866,6 +866,7 @@ Prior to version 0.9.0rc4 options had a basic 3-jack (default) hp HP nx6320 thinkpad Lenovo Thinkpad T60/X60/Z60 + toshiba Toshiba U205 AD1986A 6stack 6-jack, separate surrounds (default) @@ -906,7 +907,8 @@ Prior to version 0.9.0rc4 options had a 5stack D945 5stack + SPDIF macmini Intel Mac Mini macbook Intel Mac Book - macbook-pro Intel Mac Book Pro + macbook-pro-v1 Intel Mac Book Pro 1st generation + macbook-pro Intel Mac Book Pro 2nd generation STAC9202/9250/9251 ref Reference board, base config Index: linux/Documentation/stable_api_nonsense.txt =================================================================== --- linux.orig/Documentation/stable_api_nonsense.txt +++ linux/Documentation/stable_api_nonsense.txt @@ -62,6 +62,9 @@ consider the following facts about the L - different structures can contain different fields - Some functions may not be implemented at all, (i.e. some locks compile away to nothing for non-SMP builds.) + - Parameter passing of variables from function to function can be + done in different ways (the CONFIG_REGPARM option controls + this.) - Memory within the kernel can be aligned in different ways, depending on the build options. - Linux runs on a wide range of different processor architectures. 
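For illustration of the two-argument gpio_direction_output() documented in the Documentation/gpio.txt change above, a minimal sketch of a caller; the GPIO number (42) and the init function are hypothetical, not taken from this patch:

	/* Drive a (hypothetical) mux-select line high from the very moment
	 * it becomes an output, so downstream logic never sees a transient
	 * low level during boot -- the glitch the text above describes.
	 */
	static int __init example_board_init(void)
	{
		int err;

		err = gpio_direction_output(42, 1);	/* initial value = 1 */
		if (err)
			return err;	/* invalid GPIO, or mode unavailable */

		gpio_set_value(42, 0);	/* later transitions work as before */
		return 0;
	}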
Index: linux/MAINTAINERS =================================================================== --- linux.orig/MAINTAINERS +++ linux/MAINTAINERS @@ -3103,6 +3103,9 @@ TPM DEVICE DRIVER P: Kylene Hall M: kjhall@us.ibm.com W: http://tpmdd.sourceforge.net +P: Marcel Selhorst +M: tpm@selhorst.net +W: http://www.prosec.rub.de/tpm/ L: tpmdd-devel@lists.sourceforge.net S: Maintained Index: linux/Makefile =================================================================== --- linux.orig/Makefile +++ linux/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 21 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc4-rt0 NAME = Nocturnal Monster Puppy # *DOCUMENTATION* @@ -490,10 +490,19 @@ endif include $(srctree)/arch/$(ARCH)/Makefile -ifdef CONFIG_FRAME_POINTER -CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) +ifdef CONFIG_MCOUNT +CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) else -CFLAGS += -fomit-frame-pointer + ifdef CONFIG_FRAME_POINTER + CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,) + else + CFLAGS += -fomit-frame-pointer + endif +endif + +ifdef CONFIG_UNWIND_INFO +CFLAGS += -fasynchronous-unwind-tables +LDFLAGS_vmlinux += --eh-frame-hdr endif ifdef CONFIG_DEBUG_INFO Index: linux/arch/arm/Kconfig =================================================================== --- linux.orig/arch/arm/Kconfig +++ linux/arch/arm/Kconfig @@ -29,6 +29,10 @@ config GENERIC_TIME bool default n +config GENERIC_CLOCKEVENTS + bool + default n + config MMU bool default y @@ -206,6 +210,8 @@ config ARCH_EP93XX bool "EP93xx-based" select ARM_AMBA select ARM_VIC + select GENERIC_TIME + select GENERIC_CLOCKEVENTS help This enables support for the Cirrus EP93xx series of CPUs. @@ -219,6 +225,8 @@ config ARCH_FOOTBRIDGE config ARCH_NETX bool "Hilscher NetX based" select ARM_VIC + select GENERIC_TIME + select GENERIC_CLOCKEVENTS help This enables support for systems based on the Hilscher NetX Soc @@ -230,6 +238,8 @@ config ARCH_H720X config ARCH_IMX bool "IMX" + select GENERIC_TIME + select GENERIC_CLOCKEVENTS help Support for Motorola's i.MX family of processors (MX1, MXL). @@ -262,6 +272,7 @@ config ARCH_IXP4XX bool "IXP4xx-based" depends on MMU select GENERIC_TIME + select GENERIC_CLOCKEVENTS help Support for Intel's IXP4XX (XScale) family of processors. @@ -308,6 +319,7 @@ config ARCH_PNX4008 config ARCH_PXA bool "PXA2xx-based" depends on MMU + select GENERIC_CLOCKEVENTS select ARCH_MTD_XIP select GENERIC_GPIO select GENERIC_TIME @@ -363,6 +375,7 @@ config ARCH_LH7A40X config ARCH_OMAP bool "TI OMAP" select GENERIC_GPIO + select GENERIC_TIME help Support for TI's OMAP platform (OMAP1 and OMAP2). @@ -513,6 +527,8 @@ endmenu menu "Kernel Features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric Multi-Processing (EXPERIMENTAL)" depends on EXPERIMENTAL && REALVIEW_MPCORE @@ -557,21 +573,11 @@ config LOCAL_TIMERS accounting to be spread across the timer interval, preventing a "thundering herd" at every timer tick. -config PREEMPT - bool "Preemptible Kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load.
- - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +source kernel/Kconfig.preempt config NO_IDLE_HZ bool "Dynamic tick timer" + depends on !GENERIC_CLOCKEVENTS help Select this option if you want to disable continuous timer ticks and have them programmed to occur as required. This option saves Index: linux/arch/arm/boot/compressed/head.S =================================================================== --- linux.orig/arch/arm/boot/compressed/head.S +++ linux/arch/arm/boot/compressed/head.S @@ -833,6 +833,19 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux/arch/arm/common/time-acorn.c =================================================================== --- linux.orig/arch/arm/common/time-acorn.c +++ linux/arch/arm/common/time-acorn.c @@ -77,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; Index: linux/arch/arm/kernel/dma.c =================================================================== --- linux.orig/arch/arm/kernel/dma.c +++ linux/arch/arm/kernel/dma.c @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; Index: linux/arch/arm/kernel/entry-armv.S =================================================================== --- linux.orig/arch/arm/kernel/entry-armv.S +++ linux/arch/arm/kernel/entry-armv.S @@ -204,7 +204,7 @@ __irq_svc: irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -235,7 +235,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif Index: linux/arch/arm/kernel/entry-common.S =================================================================== --- linux.orig/arch/arm/kernel/entry-common.S +++ linux/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * FUNCTION_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -44,7 +46,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -54,7 +56,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. 
*/ @@ -394,6 +397,112 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. + */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif Index: linux/arch/arm/kernel/fiq.c =================================================================== --- linux.orig/arch/arm/kernel/fiq.c +++ linux/arch/arm/kernel/fiq.c @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. 
*/ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux/arch/arm/kernel/irq.c =================================================================== --- linux.orig/arch/arm/kernel/irq.c +++ linux/arch/arm/kernel/irq.c @@ -101,7 +101,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -109,11 +109,13 @@ static struct irq_desc bad_irq_desc = { * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); struct irq_desc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. Index: linux/arch/arm/kernel/process.c =================================================================== --- linux.orig/arch/arm/kernel/process.c +++ linux/arch/arm/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -129,7 +130,7 @@ static void default_idle(void) cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -160,12 +161,16 @@ void cpu_idle(void) if (!idle) idle = default_idle; leds_event(led_idle_start); - while (!need_resched()) + tick_nohz_stop_sched_tick(); + while (!need_resched() && !need_resched_delayed()) idle(); leds_event(led_idle_end); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + tick_nohz_restart_sched_tick(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux/arch/arm/kernel/semaphore.c =================================================================== --- linux.orig/arch/arm/kernel/semaphore.c +++ linux/arch/arm/kernel/semaphore.c @@ -49,14 +49,16 @@ * we cannot lose wakeup events. */ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. 
@@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __sched __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int __sched compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux/arch/arm/kernel/setup.c =================================================================== --- linux.orig/arch/arm/kernel/setup.c +++ linux/arch/arm/kernel/setup.c @@ -839,8 +839,11 @@ static int __init topology_init(void) { int cpu; - for_each_possible_cpu(cpu) - register_cpu(&per_cpu(cpu_data, cpu).cpu, cpu); + for_each_possible_cpu(cpu) { + struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu); + cpuinfo->cpu.hotpluggable = 1; + register_cpu(&cpuinfo->cpu, cpu); + } return 0; } Index: linux/arch/arm/kernel/signal.c =================================================================== --- linux.orig/arch/arm/kernel/signal.c +++ linux/arch/arm/kernel/signal.c @@ -632,6 +632,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/arm/kernel/smp.c =================================================================== --- linux.orig/arch/arm/kernel/smp.c +++ linux/arch/arm/kernel/smp.c @@ -521,7 +521,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() Index: linux/arch/arm/kernel/time.c =================================================================== --- linux.orig/arch/arm/kernel/time.c +++ linux/arch/arm/kernel/time.c @@ -236,6 +236,13 @@ static inline void do_leds(void) #define do_leds() #endif +void arch_tick_leds(void) +{ +#ifdef CONFIG_LEDS_TIMER + do_leds(); +#endif +} + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { Index: linux/arch/arm/kernel/traps.c =================================================================== --- linux.orig/arch/arm/kernel/traps.c +++ 
linux/arch/arm/kernel/traps.c @@ -170,6 +170,7 @@ void dump_stack(void) { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); #endif } @@ -210,7 +211,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. @@ -246,7 +247,7 @@ void notify_die(const char *str, struct } static LIST_HEAD(undef_hook); -static DEFINE_SPINLOCK(undef_lock); +static DEFINE_RAW_SPINLOCK(undef_lock); void register_undef_hook(struct undef_hook *hook) { Index: linux/arch/arm/mach-at91/gpio.c =================================================================== --- linux.orig/arch/arm/mach-at91/gpio.c +++ linux/arch/arm/mach-at91/gpio.c @@ -215,13 +215,14 @@ int gpio_direction_input(unsigned pin) } EXPORT_SYMBOL(gpio_direction_input); -int gpio_direction_output(unsigned pin) +int gpio_direction_output(unsigned pin, int value) { void __iomem *pio = pin_to_controller(pin); unsigned mask = pin_to_mask(pin); if (!pio || !(__raw_readl(pio + PIO_PSR) & mask)) return -EINVAL; + __raw_writel(mask, pio + (value ? PIO_SODR : PIO_CODR)); __raw_writel(mask, pio + PIO_OER); return 0; } Index: linux/arch/arm/mach-footbridge/netwinder-hw.c =================================================================== --- linux.orig/arch/arm/mach-footbridge/netwinder-hw.c +++ linux/arch/arm/mach-footbridge/netwinder-hw.c @@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int /* * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE */ -DEFINE_SPINLOCK(gpio_lock); +DEFINE_RAW_SPINLOCK(gpio_lock); static unsigned int current_gpio_op; static unsigned int current_gpio_io; Index: linux/arch/arm/mach-footbridge/netwinder-leds.c =================================================================== --- linux.orig/arch/arm/mach-footbridge/netwinder-leds.c +++ linux/arch/arm/mach-footbridge/netwinder-leds.c @@ -32,7 +32,7 @@ static char led_state; static char hw_led_state; static DEFINE_SPINLOCK(leds_lock); -extern spinlock_t gpio_lock; +extern raw_spinlock_t gpio_lock; static void netwinder_leds_event(led_event_t evt) { Index: linux/arch/arm/mach-imx/time.c =================================================================== --- linux.orig/arch/arm/mach-imx/time.c +++ linux/arch/arm/mach-imx/time.c @@ -15,6 +15,9 @@ #include #include #include +#ifdef CONFIG_GENERIC_CLOCKEVENTS +#include +#endif #include #include @@ -25,6 +28,11 @@ /* Use timer 1 as system timer */ #define TIMER_BASE IMX_TIM1_BASE +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static struct clock_event_device clockevent_imx; +static enum clock_event_mode clockevent_mode = CLOCK_EVT_MODE_PERIODIC; +#endif + static unsigned long evt_diff; /* @@ -33,6 +41,7 @@ static unsigned long evt_diff; static irqreturn_t imx_timer_interrupt(int irq, void *dev_id) { + unsigned long tcmp; uint32_t tstat; /* clear the interrupt */ @@ -42,13 +51,20 @@ imx_timer_interrupt(int irq, void *dev_i if (tstat & TSTAT_COMP) { do { +#ifdef CONFIG_GENERIC_CLOCKEVENTS + if (clockevent_imx.event_handler) + clockevent_imx.event_handler(&clockevent_imx); + if (likely(clockevent_mode != CLOCK_EVT_MODE_PERIODIC)) + break; +#else write_seqlock(&xtime_lock); timer_tick(); write_sequnlock(&xtime_lock); - IMX_TCMP(TIMER_BASE) += evt_diff; +#endif + tcmp = IMX_TCMP(TIMER_BASE) + evt_diff; + IMX_TCMP(TIMER_BASE) = tcmp; - } while (unlikely((int32_t)(IMX_TCMP(TIMER_BASE) - - IMX_TCN(TIMER_BASE)) < 0)); + } while (unlikely((int32_t)(tcmp - IMX_TCN(TIMER_BASE)) < 0)); } return IRQ_HANDLED; @@ -70,7 +86,7 @@ static 
void __init imx_timer_hardware_in */ IMX_TCTL(TIMER_BASE) = 0; IMX_TPRER(TIMER_BASE) = 0; - IMX_TCMP(TIMER_BASE) = LATCH - 1; + IMX_TCMP(TIMER_BASE) = IMX_TCN(TIMER_BASE) + LATCH; IMX_TCTL(TIMER_BASE) = TCTL_FRR | TCTL_CLK_PCLK1 | TCTL_IRQEN | TCTL_TEN; evt_diff = LATCH; @@ -99,11 +115,116 @@ static int __init imx_clocksource_init(v return 0; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS + +static int imx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + unsigned long tcmp; + evt_diff = evt; + + tcmp = IMX_TCN(TIMER_BASE) + evt; + IMX_TCMP(TIMER_BASE) = tcmp; + + return unlikely((int32_t)(tcmp - IMX_TCN(TIMER_BASE)) < 0) ? -ETIME : 0; +} + +#ifdef DEBUG +static const char *clock_event_mode_label[]={ + [CLOCK_EVT_MODE_PERIODIC] = "CLOCK_EVT_MODE_PERIODIC", + [CLOCK_EVT_MODE_ONESHOT] = "CLOCK_EVT_MODE_ONESHOT", + [CLOCK_EVT_MODE_SHUTDOWN] = "CLOCK_EVT_MODE_SHUTDOWN", + [CLOCK_EVT_MODE_UNUSED] = "CLOCK_EVT_MODE_UNUSED" +}; +#endif /*DEBUG*/ + +static void imx_set_mode(enum clock_event_mode mode, struct clock_event_device *evt) +{ + unsigned long flags; + + /* + * The timer interrupt generation is disabled for at least + * enough time to call imx_set_next_event() + */ + local_irq_save(flags); + /* Disable interrupt in GPT module */ + IMX_TCTL(TIMER_BASE) &= ~TCTL_IRQEN; + if ((mode != CLOCK_EVT_MODE_PERIODIC) || (mode != clockevent_mode)) { + /* Set event time into far-far future */ + IMX_TCMP(TIMER_BASE) = IMX_TCN(TIMER_BASE) - 3; + /* Clear pending interrupt */ + IMX_TSTAT(TIMER_BASE) &= ~TSTAT_COMP; + } + +#ifdef DEBUG + printk(KERN_INFO "imx_set_mode: changing mode from %s to %s\n", + clock_event_mode_label[clockevent_mode], clock_event_mode_label[mode]); +#endif /*DEBUG*/ + + /* Remember timer mode */ + clockevent_mode = mode; + local_irq_restore(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* It seems that periodic mode expects the old ugly LATCH period */ + evt_diff = LATCH; + IMX_TCMP(TIMER_BASE) = IMX_TCN(TIMER_BASE) + evt_diff; + /* fall through: periodic mode needs the interrupt enabled too */ + case CLOCK_EVT_MODE_ONESHOT: + /* + * Do not put overhead of interrupt enable/disable into + * imx_set_next_event(), the core has about 4 minutes + * to call imx_set_next_event() or shutdown clock after + * mode switching + */ + local_irq_save(flags); + IMX_TCTL(TIMER_BASE) |= TCTL_IRQEN; + local_irq_restore(flags); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + /* Leave event sources disabled; no more interrupts will appear */ + break; + } +} + +static struct clock_event_device clockevent_imx = { + .name = "imx_timer1", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 32, + .set_mode = imx_set_mode, + .set_next_event = imx_set_next_event, + .rating = 200, +}; + +static int __init imx_clockevent_init(void) +{ + clockevent_imx.mult = div_sc(imx_get_perclk1(), NSEC_PER_SEC, + clockevent_imx.shift); + clockevent_imx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_imx); + clockevent_imx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_imx); + + clockevent_imx.cpumask = cpumask_of_cpu(0); + + clockevents_register_device(&clockevent_imx); + + return 0; +} +#endif + + static void __init imx_timer_init(void) { imx_timer_hardware_init(); imx_clocksource_init(); +#ifdef CONFIG_GENERIC_CLOCKEVENTS + imx_clockevent_init(); +#endif + /* * Make irqs happen for the system timer */ Index: linux/arch/arm/mach-integrator/core.c =================================================================== --- linux.orig/arch/arm/mach-integrator/core.c +++ linux/arch/arm/mach-integrator/core.c
@@ -164,7 +164,7 @@ static struct amba_pl010_data integrator #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. Index: linux/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux.orig/arch/arm/mach-integrator/pci_v3.c +++ linux/arch/arm/mach-integrator/pci_v3.c @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M Index: linux/arch/arm/mach-integrator/platsmp.c =================================================================== --- linux.orig/arch/arm/mach-integrator/platsmp.c +++ linux/arch/arm/mach-integrator/platsmp.c @@ -30,7 +30,7 @@ extern void integrator_secondary_startup volatile int __cpuinitdata pen_release = -1; unsigned long __cpuinitdata phys_pen_release = 0; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit platform_secondary_init(unsigned int cpu) { Index: linux/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux.orig/arch/arm/mach-ixp4xx/common-pci.c +++ linux/arch/arm/mach-ixp4xx/common-pci.c @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. */ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space Index: linux/arch/arm/mach-ixp4xx/common.c =================================================================== --- linux.orig/arch/arm/mach-ixp4xx/common.c +++ linux/arch/arm/mach-ixp4xx/common.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,8 @@ #include static int __init ixp4xx_clocksource_init(void); +static int __init ixp4xx_clockevent_init(void); +static struct clock_event_device clockevent_ixp4xx; /************************************************************************* * IXP4xx chipset I/O mapping @@ -239,52 +242,40 @@ void __init ixp4xx_init_irq(void) * counter as a source of real clock ticks to account for missed jiffies. 
*************************************************************************/ -static unsigned volatile last_jiffy_time; - -#define CLOCK_TICKS_PER_USEC ((CLOCK_TICK_RATE + USEC_PER_SEC/2) / USEC_PER_SEC) - static irqreturn_t ixp4xx_timer_interrupt(int irq, void *dev_id) { - write_seqlock(&xtime_lock); + struct clock_event_device *evt = &clockevent_ixp4xx; /* Clear Pending Interrupt by writing '1' to it */ *IXP4XX_OSST = IXP4XX_OSST_TIMER_1_PEND; - /* - * Catch up with the real idea of time - */ - while ((signed long)(*IXP4XX_OSTS - last_jiffy_time) >= LATCH) { - timer_tick(); - last_jiffy_time += LATCH; - } - - write_sequnlock(&xtime_lock); + evt->event_handler(evt); return IRQ_HANDLED; } static struct irqaction ixp4xx_timer_irq = { - .name = "IXP4xx Timer Tick", + .name = "timer1", .flags = IRQF_DISABLED | IRQF_TIMER, .handler = ixp4xx_timer_interrupt, }; static void __init ixp4xx_timer_init(void) { + /* Reset/disable counter */ + *IXP4XX_OSRT1 = 0; + /* Clear Pending Interrupt by writing '1' to it */ *IXP4XX_OSST = IXP4XX_OSST_TIMER_1_PEND; - /* Setup the Timer counter value */ - *IXP4XX_OSRT1 = (LATCH & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE; - /* Reset time-stamp counter */ *IXP4XX_OSTS = 0; - last_jiffy_time = 0; /* Connect the interrupt handler and enable the interrupt */ setup_irq(IRQ_IXP4XX_TIMER1, &ixp4xx_timer_irq); ixp4xx_clocksource_init(); + ixp4xx_clockevent_init(); } struct sys_timer ixp4xx_timer = { @@ -384,6 +375,9 @@ void __init ixp4xx_sys_init(void) ixp4xx_exp_bus_size >> 20); } +/* + * clocksource + */ cycle_t ixp4xx_get_cycles(void) { return *IXP4XX_OSTS; @@ -408,3 +402,64 @@ static int __init ixp4xx_clocksource_ini return 0; } + +/* + * clockevents + */ +static int ixp4xx_set_next_event(unsigned long evt, + struct clock_event_device *unused) +{ + unsigned long opts = *IXP4XX_OSRT1 & IXP4XX_OST_RELOAD_MASK; + + *IXP4XX_OSRT1 = (evt & ~IXP4XX_OST_RELOAD_MASK) | opts; + + return 0; +} + +static void ixp4xx_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long opts, osrt = *IXP4XX_OSRT1 & ~IXP4XX_OST_RELOAD_MASK; + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + osrt = LATCH & ~IXP4XX_OST_RELOAD_MASK; + opts = IXP4XX_OST_ENABLE; + break; + case CLOCK_EVT_MODE_ONESHOT: + /* period set by 'set next_event' */ + osrt = 0; + opts = IXP4XX_OST_ENABLE | IXP4XX_OST_ONE_SHOT; + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + default: + osrt = opts = 0; + break; + } + + *IXP4XX_OSRT1 = osrt | opts; +} + +static struct clock_event_device clockevent_ixp4xx = { + .name = "ixp4xx timer1", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .rating = 200, + .shift = 24, + .set_mode = ixp4xx_set_mode, + .set_next_event = ixp4xx_set_next_event, +}; + +static int __init ixp4xx_clockevent_init(void) +{ + clockevent_ixp4xx.mult = div_sc(FREQ, NSEC_PER_SEC, + clockevent_ixp4xx.shift); + clockevent_ixp4xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx); + clockevent_ixp4xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ixp4xx); + clockevent_ixp4xx.cpumask = cpumask_of_cpu(0); + + clockevents_register_device(&clockevent_ixp4xx); + return 0; +} Index: linux/arch/arm/mach-omap1/pm.c =================================================================== --- linux.orig/arch/arm/mach-omap1/pm.c +++ linux/arch/arm/mach-omap1/pm.c @@ -120,7 +120,7 @@ void omap_pm_idle(void) local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { 
local_fiq_enable(); local_irq_enable(); return; Index: linux/arch/arm/mach-omap1/time.c =================================================================== --- linux.orig/arch/arm/mach-omap1/time.c +++ linux/arch/arm/mach-omap1/time.c @@ -39,6 +39,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -48,13 +52,7 @@ #include #include -struct sys_timer omap_timer; -/* - * --------------------------------------------------------------------------- - * MPU timer - * --------------------------------------------------------------------------- - */ #define OMAP_MPU_TIMER_BASE OMAP_MPU_TIMER1_BASE #define OMAP_MPU_TIMER_OFFSET 0x100 @@ -88,21 +86,6 @@ static inline unsigned long long cycles_ return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR; } -/* - * MPU_TICKS_PER_SEC must be an even number, otherwise machinecycles_to_usecs - * will break. On P2, the timer count rate is 6.5 MHz after programming PTV - * with 0. This divides the 13MHz input by 2, and is undocumented. - */ -#if defined(CONFIG_MACH_OMAP_PERSEUS2) || defined(CONFIG_MACH_OMAP_FSAMPLE) -/* REVISIT: This ifdef construct should be replaced by a query to clock - * framework to see if timer base frequency is 12.0, 13.0 or 19.2 MHz. - */ -#define MPU_TICKS_PER_SEC (13000000 / 2) -#else -#define MPU_TICKS_PER_SEC (12000000 / 2) -#endif - -#define MPU_TIMER_TICK_PERIOD ((MPU_TICKS_PER_SEC / HZ) - 1) typedef struct { u32 cntl; /* CNTL_TIMER, R/W */ @@ -120,98 +103,164 @@ static inline unsigned long omap_mpu_tim return timer->read_tim; } -static inline void omap_mpu_timer_start(int nr, unsigned long load_val) +static inline void omap_mpu_set_autoreset(int nr) +{ + volatile omap_mpu_timer_regs_t* timer = omap_mpu_timer_base(nr); + + timer->cntl = timer->cntl | MPU_TIMER_AR; +} + +static inline void omap_mpu_remove_autoreset(int nr) +{ + volatile omap_mpu_timer_regs_t* timer = omap_mpu_timer_base(nr); + + timer->cntl = timer->cntl & ~MPU_TIMER_AR; +} + +static inline void omap_mpu_timer_start(int nr, unsigned long load_val, + int autoreset) { volatile omap_mpu_timer_regs_t* timer = omap_mpu_timer_base(nr); + unsigned int timerflags = (MPU_TIMER_CLOCK_ENABLE | MPU_TIMER_ST); + + if (autoreset) timerflags |= MPU_TIMER_AR; timer->cntl = MPU_TIMER_CLOCK_ENABLE; udelay(1); timer->load_tim = load_val; udelay(1); - timer->cntl = (MPU_TIMER_CLOCK_ENABLE | MPU_TIMER_AR | MPU_TIMER_ST); + timer->cntl = timerflags; +} + +/* + * --------------------------------------------------------------------------- + * MPU timer 1 ... 
count down to zero, interrupt, reload + * --------------------------------------------------------------------------- + */ +static int omap_mpu_set_next_event(unsigned long cycles, + struct clock_event_device *evt) +{ + omap_mpu_timer_start(0, cycles, 0); + return 0; } -unsigned long omap_mpu_timer_ticks_to_usecs(unsigned long nr_ticks) +static void omap_mpu_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + omap_mpu_set_autoreset(0); + break; + case CLOCK_EVT_MODE_ONESHOT: + omap_mpu_remove_autoreset(0); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + break; + } +} + +static struct clock_event_device clockevent_mpu_timer1 = { + .name = "mpu_timer1", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .shift = 32, + .set_next_event = omap_mpu_set_next_event, + .set_mode = omap_mpu_set_mode, +}; + +static irqreturn_t omap_mpu_timer1_interrupt(int irq, void *dev_id) { - unsigned long long nsec; + struct clock_event_device *evt = &clockevent_mpu_timer1; + + evt->event_handler(evt); - nsec = cycles_2_ns((unsigned long long)nr_ticks); - return (unsigned long)nsec / 1000; + return IRQ_HANDLED; } -/* - * Last processed system timer interrupt - */ -static unsigned long omap_mpu_timer_last = 0; +static struct irqaction omap_mpu_timer1_irq = { + .name = "mpu_timer1", + .flags = IRQF_DISABLED | IRQF_TIMER, + .handler = omap_mpu_timer1_interrupt, +}; -/* - * Returns elapsed usecs since last system timer interrupt - */ -static unsigned long omap_mpu_timer_gettimeoffset(void) +static __init void omap_init_mpu_timer(unsigned long rate) { - unsigned long now = 0 - omap_mpu_timer_read(0); - unsigned long elapsed = now - omap_mpu_timer_last; + set_cyc2ns_scale(rate / 1000); + + setup_irq(INT_TIMER1, &omap_mpu_timer1_irq); + omap_mpu_timer_start(0, (rate / HZ) - 1, 1); + + clockevent_mpu_timer1.mult = div_sc(rate, NSEC_PER_SEC, + clockevent_mpu_timer1.shift); + clockevent_mpu_timer1.max_delta_ns = + clockevent_delta2ns(-1, &clockevent_mpu_timer1); + clockevent_mpu_timer1.min_delta_ns = + clockevent_delta2ns(1, &clockevent_mpu_timer1); - return omap_mpu_timer_ticks_to_usecs(elapsed); + clockevent_mpu_timer1.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_mpu_timer1); } + /* - * Elapsed time between interrupts is calculated using timer0. - * Latency during the interrupt is calculated using timer1. - * Both timer0 and timer1 are counting at 6MHz (P2 6.5MHz). + * --------------------------------------------------------------------------- + * MPU timer 2 ...
free running 32-bit clock source and scheduler clock + * --------------------------------------------------------------------------- */ -static irqreturn_t omap_mpu_timer_interrupt(int irq, void *dev_id) -{ - unsigned long now, latency; - write_seqlock(&xtime_lock); - now = 0 - omap_mpu_timer_read(0); - latency = MPU_TICKS_PER_SEC / HZ - omap_mpu_timer_read(1); - omap_mpu_timer_last = now - latency; - timer_tick(); - write_sequnlock(&xtime_lock); +static unsigned long omap_mpu_timer2_overflows; +static irqreturn_t omap_mpu_timer2_interrupt(int irq, void *dev_id) +{ + omap_mpu_timer2_overflows++; return IRQ_HANDLED; } -static struct irqaction omap_mpu_timer_irq = { - .name = "mpu timer", - .flags = IRQF_DISABLED | IRQF_TIMER, - .handler = omap_mpu_timer_interrupt, +static struct irqaction omap_mpu_timer2_irq = { + .name = "mpu_timer2", + .flags = IRQF_DISABLED, + .handler = omap_mpu_timer2_interrupt, }; -static unsigned long omap_mpu_timer1_overflows; -static irqreturn_t omap_mpu_timer1_interrupt(int irq, void *dev_id) +static cycle_t mpu_read(void) { - omap_mpu_timer1_overflows++; - return IRQ_HANDLED; + return ~omap_mpu_timer_read(1); } -static struct irqaction omap_mpu_timer1_irq = { - .name = "mpu timer1 overflow", - .flags = IRQF_DISABLED, - .handler = omap_mpu_timer1_interrupt, +static struct clocksource clocksource_mpu = { + .name = "mpu_timer2", + .rating = 300, + .read = mpu_read, + .mask = CLOCKSOURCE_MASK(32), + .shift = 24, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static __init void omap_init_mpu_timer(void) +static void __init omap_init_clocksource(unsigned long rate) { - set_cyc2ns_scale(MPU_TICKS_PER_SEC / 1000); - omap_timer.offset = omap_mpu_timer_gettimeoffset; - setup_irq(INT_TIMER1, &omap_mpu_timer1_irq); - setup_irq(INT_TIMER2, &omap_mpu_timer_irq); - omap_mpu_timer_start(0, 0xffffffff); - omap_mpu_timer_start(1, MPU_TIMER_TICK_PERIOD); + static char err[] __initdata = KERN_ERR + "%s: can't register clocksource!\n"; + + clocksource_mpu.mult + = clocksource_khz2mult(rate/1000, clocksource_mpu.shift); + + setup_irq(INT_TIMER2, &omap_mpu_timer2_irq); + omap_mpu_timer_start(1, ~0, 1); + + if (clocksource_register(&clocksource_mpu)) + printk(err, clocksource_mpu.name); } + /* * Scheduler clock - returns current time in nanosec units. 
*/ unsigned long long sched_clock(void) { - unsigned long ticks = 0 - omap_mpu_timer_read(0); + unsigned long ticks = 0 - omap_mpu_timer_read(1); unsigned long long ticks64; - ticks64 = omap_mpu_timer1_overflows; + ticks64 = omap_mpu_timer2_overflows; ticks64 <<= 32; ticks64 |= ticks; @@ -225,10 +274,21 @@ unsigned long long sched_clock(void) */ static void __init omap_timer_init(void) { - omap_init_mpu_timer(); + struct clk *ck_ref = clk_get(NULL, "ck_ref"); + unsigned long rate; + + BUG_ON(IS_ERR(ck_ref)); + + rate = clk_get_rate(ck_ref); + clk_put(ck_ref); + + /* PTV = 0 */ + rate /= 2; + + omap_init_mpu_timer(rate); + omap_init_clocksource(rate); } struct sys_timer omap_timer = { .init = omap_timer_init, - .offset = NULL, /* Initialized later */ }; Index: linux/arch/arm/mach-omap2/pm.c =================================================================== --- linux.orig/arch/arm/mach-omap2/pm.c +++ linux/arch/arm/mach-omap2/pm.c @@ -53,7 +53,7 @@ void omap2_pm_idle(void) { local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; Index: linux/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux.orig/arch/arm/mach-sa1100/badge4.c +++ linux/arch/arm/mach-sa1100/badge4.c @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); Index: linux/arch/arm/mach-sa1100/generic.c =================================================================== --- linux.orig/arch/arm/mach-sa1100/generic.c +++ linux/arch/arm/mach-sa1100/generic.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "generic.h" @@ -153,7 +154,7 @@ int gpio_direction_input(unsigned gpio) EXPORT_SYMBOL(gpio_direction_input); -int gpio_direction_output(unsigned gpio) +int gpio_direction_output(unsigned gpio, int value) { unsigned long flags; @@ -161,6 +162,7 @@ int gpio_direction_output(unsigned gpio) return -EINVAL; local_irq_save(flags); + gpio_set_value(gpio, value); GPDR |= GPIO_GPIO(gpio); local_irq_restore(flags); return 0; Index: linux/arch/arm/mach-shark/leds.c =================================================================== --- linux.orig/arch/arm/mach-shark/leds.c +++ linux/arch/arm/mach-shark/leds.c @@ -32,7 +32,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); Index: linux/arch/arm/mm/consistent.c =================================================================== --- linux.orig/arch/arm/mm/consistent.c +++ linux/arch/arm/mm/consistent.c @@ -40,7 +40,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent 
allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux.orig/arch/arm/mm/copypage-v4mc.c +++ linux/arch/arm/mm/copypage-v4mc.c @@ -30,7 +30,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -44,7 +44,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -88,7 +88,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/copypage-v6.c =================================================================== --- linux.orig/arch/arm/mm/copypage-v6.c +++ linux/arch/arm/mm/copypage-v6.c @@ -26,7 +26,7 @@ #define from_address (0xffff8000) #define to_address (0xffffc000) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just Index: linux/arch/arm/mm/copypage-xscale.c =================================================================== --- linux.orig/arch/arm/mm/copypage-xscale.c +++ linux/arch/arm/mm/copypage-xscale.c @@ -32,7 +32,7 @@ #define minicache_pgprot __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | \ L_PTE_CACHEABLE) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -42,7 +42,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -110,7 +110,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/fault.c =================================================================== --- linux.orig/arch/arm/mm/fault.c +++ linux/arch/arm/mm/fault.c @@ -216,7 +216,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -230,7 +230,7 @@ do_page_fault(unsigned long addr, unsign * If we're in an interrupt or have no user * context, we must not take the fault.. */ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto no_context; /* @@ -316,7 +316,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -359,7 +359,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. 
*/ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { do_bad_area(addr, fsr, regs); @@ -369,7 +369,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -424,7 +424,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -438,7 +438,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. */ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -457,7 +457,7 @@ do_DataAbort(unsigned long addr, unsigne notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux/arch/arm/mm/mmu.c =================================================================== --- linux.orig/arch/arm/mm/mmu.c +++ linux/arch/arm/mm/mmu.c @@ -25,7 +25,7 @@ #include "mm.h" -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern void _stext, _etext, __data_start, _end; extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; Index: linux/arch/arm/oprofile/op_model_xscale.c =================================================================== --- linux.orig/arch/arm/oprofile/op_model_xscale.c +++ linux/arch/arm/oprofile/op_model_xscale.c @@ -381,8 +381,9 @@ static int xscale_pmu_start(void) { int ret; u32 pmnc = read_pmnc(); + int irq_flags = IRQF_DISABLED | IRQF_NODELAY; - ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED, + ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags, "XScale PMU", (void *)results); if (ret < 0) { Index: linux/arch/arm/plat-omap/Kconfig =================================================================== --- linux.orig/arch/arm/plat-omap/Kconfig +++ linux/arch/arm/plat-omap/Kconfig @@ -11,6 +11,7 @@ choice config ARCH_OMAP1 bool "TI OMAP1" + select GENERIC_CLOCKEVENTS config ARCH_OMAP2 bool "TI OMAP2" Index: linux/arch/arm/plat-omap/clock.c =================================================================== --- linux.orig/arch/arm/plat-omap/clock.c +++ linux/arch/arm/plat-omap/clock.c @@ -29,7 +29,7 @@ static LIST_HEAD(clocks); static DEFINE_MUTEX(clocks_mutex); -static DEFINE_SPINLOCK(clockfw_lock); +static DEFINE_RAW_SPINLOCK(clockfw_lock); static struct clk_functions *arch_clock; Index: linux/arch/arm/plat-omap/common.c =================================================================== --- linux.orig/arch/arm/plat-omap/common.c +++ linux/arch/arm/plat-omap/common.c @@ -156,3 +156,53 @@ static int __init omap_add_serial_consol return add_preferred_console("ttyS", line, opt); } console_initcall(omap_add_serial_console); + + +/* + * 32KHz clocksource ... always available, on pretty much all chips except + * OMAP 730 and 1510. Other timers could be used as clocksources, with + * higher resolution in free-running counter modes (e.g. 12 MHz xtal), + * but systems won't necessarily want to spend resources that way.
+ */ + +#if defined(CONFIG_ARCH_OMAP16XX) +#define TIMER_32K_SYNCHRONIZED 0xfffbc410 +#elif defined(CONFIG_ARCH_OMAP24XX) +#define TIMER_32K_SYNCHRONIZED 0x48004010 +#endif + +#ifdef TIMER_32K_SYNCHRONIZED + +#include + +static cycle_t omap_32k_read(void) +{ + return omap_readl(TIMER_32K_SYNCHRONIZED); +} + +static struct clocksource clocksource_32k = { + .name = "32k_counter", + .rating = 250, + .read = omap_32k_read, + .mask = CLOCKSOURCE_MASK(32), + .shift = 10, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init omap_init_clocksource_32k(void) +{ + static char err[] __initdata = KERN_ERR + "%s: can't register clocksource!\n"; + + if (cpu_is_omap16xx() || cpu_is_omap24xx()) { + clocksource_32k.mult = clocksource_hz2mult(32768, + clocksource_32k.shift); + + if (clocksource_register(&clocksource_32k)) + printk(err, clocksource_32k.name); + } + return 0; +} +arch_initcall(omap_init_clocksource_32k); + +#endif /* TIMER_32K_SYNCHRONIZED */ Index: linux/arch/arm/plat-omap/dma.c =================================================================== --- linux.orig/arch/arm/plat-omap/dma.c +++ linux/arch/arm/plat-omap/dma.c @@ -982,7 +982,7 @@ static struct irqaction omap24xx_dma_irq /*----------------------------------------------------------------------------*/ static struct lcd_dma_info { - spinlock_t lock; + raw_spinlock_t lock; int reserved; void (* callback)(u16 status, void *data); void *cb_data; Index: linux/arch/arm/plat-omap/gpio.c =================================================================== --- linux.orig/arch/arm/plat-omap/gpio.c +++ linux/arch/arm/plat-omap/gpio.c @@ -120,7 +120,7 @@ struct gpio_bank { u32 reserved_map; u32 suspend_wakeup; u32 saved_wakeup; - spinlock_t lock; + raw_spinlock_t lock; }; #define METHOD_MPUIO 0 Index: linux/arch/arm/plat-omap/mux.c =================================================================== --- linux.orig/arch/arm/plat-omap/mux.c +++ linux/arch/arm/plat-omap/mux.c @@ -56,7 +56,7 @@ int __init omap_mux_register(struct pin_ */ int __init_or_module omap_cfg_reg(const unsigned long index) { - static DEFINE_SPINLOCK(mux_spin_lock); + static DEFINE_RAW_SPINLOCK(mux_spin_lock); unsigned long flags; struct pin_config *cfg; Index: linux/arch/arm/plat-omap/timer32k.c =================================================================== --- linux.orig/arch/arm/plat-omap/timer32k.c +++ linux/arch/arm/plat-omap/timer32k.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include #include @@ -80,13 +82,13 @@ struct sys_timer omap_timer; #define OMAP1_32K_TIMER_TVR 0x00 #define OMAP1_32K_TIMER_TCR 0x04 -#define OMAP_32K_TICKS_PER_HZ (32768 / HZ) +#define OMAP_32K_TICKS_PER_SEC (32768) /* * TRM says 1 / HZ = ( TVR + 1) / 32768, so TRV = (32768 / HZ) - 1 * so with HZ = 128, TVR = 255. 
*/ -#define OMAP_32K_TIMER_TICK_PERIOD ((32768 / HZ) - 1) +#define OMAP_32K_TIMER_TICK_PERIOD ((OMAP_32K_TICKS_PER_SEC / HZ) - 1) #define JIFFIES_TO_HW_TICKS(nr_jiffies, clock_rate) \ (((nr_jiffies) * (clock_rate)) / HZ) @@ -142,6 +144,28 @@ static inline void omap_32k_timer_ack_ir #endif +static void omap_32k_timer_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + switch (mode) { + case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_MODE_PERIODIC: + omap_32k_timer_start(OMAP_32K_TIMER_TICK_PERIOD); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + omap_32k_timer_stop(); + break; + } +} + +static struct clock_event_device clockevent_32k_timer = { + .name = "32k-timer", + .features = CLOCK_EVT_FEAT_PERIODIC, + .shift = 32, + .set_mode = omap_32k_timer_set_mode, +}; + /* * The 32KHz synchronized timer is an additional timer on 16xx. * It is always running. @@ -171,15 +195,6 @@ omap_32k_ticks_to_nsecs(unsigned long ti static unsigned long omap_32k_last_tick = 0; /* - * Returns elapsed usecs since last 32k timer interrupt - */ -static unsigned long omap_32k_timer_gettimeoffset(void) -{ - unsigned long now = omap_32k_sync_timer_read(); - return omap_32k_ticks_to_usecs(now - omap_32k_last_tick); -} - -/* * Returns current time from boot in nsecs. It's OK for this to wrap * around for now, as it's just a relative time stamp. */ @@ -188,95 +203,16 @@ unsigned long long sched_clock(void) return omap_32k_ticks_to_nsecs(omap_32k_sync_timer_read()); } -/* - * Timer interrupt for 32KHz timer. When dynamic tick is enabled, this - * function is also called from other interrupts to remove latency - * issues with dynamic tick. In the dynamic tick case, we need to lock - * with irqsave. - */ -static inline irqreturn_t _omap_32k_timer_interrupt(int irq, void *dev_id) -{ - unsigned long now; - - omap_32k_timer_ack_irq(); - now = omap_32k_sync_timer_read(); - - while ((signed long)(now - omap_32k_last_tick) - >= OMAP_32K_TICKS_PER_HZ) { - omap_32k_last_tick += OMAP_32K_TICKS_PER_HZ; - timer_tick(); - } - - /* Restart timer so we don't drift off due to modulo or dynamic tick. - * By default we program the next timer to be continuous to avoid - * latencies during high system load. During dynamic tick operation the - * continuous timer can be overridden from pm_idle to be longer. - */ - omap_32k_timer_start(omap_32k_last_tick + OMAP_32K_TICKS_PER_HZ - now); - - return IRQ_HANDLED; -} - -static irqreturn_t omap_32k_timer_handler(int irq, void *dev_id) -{ - return _omap_32k_timer_interrupt(irq, dev_id); -} - static irqreturn_t omap_32k_timer_interrupt(int irq, void *dev_id) { - unsigned long flags; + struct clock_event_device *evt = &clockevent_32k_timer; + omap_32k_timer_ack_irq(); - write_seqlock_irqsave(&xtime_lock, flags); - _omap_32k_timer_interrupt(irq, dev_id); - write_sequnlock_irqrestore(&xtime_lock, flags); + evt->event_handler(evt); return IRQ_HANDLED; } -#ifdef CONFIG_NO_IDLE_HZ -/* - * Programs the next timer interrupt needed. Called when dynamic tick is - * enabled, and to reprogram the ticks to skip from pm_idle. Note that - * we can keep the timer continuous, and don't need to set it to run in - * one-shot mode. This is because the timer will get reprogrammed again - * after next interrupt. 
- */ -void omap_32k_timer_reprogram(unsigned long next_tick) -{ - unsigned long ticks = JIFFIES_TO_HW_TICKS(next_tick, 32768) + 1; - unsigned long now = omap_32k_sync_timer_read(); - unsigned long idled = now - omap_32k_last_tick; - - if (idled + 1 < ticks) - ticks -= idled; - else - ticks = 1; - omap_32k_timer_start(ticks); -} - -static struct irqaction omap_32k_timer_irq; -extern struct timer_update_handler timer_update; - -static int omap_32k_timer_enable_dyn_tick(void) -{ - /* No need to reprogram timer, just use the next interrupt */ - return 0; -} - -static int omap_32k_timer_disable_dyn_tick(void) -{ - omap_32k_timer_start(OMAP_32K_TIMER_TICK_PERIOD); - return 0; -} - -static struct dyn_tick_timer omap_dyn_tick_timer = { - .enable = omap_32k_timer_enable_dyn_tick, - .disable = omap_32k_timer_disable_dyn_tick, - .reprogram = omap_32k_timer_reprogram, - .handler = omap_32k_timer_handler, -}; -#endif /* CONFIG_NO_IDLE_HZ */ - static struct irqaction omap_32k_timer_irq = { .name = "32KHz timer", .flags = IRQF_DISABLED | IRQF_TIMER, @@ -285,13 +221,8 @@ static struct irqaction omap_32k_timer_i static __init void omap_init_32k_timer(void) { -#ifdef CONFIG_NO_IDLE_HZ - omap_timer.dyn_tick = &omap_dyn_tick_timer; -#endif - if (cpu_class_is_omap1()) setup_irq(INT_OS_TIMER, &omap_32k_timer_irq); - omap_timer.offset = omap_32k_timer_gettimeoffset; omap_32k_last_tick = omap_32k_sync_timer_read(); #ifdef CONFIG_ARCH_OMAP2 @@ -308,7 +239,16 @@ static __init void omap_init_32k_timer(v } #endif - omap_32k_timer_start(OMAP_32K_TIMER_TICK_PERIOD); + clockevent_32k_timer.mult = div_sc(OMAP_32K_TICKS_PER_SEC, + NSEC_PER_SEC, + clockevent_32k_timer.shift); + clockevent_32k_timer.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_32k_timer); + clockevent_32k_timer.min_delta_ns = + clockevent_delta2ns(1, &clockevent_32k_timer); + + clockevent_32k_timer.cpumask = cpumask_of_cpu(0); + clockevents_register_device(&clockevent_32k_timer); } /* @@ -326,5 +266,4 @@ static void __init omap_timer_init(void) struct sys_timer omap_timer = { .init = omap_timer_init, - .offset = NULL, /* Initialized later */ }; Index: linux/arch/avr32/mach-at32ap/pio.c =================================================================== --- linux.orig/arch/avr32/mach-at32ap/pio.c +++ linux/arch/avr32/mach-at32ap/pio.c @@ -214,7 +214,7 @@ int gpio_direction_input(unsigned int gp } EXPORT_SYMBOL(gpio_direction_input); -int gpio_direction_output(unsigned int gpio) +int gpio_direction_output(unsigned int gpio, int value) { struct pio_device *pio; unsigned int pin; @@ -223,6 +223,8 @@ int gpio_direction_output(unsigned int g if (!pio) return -ENODEV; + gpio_set_value(gpio, value); + pin = gpio & 0x1f; pio_writel(pio, OER, 1 << pin); Index: linux/arch/i386/Kconfig =================================================================== --- linux.orig/arch/i386/Kconfig +++ linux/arch/i386/Kconfig @@ -305,6 +305,19 @@ config SCHED_MC source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER || X86_GENERICARCH) @@ -756,6 +769,14 @@ config BOOT_IOREMAP depends on (((X86_SUMMIT || X86_GENERICARCH) && NUMA) || (X86 && EFI)) default y +# +# function tracing might turn this off: +# +config REGPARM + bool + depends on !MCOUNT + default y + config SECCOMP bool 
"Enable seccomp to safely compute untrusted bytecode" depends on PROC_FS Index: linux/arch/i386/Kconfig.cpu =================================================================== --- linux.orig/arch/i386/Kconfig.cpu +++ linux/arch/i386/Kconfig.cpu @@ -238,11 +238,6 @@ config RWSEM_GENERIC_SPINLOCK depends on M386 default y -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config ARCH_HAS_ILOG2_U32 bool default n Index: linux/arch/i386/Kconfig.debug =================================================================== --- linux.orig/arch/i386/Kconfig.debug +++ linux/arch/i386/Kconfig.debug @@ -22,6 +22,7 @@ config EARLY_PRINTK config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -29,6 +30,7 @@ config DEBUG_STACKOVERFLOW config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. @@ -49,6 +51,7 @@ config DEBUG_PAGEALLOC config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL + default y help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const @@ -59,6 +62,7 @@ config DEBUG_RODATA config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on DEBUG_KERNEL + default y help If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. This facilitates Index: linux/arch/i386/Makefile =================================================================== --- linux.orig/arch/i386/Makefile +++ linux/arch/i386/Makefile @@ -31,7 +31,7 @@ LDFLAGS_vmlinux := --emit-relocs endif CHECKFLAGS += -D__i386__ -CFLAGS += -pipe -msoft-float -mregparm=3 -freg-struct-return +CFLAGS += -pipe -msoft-float # prevent gcc from keeping the stack 16 byte aligned CFLAGS += $(call cc-option,-mpreferred-stack-boundary=2) @@ -39,6 +39,8 @@ CFLAGS += $(call cc-option,-mpreferred-s # CPU-specific tuning. Anything which can be shared with UML should go here. include $(srctree)/arch/i386/Makefile.cpu +cflags-$(CONFIG_REGPARM) += -mregparm=3 -freg-struct-return + # temporary until string.h is fixed cflags-y += -ffreestanding Index: linux/arch/i386/boot/compressed/Makefile =================================================================== --- linux.orig/arch/i386/boot/compressed/Makefile +++ linux/arch/i386/boot/compressed/Makefile @@ -9,6 +9,7 @@ targets := vmlinux vmlinux.bin vmlinux. 
EXTRA_AFLAGS := -traditional LDFLAGS_vmlinux := -T +CFLAGS := -m32 -D__KERNEL__ -Iinclude -O2 -fno-strict-aliasing CFLAGS_misc.o += -fPIC hostprogs-y := relocs Index: linux/arch/i386/boot/compressed/misc.c =================================================================== --- linux.orig/arch/i386/boot/compressed/misc.c +++ linux/arch/i386/boot/compressed/misc.c @@ -189,7 +189,7 @@ static void putstr(const char *); static unsigned long free_mem_ptr; static unsigned long free_mem_end_ptr; -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static char *vidmem = (char *)0xb8000; static int vidport; Index: linux/arch/i386/defconfig =================================================================== --- linux.orig/arch/i386/defconfig +++ linux/arch/i386/defconfig @@ -1,10 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-git8 -# Tue Feb 13 11:25:18 2007 +# Linux kernel version: 2.6.21-rc3 +# Wed Mar 7 15:29:47 2007 # CONFIG_X86_32=y CONFIG_GENERIC_TIME=y +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y CONFIG_SEMAPHORE_SLEEPERS=y @@ -34,6 +37,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y # CONFIG_IPC_NS is not set +CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set @@ -44,6 +48,7 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set +CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y @@ -103,6 +108,9 @@ CONFIG_DEFAULT_IOSCHED="anticipatory" # # Processor type and features # +# CONFIG_TICK_ONESHOT is not set +# CONFIG_NO_HZ is not set +# CONFIG_HIGH_RES_TIMERS is not set CONFIG_SMP=y # CONFIG_X86_PC is not set # CONFIG_X86_ELAN is not set @@ -235,10 +243,8 @@ CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set -# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_THERMAL=y # CONFIG_ACPI_ASUS is not set @@ -289,6 +295,7 @@ CONFIG_X86_POWERNOW_K8_ACPI=y # CONFIG_X86_CPUFREQ_NFORCE2 is not set # CONFIG_X86_LONGRUN is not set # CONFIG_X86_LONGHAUL is not set +# CONFIG_X86_E_POWERSAVER is not set # # shared options @@ -368,7 +375,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set -# CONFIG_INET_TUNNEL is not set +CONFIG_INET_TUNNEL=y CONFIG_INET_XFRM_MODE_TRANSPORT=y CONFIG_INET_XFRM_MODE_TUNNEL=y # CONFIG_INET_XFRM_MODE_BEET is not set @@ -470,7 +477,13 @@ CONFIG_FW_LOADER=y # # Plug and Play support # -# CONFIG_PNP is not set +CONFIG_PNP=y +# CONFIG_PNP_DEBUG is not set + +# +# Protocols +# +CONFIG_PNPACPI=y # # Block devices @@ -490,7 +503,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 -CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -500,6 +512,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_IBM_ASM is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set +# CONFIG_SONY_LAPTOP is not set # # ATA/ATAPI/MFM/RLL support @@ -526,6 +539,7 @@ CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_GENERIC=y # CONFIG_BLK_DEV_CMD640 is not set +# CONFIG_BLK_DEV_IDEPNP is not set CONFIG_BLK_DEV_IDEPCI=y # CONFIG_IDEPCI_SHARE_IRQ is not set # CONFIG_BLK_DEV_OFFBOARD is not set @@ -679,6 +693,7 @@ 
CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set # CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y +CONFIG_SATA_ACPI=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -786,6 +801,7 @@ CONFIG_NETDEVICES=y # CONFIG_BONDING is not set # CONFIG_EQUALIZER is not set # CONFIG_TUN is not set +# CONFIG_NET_SB1000 is not set # # ARCnet devices @@ -979,6 +995,7 @@ CONFIG_HW_CONSOLE=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -1065,6 +1082,11 @@ CONFIG_HANGCHECK_TIMER=y # CONFIG_HWMON_VID is not set # +# Multifunction device drivers +# +# CONFIG_MFD_SM501 is not set + +# # Multimedia devices # # CONFIG_VIDEO_DEV is not set @@ -1078,7 +1100,7 @@ CONFIG_HANGCHECK_TIMER=y # # Graphics support # -CONFIG_FIRMWARE_EDID=y +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # CONFIG_FB is not set # @@ -1089,7 +1111,6 @@ CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=128 CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y -# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Sound @@ -1238,6 +1259,7 @@ CONFIG_USB_MON=y # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set +# CONFIG_USB_BERRY_CHARGE is not set # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set @@ -1248,6 +1270,7 @@ CONFIG_USB_MON=y # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set # CONFIG_USB_TRANCEVIBRATOR is not set +# CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set # @@ -1506,6 +1529,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set +# CONFIG_TIMER_STATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set @@ -1522,9 +1546,12 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set +# CONFIG_FAULT_INJECTION is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_DEBUG_STACK_USAGE is not set Index: linux/arch/i386/kernel/Makefile =================================================================== --- linux.orig/arch/i386/kernel/Makefile +++ linux/arch/i386/kernel/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o tsc_sync.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o @@ -33,6 +34,7 @@ obj-$(CONFIG_MODULES) += module.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_EFI) += efi.o efi_stub.o +obj-$(CONFIG_PARAVIRT) += hyper_clock.o obj-$(CONFIG_DOUBLEFAULT) += doublefault.o obj-$(CONFIG_VM86) += vm86.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o Index: linux/arch/i386/kernel/acpi/boot.c =================================================================== --- linux.orig/arch/i386/kernel/acpi/boot.c +++ linux/arch/i386/kernel/acpi/boot.c @@ -874,7 +874,7 @@ static void __init acpi_process_madt(voi acpi_ioapic = 1; smp_found_config = 1; - clustered_apic_check(); + setup_apic_routing(); } } if (error == 
-EINVAL) { Index: linux/arch/i386/kernel/acpi/earlyquirk.c =================================================================== --- linux.orig/arch/i386/kernel/acpi/earlyquirk.c +++ linux/arch/i386/kernel/acpi/earlyquirk.c @@ -10,7 +10,6 @@ #include #include #include -#include #ifdef CONFIG_ACPI @@ -45,24 +44,6 @@ static int __init check_bridge(int vendo return 0; } -static void check_intel(void) -{ - u16 vendor, device; - - vendor = read_pci_config_16(0, 0, 0, PCI_VENDOR_ID); - - if (vendor != PCI_VENDOR_ID_INTEL) - return; - - device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - void __init check_acpi_pci(void) { int num, slot, func; @@ -74,8 +55,6 @@ void __init check_acpi_pci(void) if (!early_pci_allowed()) return; - check_intel(); - /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) { for (slot = 0; slot < 32; slot++) { Index: linux/arch/i386/kernel/alternative.c =================================================================== --- linux.orig/arch/i386/kernel/alternative.c +++ linux/arch/i386/kernel/alternative.c @@ -350,6 +350,8 @@ void alternatives_smp_switch(int smp) #endif #ifdef CONFIG_PARAVIRT +// hack +#ifndef CONFIG_X86_64 void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end) { struct paravirt_patch *p; @@ -381,6 +383,7 @@ void apply_paravirt(struct paravirt_patc } extern struct paravirt_patch __start_parainstructions[], __stop_parainstructions[]; +#endif /* hack */ #endif /* CONFIG_PARAVIRT */ void __init alternative_instructions(void) @@ -430,6 +433,9 @@ void __init alternative_instructions(voi alternatives_smp_switch(0); } #endif +// hack +#ifndef CONFIG_X86_64 apply_paravirt(__start_parainstructions, __stop_parainstructions); +#endif local_irq_restore(flags); } Index: linux/arch/i386/kernel/apic.c =================================================================== --- linux.orig/arch/i386/kernel/apic.c +++ linux/arch/i386/kernel/apic.c @@ -28,6 +28,8 @@ #include #include #include +#include <linux/dmi.h> +#include #include #include @@ -61,6 +63,8 @@ static int enable_local_apic __initdata /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; +/* Disable local APIC timer from the kernel command line or via dmi quirk */ +static int local_apic_timer_disabled; /* * Debug level, exported for io_apic.c @@ -266,6 +270,32 @@ static void __devinit setup_APIC_timer(v } /* + * Detect systems with known broken BIOS implementations + */ +static int __init lapic_check_broken_bios(struct dmi_system_id *d) +{ + printk(KERN_NOTICE "%s detected: disabling lapic timer.\n", + d->ident); + local_apic_timer_disabled = 1; + return 0; +} + +static struct dmi_system_id __initdata broken_bios_dmi_table[] = { + { + /* + * BIOS exports only C1 state, but uses deeper power + * modes behind the kernel's back. + */ + .callback = lapic_check_broken_bios, + .ident = "HP nx6325", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), + }, + }, + {} +}; + +/* + * In this function we calibrate APIC bus clocks to the external timer.
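 * (The cross-check below uses the ACPI PM timer, which ticks at 3.579545 MHz, * so an accurate 100 ms calibration window corresponds to pm_100ms ~ 357954 * PM timer ticks, accepted within a pm_thresh tolerance.)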
* * We want to do the calibration only once since we want to have local timer @@ -338,6 +368,23 @@ void __init setup_boot_APIC_clock(void) void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; long delta, deltapm; + int pm_referenced = 0; + + /* Detect known broken systems */ + dmi_check_system(broken_bios_dmi_table); + + /* + * The local apic timer can be disabled via the kernel + * command line or from the dmi quirk above. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" "calibrating APIC timer ...\n"); @@ -357,7 +404,8 @@ void __init setup_boot_APIC_clock(void) /* Let the interrupts run */ local_irq_enable(); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); local_irq_disable(); @@ -394,6 +442,7 @@ void __init setup_boot_APIC_clock(void) "%lu (%ld)\n", (unsigned long) res, delta); delta = (long) res; } + pm_referenced = 1; } /* Calculate the scaled math multiplication factor */ @@ -423,68 +472,41 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - - /* - * Setup the apic timer manually - */ local_apic_timer_verify_ok = 1; - levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); - lapic_cal_loops = -1; - /* Let the interrupts run */ - local_irq_enable(); + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; - local_irq_disable(); + /* Let the interrupts run */ + local_irq_enable(); - /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + while(lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); - local_irq_enable(); + local_irq_disable(); - /* Jiffies delta */ - deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + local_irq_enable(); - local_apic_timer_verify_ok = 0; + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); - if (deltapm) { - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - /* Check, if the jiffies result is consistent */ - if (deltaj < LAPIC_CAL_LOOPS-2 || - deltaj > LAPIC_CAL_LOOPS+2) { - /* - * Not sure, what we can do about this one. - * When high resultion timers are active - * and the lapic timer does not stop in C3 - * we are fine. Otherwise more trouble might - * be waiting.
-- tglx - */ - printk(KERN_WARNING "Global event device %s " - "has wrong frequency " - "(%lu ticks instead of %d)\n", - global_clock_event->name, deltaj, - LAPIC_CAL_LOOPS); - } - local_apic_timer_verify_ok = 1; - } - } else { /* Check, if the jiffies result is consistent */ - if (deltaj >= LAPIC_CAL_LOOPS-2 && - deltaj <= LAPIC_CAL_LOOPS+2) { + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); - local_apic_timer_verify_ok = 1; - } + else + local_apic_timer_verify_ok = 0; } if (!local_apic_timer_verify_ok) { @@ -553,10 +575,12 @@ static void local_apic_timer_interrupt(v * interrupt as well. Thus we cannot inline the local irq ... ] */ -void fastcall smp_apic_timer_interrupt(struct pt_regs *regs) +void fastcall notrace smp_apic_timer_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + trace_special(regs->eip, 1, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1203,6 +1227,13 @@ static int __init parse_nolapic(char *ar } early_param("nolapic", parse_nolapic); +static int __init parse_disable_lapic_timer(char *arg) +{ + local_apic_timer_disabled = 1; + return 0; +} +early_param("nolapic_timer", parse_disable_lapic_timer); + static int __init apic_set_verbosity(char *str) { if (strcmp("debug", str) == 0) @@ -1269,6 +1300,7 @@ void smp_error_interrupt(struct pt_regs */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux/arch/i386/kernel/apm.c =================================================================== --- linux.orig/arch/i386/kernel/apm.c +++ linux/arch/i386/kernel/apm.c @@ -227,6 +227,7 @@ #include #include #include +#include #include #include @@ -791,7 +792,7 @@ static int apm_do_idle(void) */ smp_mb(); } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { idled = 1; ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax); } Index: linux/arch/i386/kernel/cpu/intel.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/intel.c +++ linux/arch/i386/kernel/cpu/intel.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include Index: linux/arch/i386/kernel/cpu/mcheck/p4.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mcheck/p4.c +++ linux/arch/i386/kernel/cpu/mcheck/p4.c @@ -155,7 +155,7 @@ static fastcall void intel_machine_check u32 alow, ahigh, high, low; u32 mcgstl, mcgsth; int i; - struct intel_mce_extended_msrs dbg; + struct intel_mce_extended_msrs dbg = { 0, } /* shut up gcc */; rdmsr (MSR_IA32_MCG_STATUS, mcgstl, mcgsth); if (mcgstl & (1<<0)) /* Recoverable ? 
*/ Index: linux/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux/arch/i386/kernel/cpu/mtrr/generic.c @@ -288,7 +288,7 @@ static unsigned long set_mtrr_state(void static unsigned long cr4 = 0; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they Index: linux/arch/i386/kernel/crash.c =================================================================== --- linux.orig/arch/i386/kernel/crash.c +++ linux/arch/i386/kernel/crash.c @@ -70,14 +70,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - cpumask_t mask = cpu_online_map; - cpu_clear(safe_smp_processor_id(), mask); - if (!cpus_empty(mask)) - send_IPI_mask(mask, NMI_VECTOR); -} - static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, }; Index: linux/arch/i386/kernel/efi.c =================================================================== --- linux.orig/arch/i386/kernel/efi.c +++ linux/arch/i386/kernel/efi.c @@ -278,7 +278,7 @@ void efi_memmap_walk(efi_freemem_callbac struct range { unsigned long start; unsigned long end; - } prev, curr; + } prev = { } /* shut up gcc */ , curr = { } /* shut up gcc */ ; efi_memory_desc_t *md; unsigned long start, end; void *p; Index: linux/arch/i386/kernel/entry.S =================================================================== --- linux.orig/arch/i386/kernel/entry.S +++ linux/arch/i386/kernel/entry.S @@ -264,14 +264,18 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,PT_EFLAGS(%esp) # interrupts off (exception path) ? 
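+# kernel_preemption (tested above) is the -rt flag for runtime-switchable +# kernel preemption; when every check falls through, preempt_schedule_irq +# below runs with interrupts disabled and we loop back to need_resched.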
- jz restore_all + jz restore_nocheck + DISABLE_INTERRUPTS(CLBR_ANY) + call preempt_schedule_irq jmp need_resched END(resume_kernel) @@ -333,6 +337,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -347,6 +356,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_EVENT_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -370,6 +384,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_EVENT_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) testl $TF_MASK,PT_EFLAGS(%esp) jz no_singlestep @@ -469,19 +488,19 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt + DISABLE_INTERRUPTS(CLBR_ANY) + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and @@ -1021,6 +1040,38 @@ ENTRY(spurious_interrupt_bug) CFI_ENDPROC END(spurious_interrupt_bug) +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movl 4(%esp), %edx + movl (%esp), %ecx + leal 4(%esp), %eax + movl %ebx, PT_EBX(%edx) + xorl %ebx, %ebx + movl %ebx, PT_ECX(%edx) + movl %ebx, PT_EDX(%edx) + movl %esi, PT_ESI(%edx) + movl %edi, PT_EDI(%edx) + movl %ebp, PT_EBP(%edx) + movl %ebx, PT_EAX(%edx) + movl $__USER_DS, PT_DS(%edx) + movl $__USER_DS, PT_ES(%edx) + movl $0, PT_FS(%edx) + movl %ebx, PT_ORIG_EAX(%edx) + movl %ecx, PT_EIP(%edx) + movl 12(%esp), %ecx + movl $__KERNEL_CS, PT_CS(%edx) + movl %ebx, PT_EFLAGS(%edx) + movl %eax, PT_OLDESP(%edx) + movl 8(%esp), %eax + movl %ecx, 8(%esp) + movl PT_EBX(%edx), %ebx + movl $__KERNEL_DS, PT_OLDSS(%edx) + jmpl *%eax + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif + ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder CFI_STARTPROC Index: linux/arch/i386/kernel/head.S =================================================================== --- linux.orig/arch/i386/kernel/head.S +++ linux/arch/i386/kernel/head.S @@ -494,6 +494,7 @@ ignore_int: call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx Index: linux/arch/i386/kernel/hpet.c =================================================================== --- linux.orig/arch/i386/kernel/hpet.c +++ linux/arch/i386/kernel/hpet.c @@ -203,7 +203,7 @@ static int hpet_next_event(unsigned long /* * Clock source related code */ -static cycle_t read_hpet(void) +static cycle_t notrace read_hpet(void) { return (cycle_t)hpet_readl(HPET_COUNTER); } @@ -480,9 +480,11 @@ static void 
hpet_rtc_timer_reinit(void) if (lost_ints) { if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; +#if 0 if (printk_ratelimit()) printk(KERN_WARNING "rtc: lost %d interrupts\n", lost_ints); +#endif } } Index: linux/arch/i386/kernel/hyper_clock.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/hyper_clock.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "mach_timer.h" + +#include +#include +#include + +static cycle_t read_hyper(void) +{ + struct timespec now; + int ret; + + ret = hypercall(1, __NR_hypercall_get_ktime, __pa(&now)); + WARN_ON(ret); + + return now.tv_nsec + now.tv_sec * (cycles_t)1e9; +} + +static struct clocksource clocksource_hyper = { + .name = "hyper", + .rating = 200, + .read = read_hyper, + .mask = CLOCKSOURCE_MASK(64), + .mult = 1, + .shift = 0, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static int __init init_hyper_clocksource(void) +{ + if (!paravirt_enabled()) + return 0; + + return clocksource_register(&clocksource_hyper); +} + +module_init(init_hyper_clocksource); Index: linux/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux.orig/arch/i386/kernel/i386_ksyms.c +++ linux/arch/i386/kernel/i386_ksyms.c @@ -2,10 +2,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -20,7 +22,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); @@ -28,3 +30,5 @@ EXPORT_SYMBOL(__read_lock_failed); #endif EXPORT_SYMBOL(csum_partial); + +EXPORT_SYMBOL(_proxy_pda); Index: linux/arch/i386/kernel/i8253.c =================================================================== --- linux.orig/arch/i386/kernel/i8253.c +++ linux/arch/i386/kernel/i8253.c @@ -16,7 +16,7 @@ #include "io_ports.h" -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* Index: linux/arch/i386/kernel/i8259.c =================================================================== --- linux.orig/arch/i386/kernel/i8259.c +++ linux/arch/i386/kernel/i8259.c @@ -35,7 +35,7 @@ */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); static struct irq_chip i8259A_chip = { @@ -171,6 +171,8 @@ static void mask_and_ack_8259A(unsigned */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; + if (irq & 8) + outb(0x60+(irq&7),PIC_SLAVE_CMD); /* 'Specific EOI' to slave */ cached_irq_mask |= irqmask; handle_real_irq: @@ -298,10 +300,10 @@ void init_8259A(int auto_eoi) outb_p(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ outb_p(0x20 + 0, PIC_MASTER_IMR); /* ICW2: 8259A-1 IR0-7 mapped to 0x20-0x27 */ outb_p(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ - if (auto_eoi) /* master does Auto EOI */ - outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); - else /* master expects normal EOI */ + if (!auto_eoi) /* master expects normal EOI */ outb_p(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); + else /* master does Auto EOI */ + outb_p(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); outb_p(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ outb_p(0x20 + 8, PIC_SLAVE_IMR); /* ICW2: 8259A-2 IR0-7 mapped to 0x28-0x2f */ @@ -351,7 +353,7 @@ static irqreturn_t math_error_irq(int cp * New motherboards sometimes make IRQ 13 be a PCI interrupt, * so allow interrupt sharing. 
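 * Under PREEMPT_RT the -rt patch additionally marks this handler IRQF_NODELAY * below, keeping it in hard interrupt context rather than running it from a * threaded interrupt handler.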
*/ -static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL }; +static struct irqaction fpu_irq = { math_error_irq, IRQF_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL }; void __init init_ISA_irqs (void) { Index: linux/arch/i386/kernel/io_apic.c =================================================================== --- linux.orig/arch/i386/kernel/io_apic.c +++ linux/arch/i386/kernel/io_apic.c @@ -56,8 +56,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -261,18 +261,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -1281,9 +1269,10 @@ static void ioapic_register_intr(int irq trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } set_intr_gate(vector, interrupt[irq]); } @@ -1548,7 +1537,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1952,7 +1941,7 @@ int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -2041,8 +2030,10 @@ static void ack_ioapic_quirk_irq(unsigne if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } Index: linux/arch/i386/kernel/irq.c =================================================================== --- linux.orig/arch/i386/kernel/irq.c +++ linux/arch/i386/kernel/irq.c @@ -65,7 +65,7 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers). */ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ @@ -76,6 +76,10 @@ fastcall unsigned int do_IRQ(struct pt_r u32 *isp; #endif +#ifdef CONFIG_X86_LOCAL_APIC + irq_show_regs_callback(smp_processor_id(), regs); +#endif + if (unlikely((unsigned)irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __FUNCTION__, irq); @@ -84,6 +88,11 @@ fastcall unsigned int do_IRQ(struct pt_r old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? 
*/ { @@ -92,7 +101,7 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } @@ -293,6 +302,13 @@ skip: per_cpu(irq_stat,j).apic_timer_irqs); seq_putc(p, '\n'); #endif +#ifdef CONFIG_PARAVIRT + seq_printf(p, "HYP: "); + for_each_online_cpu(j) + seq_printf(p, "%10lu ", + per_cpu(irq_stat,j).hypercalls); + seq_putc(p, '\n'); +#endif seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); Index: linux/arch/i386/kernel/kprobes.c =================================================================== --- linux.orig/arch/i386/kernel/kprobes.c +++ linux/arch/i386/kernel/kprobes.c @@ -338,7 +338,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -347,7 +347,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -579,7 +579,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -613,7 +613,7 @@ static int __kprobes kprobe_fault_handle restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -747,7 +747,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux/arch/i386/kernel/microcode.c =================================================================== --- linux.orig/arch/i386/kernel/microcode.c +++ linux/arch/i386/kernel/microcode.c @@ -116,7 +116,7 @@ MODULE_LICENSE("GPL"); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); Index: linux/arch/i386/kernel/mpparse.c =================================================================== --- linux.orig/arch/i386/kernel/mpparse.c +++ linux/arch/i386/kernel/mpparse.c @@ -477,7 +477,7 @@ static int __init smp_read_mpc(struct mp } ++mpc_record; } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "SMP mptable: no processors registered!\n"); return num_processors; Index: linux/arch/i386/kernel/nmi.c 
=================================================================== --- linux.orig/arch/i386/kernel/nmi.c +++ linux/arch/i386/kernel/nmi.c @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -60,7 +61,7 @@ static cpumask_t backtrace_mask = CPU_MA atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ unsigned int nmi_watchdog = NMI_DEFAULT; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; struct nmi_watchdog_ctlblk { int enabled; @@ -206,7 +207,12 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { + /* + * avoid a warning, on PREEMPT_RT this won't run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -263,7 +269,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + mdelay((100*1000)/nmi_hz); // wait 100 ticks for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -296,13 +302,13 @@ static int __init check_nmi_watchdog(voi if (nmi_watchdog == NMI_LOCAL_APIC) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_hz = 1; if (wd->perfctr_msr == MSR_P6_PERFCTR0 || wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) { nmi_hz = adjust_for_32bit_ctr(nmi_hz); } } + nmi_hz = 10000; kfree(prev_nmi_count); return 0; @@ -934,9 +940,49 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -__kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (system_state == SYSTEM_BOOTING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + printk("nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + printk("apic_timer_irqs: %d\n", per_cpu(irq_stat, cpu).apic_timer_irqs); + show_regs(regs); + spin_unlock(&nmi_print_lock); +} + +notrace __kprobes int nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -949,6 +995,8 @@ __kprobes int nmi_watchdog_tick(struct p u64 dummy; int rc=0; + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -972,6 +1020,9 @@ __kprobes int nmi_watchdog_tick(struct p */ sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + irq_show_regs_callback(cpu, regs); + + /* if the apic timer isn't firing, this cpu isn't doing much */ /* if the none of the timers isn't firing, this cpu isn't doing much */ if (!touched && last_irq_sums[cpu] == sum) { /* @@ -979,11 +1030,30 @@ __kprobes int nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ...
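 * (The -rt variant below does not die_nmi() immediately: every 5*nmi_hz ticks * it dumps the stuck CPU's registers, makes every other CPU dump theirs via * nmi_show_regs[], resets the alert counters and lets the watchdog run on.)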
*/ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + +// bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", + cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + printk("letting NMI watchdog run again ...\n"); + for_each_online_cpu(i) + alert_counter[i] = 0; + +// die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; @@ -1114,6 +1184,16 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + cpumask_t mask = cpu_online_map; + cpu_clear(safe_smp_processor_id(), mask); + if (!cpus_empty(mask)) + send_IPI_mask(mask, NMI_VECTOR); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); Index: linux/arch/i386/kernel/paravirt.c =================================================================== --- linux.orig/arch/i386/kernel/paravirt.c +++ linux/arch/i386/kernel/paravirt.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -34,6 +36,10 @@ #include #include +#include +#include +#include + /* nop stub */ static void native_nop(void) { @@ -93,7 +99,7 @@ static unsigned native_patch(u8 type, u1 return insn_len; } -static unsigned long native_get_debugreg(int regno) +static fastcall unsigned long native_get_debugreg(int regno) { unsigned long val = 0; /* Damn you, gcc! 
*/ @@ -116,7 +122,7 @@ static unsigned long native_get_debugreg return val; } -static void native_set_debugreg(int regno, unsigned long value) +static fastcall void native_set_debugreg(int regno, unsigned long value) { switch (regno) { case 0: @@ -147,55 +153,55 @@ void init_IRQ(void) paravirt_ops.init_IRQ(); } -static void native_clts(void) +static fastcall void native_clts(void) { asm volatile ("clts"); } -static unsigned long native_read_cr0(void) +static fastcall unsigned long native_read_cr0(void) { unsigned long val; asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); return val; } -static void native_write_cr0(unsigned long val) +static fastcall void native_write_cr0(unsigned long val) { asm volatile("movl %0,%%cr0": :"r" (val)); } -static unsigned long native_read_cr2(void) +static fastcall unsigned long native_read_cr2(void) { unsigned long val; asm volatile("movl %%cr2,%0\n\t" :"=r" (val)); return val; } -static void native_write_cr2(unsigned long val) +static fastcall void native_write_cr2(unsigned long val) { asm volatile("movl %0,%%cr2": :"r" (val)); } -static unsigned long native_read_cr3(void) +static fastcall unsigned long native_read_cr3(void) { unsigned long val; asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); return val; } -static void native_write_cr3(unsigned long val) +static fastcall void native_write_cr3(unsigned long val) { asm volatile("movl %0,%%cr3": :"r" (val)); } -static unsigned long native_read_cr4(void) +static fastcall unsigned long native_read_cr4(void) { unsigned long val; asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); return val; } -static unsigned long native_read_cr4_safe(void) +static fastcall unsigned long native_read_cr4_safe(void) { unsigned long val; /* This could fault if %cr4 does not exist */ @@ -208,51 +214,51 @@ static unsigned long native_read_cr4_saf return val; } -static void native_write_cr4(unsigned long val) +static fastcall void native_write_cr4(unsigned long val) { asm volatile("movl %0,%%cr4": :"r" (val)); } -static unsigned long native_save_fl(void) +static fastcall unsigned long native_save_fl(void) { unsigned long f; asm volatile("pushfl ; popl %0":"=g" (f): /* no input */); return f; } -static void native_restore_fl(unsigned long f) +static fastcall void native_restore_fl(unsigned long f) { asm volatile("pushl %0 ; popfl": /* no output */ :"g" (f) :"memory", "cc"); } -static void native_irq_disable(void) +static fastcall void native_irq_disable(void) { asm volatile("cli": : :"memory"); } -static void native_irq_enable(void) +static fastcall void native_irq_enable(void) { asm volatile("sti": : :"memory"); } -static void native_safe_halt(void) +static fastcall void native_safe_halt(void) { asm volatile("sti; hlt": : :"memory"); } -static void native_halt(void) +static fastcall void native_halt(void) { asm volatile("hlt": : :"memory"); } -static void native_wbinvd(void) +static fastcall void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } -static unsigned long long native_read_msr(unsigned int msr, int *err) +static fastcall unsigned long long native_read_msr(unsigned int msr, int *err) { unsigned long long val; @@ -271,7 +277,7 @@ static unsigned long long native_read_ms return val; } -static int native_write_msr(unsigned int msr, unsigned long long val) +static fastcall int native_write_msr(unsigned int msr, unsigned long long val) { int err; asm volatile("2: wrmsr ; xorl %0,%0\n" @@ -289,82 +295,82 @@ static int native_write_msr(unsigned int return err; } -static unsigned long long native_read_tsc(void) +static fastcall 
notrace unsigned long long native_read_tsc(void) { unsigned long long val; asm volatile("rdtsc" : "=A" (val)); return val; } -static unsigned long long native_read_pmc(void) +static fastcall unsigned long long native_read_pmc(void) { unsigned long long val; asm volatile("rdpmc" : "=A" (val)); return val; } -static void native_load_tr_desc(void) +static fastcall void native_load_tr_desc(void) { asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } -static void native_load_gdt(const struct Xgt_desc_struct *dtr) +static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr) { asm volatile("lgdt %0"::"m" (*dtr)); } -static void native_load_idt(const struct Xgt_desc_struct *dtr) +static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr) { asm volatile("lidt %0"::"m" (*dtr)); } -static void native_store_gdt(struct Xgt_desc_struct *dtr) +static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr) { asm ("sgdt %0":"=m" (*dtr)); } -static void native_store_idt(struct Xgt_desc_struct *dtr) +static fastcall void native_store_idt(struct Xgt_desc_struct *dtr) { asm ("sidt %0":"=m" (*dtr)); } -static unsigned long native_store_tr(void) +static fastcall unsigned long native_store_tr(void) { unsigned long tr; asm ("str %0":"=r" (tr)); return tr; } -static void native_load_tls(struct thread_struct *t, unsigned int cpu) +static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu) { #define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i] C(0); C(1); C(2); #undef C } -static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) +static fastcall inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high) { u32 *lp = (u32 *)((char *)dt + entry*8); lp[0] = entry_low; lp[1] = entry_high; } -static void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) +static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) +static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) +static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high) { native_write_dt_entry(dt, entrynum, low, high); } -static void native_load_esp0(struct tss_struct *tss, +static fastcall void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) { tss->esp0 = thread->esp0; @@ -376,12 +382,12 @@ static void native_load_esp0(struct tss_ } } -static void native_io_delay(void) +static fastcall void native_io_delay(void) { asm volatile("outb %al,$0x80"); } -static void native_flush_tlb(void) +static fastcall void native_flush_tlb(void) { __native_flush_tlb(); } @@ -390,49 +396,49 @@ static void native_flush_tlb(void) * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. 
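 * A global flush works by toggling CR4.PGE, which invalidates global TLB * entries as a side effect; kvm_flush_tlb_kernel() further below performs * the same CR4 dance around a cached cr3 reload.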
*/ -static void native_flush_tlb_global(void) +static fastcall void native_flush_tlb_global(void) { __native_flush_tlb_global(); } -static void native_flush_tlb_single(u32 addr) +static fastcall void native_flush_tlb_single(u32 addr) { __native_flush_tlb_single(addr); } #ifndef CONFIG_X86_PAE -static void native_set_pte(pte_t *ptep, pte_t pteval) +static fastcall void native_set_pte(pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval) { *ptep = pteval; } -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { *pmdp = pmdval; } #else /* CONFIG_X86_PAE */ -static void native_set_pte(pte_t *ptep, pte_t pte) +static fastcall void native_set_pte(pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) +static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte) { ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } -static void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) +static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { ptep->pte_low = 0; smp_wmb(); @@ -441,29 +447,29 @@ static void native_set_pte_present(struc ptep->pte_low = pte.pte_low; } -static void native_set_pte_atomic(pte_t *ptep, pte_t pteval) +static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval) { set_64bit((unsigned long long *)ptep,pte_val(pteval)); } -static void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) +static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval) { set_64bit((unsigned long long *)pmdp,pmd_val(pmdval)); } -static void native_set_pud(pud_t *pudp, pud_t pudval) +static fastcall void native_set_pud(pud_t *pudp, pud_t pudval) { *pudp = pudval; } -static void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; } -static void native_pmd_clear(pmd_t *pmd) +static fastcall void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; *tmp = 0; @@ -586,3 +592,201 @@ struct paravirt_ops paravirt_ops = { * all lowlevel CPU ops used by modules are separately exported. */ EXPORT_SYMBOL_GPL(paravirt_ops); + +/* + * KVM paravirtualization optimizations: + */ +int __read_mostly kvm_paravirt = 1; + +static int __init kvm_paravirt_setup(char *str) +{ + kvm_paravirt = simple_strtoul(str, NULL, 0); + return 1; +} +__setup("kvm_paravirt=", kvm_paravirt_setup); + +/* + * No need for any "IO delay" on KVM: + */ +static void fastcall kvm_io_delay(void) +{ +} + +static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state); +static DEFINE_PER_CPU(struct kvm_cr3_cache, cr3_cache); + +/* + * Special, register-to-cr3 instruction based hypercall API + * variant to the KVM host. This utilizes the cr3 filter capability + * of the hardware - if this works out then no VM exit happens, + * if a VM exit happens then KVM will get the virtual address too. 
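+ * A cache hit therefore costs one ordinary cr3 load; only a miss falls back + * to the __NR_hypercall_load_cr3 hypercall, after which the host refreshes + * the per-VCPU cache.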
+ */ +static void fastcall kvm_write_cr3(unsigned long guest_cr3) +{ + struct kvm_cr3_cache *cache = &get_cpu_var(cr3_cache); + int idx, ret; + + /* + * Check the cache (maintained by the host) for a matching + * guest_cr3 => host_cr3 mapping. Use it if found: + */ + for (idx = 0; idx < cache->entry_count; idx++) { + if (cache->entry[idx].guest_cr3 == guest_cr3) { + /* + * Cache-hit: we load the cached host-CR3 value. + * This never causes any VM exit. (if it does then the + * hypervisor could do nothing with this instruction + * and the guest OS would be aborted) + */ + asm volatile("movl %0, %%cr3" + : : "r" (cache->entry[idx].host_cr3)); + goto out; + } + } + + /* + * Cache-miss. Use the cr3 hypercall to load the guest-physical + * address (the host will also update the cr3 cache): + */ + ret = hypercall(1, __NR_hypercall_load_cr3, guest_cr3); + WARN_ON(ret); +out: + put_cpu_var(cr3_cache); +} + +static fastcall void kvm_release_pd(u32 pfn) +{ + long ret; + + ret = hypercall(1, __NR_hypercall_flush_cr3_cache, pfn << PAGE_SHIFT); + WARN_ON(ret); +} + +#ifdef CONFIG_EVENT_TRACE +void trace_hyper(unsigned long v1, unsigned long v2, unsigned long v3) +{ + long ret; + + ret = hypercall(4, __NR_hypercall_trace, CALLER_ADDR0, v1, v2, v3); + WARN_ON(ret); +} +#endif + +/* + * Avoid the VM exit upon cr3 load by using the cached + * ->active_mm->pgd value: + */ +static void fastcall kvm_flush_tlb_user(void) +{ + kvm_write_cr3(__pa(current->active_mm->pgd)); +} + +static void fastcall kvm_flush_tlb_single(u32 addr) +{ + __native_flush_tlb_single(addr); +} +/* + * Disable global pages, do a flush, then enable global pages: + */ +static fastcall void kvm_flush_tlb_kernel(void) +{ + unsigned long orig_cr4 = read_cr4(); + + write_cr4(orig_cr4 & ~X86_CR4_PGE); + kvm_flush_tlb_user(); + write_cr4(orig_cr4); +} + +/* + * This is the vm-syscall address - to be patched by the host to + * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model: + */ +asm ( + " .globl hypercall_addr \n" + " .align 4 \n" + " hypercall_addr: \n" + " movl $-38, %eax \n" + " ret \n" +); + +extern unsigned char hypercall_addr[6]; + +static void test_hypercall(void) +{ + int ret; + unsigned long dummy = 123456; + + ret = hypercall(1, __NR_hypercall_test, dummy); + pr_debug("hypercall test #1, ret: 0x%x\n", ret); +} + +int kvm_guest_register_para(int cpu) +{ + struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu); + struct kvm_cr3_cache *cr3_cache = &per_cpu(cr3_cache, cpu); + + printk(KERN_DEBUG "kvm guest on VCPU#%d: trying to register para_state %p\n", + cpu, para_state); + /* + * Try to write to a magic MSR (which is invalid on any real CPU), + * and thus signal to KVM that we wish to enter paravirtualized + * mode: + */ + para_state->guest_version = KVM_PARA_API_VERSION; + para_state->host_version = -1; + para_state->size = sizeof(*para_state); + para_state->ret = -1; + para_state->hypercall_gpa = __pa(hypercall_addr); + cr3_cache->entry_count = KVM_CR3_CACHE_SIZE; + para_state->cr3_cache_gpa = __pa(cr3_cache); + + if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) { + printk(KERN_INFO "KVM guest: WRMSR probe failed.\n"); + return 0; + } + + printk(KERN_DEBUG "kvm guest: host returned %d\n", para_state->ret); + printk(KERN_DEBUG "kvm guest: host version: %d\n", para_state->host_version); + printk(KERN_DEBUG "kvm guest: cr3 cache size: %d\n", cr3_cache->entry_count); + printk(KERN_DEBUG "kvm guest: syscall entry: %02x %02x %02x %02x\n", + hypercall_addr[0], hypercall_addr[1], + hypercall_addr[2],
hypercall_addr[3]); + if (para_state->ret) { + printk(KERN_ERR "kvm guest: host refused registration.\n"); + return 0; + } + test_hypercall(); + + return 1; +} + +asmlinkage void __init kvm_probe(void) +{ + /* + * Did the user disable it explicitly via kvm_paravirt=0? + */ + if (!kvm_paravirt) + return; + + printk(KERN_DEBUG "KVM paravirtualization probe\n"); + + kvm_paravirt = kvm_guest_register_para(smp_processor_id()); + if (!kvm_paravirt) { + printk(KERN_INFO "Not a KVM para-guest\n"); + return; + } + + printk(KERN_INFO "KVM para-guest: OK\n"); + + paravirt_ops.name = "KVM"; + paravirt_ops.io_delay = kvm_io_delay; + paravirt_ops.flush_tlb_user = kvm_flush_tlb_user; + paravirt_ops.flush_tlb_kernel = kvm_flush_tlb_kernel; + paravirt_ops.flush_tlb_single = kvm_flush_tlb_single; + paravirt_ops.write_cr3 = kvm_write_cr3; + paravirt_ops.release_pd = kvm_release_pd; + + paravirt_ops.paravirt_enabled = 1; +} + +paravirt_probe(kvm_probe); Index: linux/arch/i386/kernel/process.c =================================================================== --- linux.orig/arch/i386/kernel/process.c +++ linux/arch/i386/kernel/process.c @@ -110,7 +110,7 @@ void default_idle(void) smp_mb(); local_irq_disable(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); /* enables interrupts racelessly */ else local_irq_enable(); @@ -175,7 +175,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { tick_nohz_stop_sched_tick(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -193,10 +193,12 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } + local_irq_disable(); tick_nohz_restart_sched_tick(); - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -242,10 +244,10 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -363,15 +365,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux/arch/i386/kernel/quirks.c =================================================================== --- linux.orig/arch/i386/kernel/quirks.c +++ linux/arch/i386/kernel/quirks.c @@ -3,12 +3,10 @@ */ #include #include -#include -#include -#include #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) -static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) + +static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) { u8 config, rev; u32 word; @@ -16,12 +14,14 @@ static 
void __devinit verify_quirk_intel /* BIOS may enable hardware IRQ balancing for * E7520/E7320/E7525(revision ID 0x9 and below) * based platforms. - * For those platforms, make sure that the genapic is set to 'flat' + * Disable SW irqbalance/affinity on those platforms. */ pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); if (rev > 0x9) return; + printk(KERN_INFO "Intel E7520/7320/7525 detected."); + /* enable access to config space*/ pci_read_config_byte(dev, 0xf4, &config); pci_write_config_byte(dev, 0xf4, config|0x2); @@ -30,44 +30,6 @@ static void __devinit verify_quirk_intel raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); if (!(word & (1 << 13))) { -#ifdef CONFIG_X86_64 - if (genapic != &apic_flat) - panic("APIC mode must be flat on this system\n"); -#elif defined(CONFIG_X86_GENERICARCH) - if (genapic != &apic_default) - panic("APIC mode must be default(flat) on this system. Use apic=default\n"); -#endif - } - - /* put back the original value for config space*/ - if (!(config & 0x2)) - pci_write_config_byte(dev, 0xf4, config); -} - -void __init quirk_intel_irqbalance(void) -{ - u8 config, rev; - u32 word; - - /* BIOS may enable hardware IRQ balancing for - * E7520/E7320/E7525(revision ID 0x9 and below) - * based platforms. - * Disable SW irqbalance/affinity on those platforms. - */ - rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); - if (rev > 0x9) - return; - - printk(KERN_INFO "Intel E7520/7320/7525 detected."); - - /* enable access to config space */ - config = read_pci_config_byte(0, 0, 0, 0xf4); - write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); - - /* read xTPR register */ - word = read_pci_config_16(0, 0, 0x40, 0x4c); - - if (!(word & (1 << 13))) { printk(KERN_INFO "Disabling irq balancing and affinity\n"); #ifdef CONFIG_IRQBALANCE irqbalance_disable(""); @@ -76,24 +38,13 @@ void __init quirk_intel_irqbalance(void) #ifdef CONFIG_PROC_FS no_irq_affinity = 1; #endif -#ifdef CONFIG_HOTPLUG_CPU - printk(KERN_INFO "Disabling cpu hotplug control\n"); - enable_cpu_hotplug = 0; -#endif -#ifdef CONFIG_X86_64 - /* force the genapic selection to flat mode so that - * interrupts can be redirected to more than one CPU. 
- */ - genapic_force = &apic_flat; -#endif } - /* put back the original value for config space */ + /* put back the original value for config space*/ if (!(config & 0x2)) - write_pci_config_byte(0, 0, 0, 0xf4, config); + pci_write_config_byte(dev, 0xf4, config); } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); - +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); #endif Index: linux/arch/i386/kernel/reboot.c =================================================================== --- linux.orig/arch/i386/kernel/reboot.c +++ linux/arch/i386/kernel/reboot.c @@ -318,6 +318,22 @@ void machine_shutdown(void) void machine_emergency_restart(void) { + unsigned long ecx = cpuid_ecx(1); + + /* + * Disable any possibly active VT context (if VT supported): + */ + if (test_bit(5, &ecx)) { /* has VT support */ + asm volatile ( + "1: .byte 0x0f, 0x01, 0xc4 \n" /* vmxoff */ + "2: \n" + ".section __ex_table,\"a\" \n" + " .align 4 \n" + " .long 1b,2b \n" + ".previous \n" + ); + } + if (!reboot_thru_bios) { if (efi_enabled) { efi.reset_system(EFI_RESET_COLD, EFI_SUCCESS, 0, NULL); Index: linux/arch/i386/kernel/signal.c =================================================================== --- linux.orig/arch/i386/kernel/signal.c +++ linux/arch/i386/kernel/signal.c @@ -534,6 +534,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -574,6 +581,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/i386/kernel/smp.c =================================================================== --- linux.orig/arch/i386/kernel/smp.c +++ linux/arch/i386/kernel/smp.c @@ -255,7 +255,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -490,10 +490,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -598,13 +608,14 @@ void smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux/arch/i386/kernel/smpboot.c =================================================================== --- linux.orig/arch/i386/kernel/smpboot.c +++ linux/arch/i386/kernel/smpboot.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -1318,12 +1317,6 @@ int __cpuinit __cpu_up(unsigned int cpu) cpu_relax(); touch_nmi_watchdog(); } - -#ifdef CONFIG_X86_GENERICARCH - if (num_online_cpus() > 8 && genapic == &apic_default) - panic("Default flat APIC routing can't be used with > 8 cpus\n"); -#endif - return 0; } Index: linux/arch/i386/kernel/time.c =================================================================== --- linux.orig/arch/i386/kernel/time.c +++ linux/arch/i386/kernel/time.c @@ -126,7 +126,7 @@ static int set_rtc_mmss(unsigned long no int timer_ack; -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux/arch/i386/kernel/traps.c =================================================================== --- linux.orig/arch/i386/kernel/traps.c +++ linux/arch/i386/kernel/traps.c @@ -95,18 +95,23 @@ asmlinkage void machine_check(void); int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; -ATOMIC_NOTIFIER_HEAD(i386die_chain); +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif +RAW_NOTIFIER_HEAD(i386die_chain); int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_all(); - return atomic_notifier_chain_register(&i386die_chain, nb); + return raw_notifier_chain_register(&i386die_chain, nb); } EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&i386die_chain, nb); + return raw_notifier_chain_unregister(&i386die_chain, nb); } EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ @@ -148,6 +153,33 @@ static inline unsigned long print_contex return ebp; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static asmlinkage int +dump_trace_unwind(struct unwind_frame_info *info, void *data) +{ + struct ops_and_data *oad = (struct ops_and_data *)data; + int n = 0; + unsigned long sp = UNW_SP(info); + + if (arch_unw_user_mode(info)) + return -1; + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + oad->ops->address(oad->data, UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); + } + return n; +} + #define MSG(msg) ops->warning(data, msg) void dump_trace(struct task_struct *task, struct pt_regs *regs, @@ -159,6 +191,41 @@ void dump_trace(struct task_struct *task if (!task) task = current; + if (call_trace >= 0) { + int unw_ret = 0; + struct 
unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; + + if (regs) { + if (unwind_init_frame_info(&info, task, regs) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } else if (task == current) + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); + else { + if (unwind_init_blocked(&info, task) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s", + UNW_PC(&info)); + if (UNW_SP(&info) >= PAGE_OFFSET) { + MSG("Leftover inexact backtrace:"); + stack = (void *)UNW_SP(&info); + if (!stack) + return; + ebp = UNW_FP(&info); + } else + MSG("Full inexact backtrace again:"); + } else if (call_trace >= 1) + return; + else + MSG("Full inexact backtrace again:"); + } else + MSG("Inexact backtrace:"); + } if (!stack) { unsigned long dummy; stack = &dummy; @@ -236,6 +303,7 @@ show_trace_log_lvl(struct task_struct *t { dump_trace(task, regs, stack, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -265,8 +333,17 @@ static void show_stack_log_lvl(struct ta printk("\n%s ", log_lvl); printk("%08lx ", *stack++); } + + if (pause_on_oops_head) + for(;;); + printk("\n%sCall Trace:\n", log_lvl); show_trace_log_lvl(task, regs, esp, log_lvl); + + if (pause_on_oops_tail) + for(;;); + + debug_show_held_locks(task); } void show_stack(struct task_struct *task, unsigned long *esp) @@ -287,6 +364,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_EVENT_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void show_registers(struct pt_regs *regs) { int i; @@ -315,8 +398,8 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); + printk(KERN_EMERG "ds: %04x es: %04x fs: %04x gs: %04x ss: %04x preempt:%08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss, preempt_count()); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -376,11 +459,11 @@ int is_valid_bugaddr(unsigned long eip) void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = __SPIN_LOCK_UNLOCKED(die.lock), + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -488,6 +571,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -713,10 +801,11 @@ void __kprobes die_nmi(struct pt_regs *r crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static __kprobes void default_do_nmi(struct pt_regs * regs) +static notrace __kprobes void default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -754,11 +843,12 @@ static __kprobes void default_do_nmi(str reassert_nmi(); } -fastcall __kprobes void do_nmi(struct pt_regs 
* regs, long error_code) +fastcall notrace __kprobes void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); @@ -1195,6 +1285,22 @@ static int __init kstack_setup(char *s) } __setup("kstack=", kstack_setup); +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; + return 1; +} +__setup("call_trace=", call_trace_setup); +#endif + static int __init code_bytes_setup(char *s) { code_bytes = simple_strtoul(s, NULL, 0); Index: linux/arch/i386/kernel/tsc.c =================================================================== --- linux.orig/arch/i386/kernel/tsc.c +++ linux/arch/i386/kernel/tsc.c @@ -105,7 +105,7 @@ unsigned long long sched_clock(void) /* * Fall back to jiffies if there's no TSC available: */ - if (unlikely(tsc_disable)) + if (tsc_unstable || unlikely(tsc_disable)) /* No locking but a rare wrong value is not a big deal: */ return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ); @@ -259,7 +259,7 @@ core_initcall(cpufreq_tsc); static unsigned long current_tsc_khz = 0; -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret; Index: linux/arch/i386/kernel/vm86.c =================================================================== --- linux.orig/arch/i386/kernel/vm86.c +++ linux/arch/i386/kernel/vm86.c @@ -138,6 +138,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux/arch/i386/kernel/vmi.c =================================================================== --- linux.orig/arch/i386/kernel/vmi.c +++ linux/arch/i386/kernel/vmi.c @@ -23,7 +23,6 @@ */ #include -#include #include #include #include @@ -48,7 +47,6 @@ typedef u64 __attribute__((regparm(2))) (((VROMLONGFUNC *)(rom->func)) (arg)) static struct vrom_header *vmi_rom; -static int license_gplok; static int disable_pge; static int disable_pse; static int disable_sep; @@ -629,13 +627,14 @@ static inline int __init check_vmi_rom(s rom->api_version_maj, rom->api_version_min, pci->rom_version_maj, pci->rom_version_min); - license_gplok = license_is_gpl_compatible(license); - if (!license_gplok) { - printk(KERN_WARNING "VMI: ROM license '%s' taints kernel... " - "inlining disabled\n", - license); - add_taint(TAINT_PROPRIETARY_MODULE); - } + /* Don't allow BSD/MIT here for now because we don't want to end up + with any binary only shim layers */ + if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) { + printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM.
Not used.\n", + license); + return 0; + } + return 1; } Index: linux/arch/i386/lib/delay.c =================================================================== --- linux.orig/arch/i386/lib/delay.c +++ linux/arch/i386/lib/delay.c @@ -23,7 +23,7 @@ #endif /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static notrace void delay_loop(unsigned long loops) { int d0; @@ -38,7 +38,7 @@ static void delay_loop(unsigned long loo } /* TSC based delay: */ -static void delay_tsc(unsigned long loops) +static notrace void delay_tsc(unsigned long loops) { unsigned long bclock, now; @@ -69,7 +69,7 @@ int read_current_timer(unsigned long *ti return -1; } -void __delay(unsigned long loops) +void notrace __delay(unsigned long loops) { delay_fn(loops); } Index: linux/arch/i386/lib/semaphore.S =================================================================== --- linux.orig/arch/i386/lib/semaphore.S +++ linux/arch/i386/lib/semaphore.S @@ -30,7 +30,7 @@ * value or just clobbered.. */ .section .sched.text -ENTRY(__down_failed) +ENTRY(__compat_down_failed) CFI_STARTPROC FRAME pushl %edx @@ -39,7 +39,7 @@ ENTRY(__down_failed) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down + call __compat_down popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -49,9 +49,9 @@ ENTRY(__down_failed) ENDFRAME ret CFI_ENDPROC - END(__down_failed) + END(__compat_down_failed) -ENTRY(__down_failed_interruptible) +ENTRY(__compat_down_failed_interruptible) CFI_STARTPROC FRAME pushl %edx @@ -60,7 +60,7 @@ ENTRY(__down_failed_interruptible) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_interruptible + call __compat_down_interruptible popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -70,9 +70,9 @@ ENTRY(__down_failed_interruptible) ENDFRAME ret CFI_ENDPROC - END(__down_failed_interruptible) + END(__compat_down_failed_interruptible) -ENTRY(__down_failed_trylock) +ENTRY(__compat_down_failed_trylock) CFI_STARTPROC FRAME pushl %edx @@ -81,7 +81,7 @@ ENTRY(__down_failed_trylock) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __down_trylock + call __compat_down_trylock popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -91,9 +91,9 @@ ENTRY(__down_failed_trylock) ENDFRAME ret CFI_ENDPROC - END(__down_failed_trylock) + END(__compat_down_failed_trylock) -ENTRY(__up_wakeup) +ENTRY(__compat_up_wakeup) CFI_STARTPROC FRAME pushl %edx @@ -102,7 +102,7 @@ ENTRY(__up_wakeup) pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx,0 - call __up + call __compat_up popl %ecx CFI_ADJUST_CFA_OFFSET -4 CFI_RESTORE ecx @@ -112,7 +112,7 @@ ENTRY(__up_wakeup) ENDFRAME ret CFI_ENDPROC - END(__up_wakeup) + END(__compat_up_wakeup) /* * rw spinlock fallbacks Index: linux/arch/i386/mach-default/setup.c =================================================================== --- linux.orig/arch/i386/mach-default/setup.c +++ linux/arch/i386/mach-default/setup.c @@ -35,7 +35,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -81,7 +81,7 @@ void __init trap_init_hook(void) static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED | IRQF_NOBALANCING, + .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "timer" }; Index: 
linux/arch/i386/mach-visws/setup.c =================================================================== --- linux.orig/arch/i386/mach-visws/setup.c +++ linux/arch/i386/mach-visws/setup.c @@ -116,7 +116,7 @@ void __init pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer", }; Index: linux/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux.orig/arch/i386/mach-visws/visws_apic.c +++ linux/arch/i386/mach-visws/visws_apic.c @@ -258,11 +258,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: linux/arch/i386/mach-voyager/setup.c =================================================================== --- linux.orig/arch/i386/mach-voyager/setup.c +++ linux/arch/i386/mach-voyager/setup.c @@ -18,7 +18,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: linux/arch/i386/mm/fault.c =================================================================== --- linux.orig/arch/i386/mm/fault.c +++ linux/arch/i386/mm/fault.c @@ -294,8 +294,8 @@ static inline int vmalloc_fault(unsigned * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -306,6 +306,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; @@ -350,7 +351,7 @@ fastcall void __kprobes do_page_fault(st * If we're in an interrupt, have no user context or are running in an * atomic region then we must not take the fault.. 
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; /* When running in the kernel we expect faults to occur only to @@ -483,6 +484,9 @@ bad_area_nosemaphore: nr = (address - idt_descr.address) >> 3; if (nr == 6) { + stop_trace(); + user_trace_stop(); + zap_rt_locks(); do_invalid_op(regs, 0); return; } Index: linux/arch/i386/mm/highmem.c =================================================================== --- linux.orig/arch/i386/mm/highmem.c +++ linux/arch/i386/mm/highmem.c @@ -3,9 +3,9 @@ void *kmap(struct page *page) { - might_sleep(); if (!PageHighMem(page)) return page_address(page); + might_sleep(); return kmap_high(page); } @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -46,7 +66,7 @@ void *kmap_atomic(struct page *page, enu return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); @@ -72,7 +92,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it.
*/ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -86,7 +106,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -101,6 +121,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux/arch/i386/mm/init.c =================================================================== --- linux.orig/arch/i386/mm/init.c +++ linux/arch/i386/mm/init.c @@ -45,7 +45,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -194,7 +194,7 @@ static inline int page_kills_ppro(unsign return 0; } -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux/arch/i386/mm/pgtable.c =================================================================== --- linux.orig/arch/i386/mm/pgtable.c +++ linux/arch/i386/mm/pgtable.c @@ -215,7 +215,7 @@ void pmd_ctor(void *pmd, struct kmem_cac * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux/arch/i386/oprofile/Kconfig =================================================================== --- linux.orig/arch/i386/oprofile/Kconfig +++ linux/arch/i386/oprofile/Kconfig @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. +config PROFILE_NMI + bool + default y Index: linux/arch/i386/pci/Makefile =================================================================== --- linux.orig/arch/i386/pci/Makefile +++ linux/arch/i386/pci/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux/arch/i386/pci/common.c =================================================================== --- linux.orig/arch/i386/pci/common.c +++ linux/arch/i386/pci/common.c @@ -52,7 +52,7 @@ int pcibios_scanned; * This interrupt-safe spinlock protects all accesses to PCI * configuration space. 
*/ -DEFINE_SPINLOCK(pci_config_lock); +DEFINE_RAW_SPINLOCK(pci_config_lock); /* * Several buggy motherboards address only 16 devices and mirror Index: linux/arch/i386/pci/direct.c =================================================================== --- linux.orig/arch/i386/pci/direct.c +++ linux/arch/i386/pci/direct.c @@ -220,16 +220,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux/arch/i386/pci/pci.h =================================================================== --- linux.orig/arch/i386/pci/pci.h +++ linux/arch/i386/pci/pci.h @@ -78,7 +78,7 @@ struct irq_routing_table { extern unsigned int pcibios_irq_mask; extern int pcibios_scanned; -extern spinlock_t pci_config_lock; +extern raw_spinlock_t pci_config_lock; extern int (*pcibios_enable_irq)(struct pci_dev *dev); extern void (*pcibios_disable_irq)(struct pci_dev *dev); Index: linux/arch/ia64/Kconfig =================================================================== --- linux.orig/arch/ia64/Kconfig +++ linux/arch/ia64/Kconfig @@ -38,6 +38,7 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config ARCH_HAS_ILOG2_U32 @@ -265,6 +266,69 @@ config SMP If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (a small number) will usually not + be observable due to normal system latencies. For an + 800 MHz processor about 10,000 is the recommended maximum + (smallest number).
If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high precision interruptions on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -317,17 +381,15 @@ config FORCE_CPEI_RETARGET This option it useful to enable this feature on older BIOS's as well. You can also enable this by using boot command line option force_cpei=1. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. +config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" Index: linux/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/ia64/kernel/asm-offsets.c +++ linux/arch/ia64/kernel/asm-offsets.c @@ -255,6 +255,7 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); @@ -269,4 +270,5 @@ void foo(void) DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); +#endif } Index: linux/arch/ia64/kernel/entry.S =================================================================== --- linux.orig/arch/ia64/kernel/entry.S +++ linux/arch/ia64/kernel/entry.S @@ -1101,23 +1101,24 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? +(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? 
(p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 +.ret9a: rsm psr.i // disable interrupts ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux/arch/ia64/kernel/fsys.S =================================================================== --- linux.orig/arch/ia64/kernel/fsys.S +++ linux/arch/ia64/kernel/fsys.S @@ -26,6 +26,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls. * @@ -350,6 +351,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ Index: linux/arch/ia64/kernel/iosapic.c =================================================================== --- linux.orig/arch/ia64/kernel/iosapic.c +++ linux/arch/ia64/kernel/iosapic.c @@ -112,7 +112,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -430,6 +430,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -441,10 +469,12 @@ iosapic_end_level_irq (unsigned int irq) iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct irq_chip irq_type_iosapic_level = { .name = "IO-SAPIC-level", Index: linux/arch/ia64/kernel/mca.c =================================================================== --- linux.orig/arch/ia64/kernel/mca.c +++ linux/arch/ia64/kernel/mca.c @@ -320,7 +320,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: linux/arch/ia64/kernel/perfmon.c =================================================================== --- linux.orig/arch/ia64/kernel/perfmon.c +++ linux/arch/ia64/kernel/perfmon.c @@ -281,7 +281,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) */ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -370,7 +370,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -511,7 +511,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux/arch/ia64/kernel/process.c =================================================================== --- linux.orig/arch/ia64/kernel/process.c +++ linux/arch/ia64/kernel/process.c @@ -95,6 +95,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. 
entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -198,7 +201,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -280,7 +283,7 @@ cpu_idle (void) current_thread_info()->status |= TS_POLLING; } - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -302,10 +305,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux/arch/ia64/kernel/sal.c =================================================================== --- linux.orig/arch/ia64/kernel/sal.c +++ linux/arch/ia64/kernel/sal.c @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux/arch/ia64/kernel/salinfo.c =================================================================== --- linux.orig/arch/ia64/kernel/salinfo.c +++ linux/arch/ia64/kernel/salinfo.c @@ -141,7 +141,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -157,8 +157,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. Index: linux/arch/ia64/kernel/semaphore.c =================================================================== --- linux.orig/arch/ia64/kernel/semaphore.c +++ linux/arch/ia64/kernel/semaphore.c @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux/arch/ia64/kernel/signal.c =================================================================== --- linux.orig/arch/ia64/kernel/signal.c +++ linux/arch/ia64/kernel/signal.c @@ -487,6 +487,14 @@ ia64_do_signal (sigset_t *oldset, struct long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. 
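A note on the pattern that recurs in the idle-loop hunks above (i386 default_idle and mwait_idle_with_hints, ia64 default_idle and cpu_idle): a CPU may only halt when neither an immediate nor a delayed reschedule is pending, and the test has to be repeated with interrupts off so that a wakeup IPI cannot slip in between the check and the halt. Below is a minimal sketch of that idiom, not part of the patch, assuming the need_resched_delayed() helper and the atomically interrupt-enabling safe_halt() used in the surrounding hunks (the function name rt_idle_halt is illustrative only):

static void rt_idle_halt(void)
{
	local_irq_disable();
	/*
	 * Re-check both flags with interrupts disabled: a reschedule
	 * IPI landing between the test and the halt would otherwise
	 * be lost until the next timer tick.
	 */
	if (!need_resched() && !need_resched_delayed())
		safe_halt();	/* sti; hlt - enables interrupts and halts atomically */
	else
		local_irq_enable();
}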
Just return without Index: linux/arch/ia64/kernel/smp.c =================================================================== --- linux.orig/arch/ia64/kernel/smp.c +++ linux/arch/ia64/kernel/smp.c @@ -248,6 +248,22 @@ smp_send_reschedule (int cpu) platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); + } +} + + void smp_flush_tlb_all (void) { Index: linux/arch/ia64/kernel/smpboot.c =================================================================== --- linux.orig/arch/ia64/kernel/smpboot.c +++ linux/arch/ia64/kernel/smpboot.c @@ -371,6 +371,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __devinit smp_callin (void) { @@ -430,6 +432,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. Index: linux/arch/ia64/kernel/time.c =================================================================== --- linux.orig/arch/ia64/kernel/time.c +++ linux/arch/ia64/kernel/time.c @@ -55,6 +55,7 @@ timer_interrupt (int irq, void *dev_id) platform_timer_interrupt(irq, dev_id); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) @@ -62,29 +63,48 @@ timer_interrupt (int irq, void *dev_id) ia64_get_itc(), new_itm); profile_tick(CPU_PROFILING); +#endif + + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - while (1) { - update_process_times(user_mode(get_irq_regs())); + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - new_itm += local_cpu_data->itm_delta; + profile_tick(CPU_PROFILING, get_irq_regs()); - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(1); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + while (1) { + update_process_times(user_mode(get_irq_regs())); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(get_irq_regs()); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(get_irq_regs()); + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * Allow IPIs to interrupt the timer loop. 
*/ @@ -102,8 +122,8 @@ timer_interrupt (int irq, void *dev_id) * too fast (with the potentially devastating effect * of losing monotony of time). */ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -122,7 +142,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -131,8 +151,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -190,7 +210,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -210,6 +230,7 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<itc_freq; itc_interpolator.drift = itc_drift; @@ -228,6 +249,7 @@ ia64_init_itm (void) #endif register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); @@ -235,7 +257,7 @@ ia64_init_itm (void) static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer" }; @@ -256,6 +278,8 @@ time_init (void) * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). */ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* Index: linux/arch/ia64/kernel/traps.c =================================================================== --- linux.orig/arch/ia64/kernel/traps.c +++ linux/arch/ia64/kernel/traps.c @@ -55,11 +55,11 @@ void die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -196,7 +196,7 @@ __kprobes ia64_bad_break (unsigned long * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. 
*/ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -215,7 +215,7 @@ disabled_fph_fault (struct pt_regs *regs = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -235,7 +235,7 @@ disabled_fph_fault (struct pt_regs *regs */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux/arch/ia64/kernel/unwind.c =================================================================== --- linux.orig/arch/ia64/kernel/unwind.c +++ linux/arch/ia64/kernel/unwind.c @@ -81,7 +81,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -145,7 +145,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux/arch/ia64/kernel/unwind_i.h =================================================================== --- linux.orig/arch/ia64/kernel/unwind_i.h +++ linux/arch/ia64/kernel/unwind_i.h @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux/arch/ia64/mm/init.c =================================================================== --- linux.orig/arch/ia64/mm/init.c +++ linux/arch/ia64/mm/init.c @@ -37,7 +37,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); DEFINE_PER_CPU(long, __pgtable_quicklist_size); @@ -93,15 +93,11 @@ check_pgt_cache(void) if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES)) return; - preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { free_page((unsigned long)pgtable_quicklist_alloc()); } - preempt_enable(); - preempt_disable(); } - preempt_enable(); } void Index: linux/arch/ia64/mm/tlb.c =================================================================== --- linux.orig/arch/ia64/mm/tlb.c +++ linux/arch/ia64/mm/tlb.c @@ -32,7 +32,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .next = 1, .max_ctx = ~0U }; Index: linux/arch/ia64/sn/kernel/io_init.c =================================================================== --- linux.orig/arch/ia64/sn/kernel/io_init.c +++ linux/arch/ia64/sn/kernel/io_init.c @@ -247,10 +247,18 @@ sn_io_slot_fixup(struct pci_dev *dev) addr = ((addr << 4) >> 4) | __IA64_UNCACHED_OFFSET; dev->resource[idx].start = addr; dev->resource[idx].end = addr + size; + + /* + * if it's already in the device structure, remove it before + * inserting + */ + if (dev->resource[idx].parent && dev->resource[idx].parent->child) + release_resource(&dev->resource[idx]); + if 
(dev->resource[idx].flags & IORESOURCE_IO) - dev->resource[idx].parent = &ioport_resource; + insert_resource(&ioport_resource, &dev->resource[idx]); else - dev->resource[idx].parent = &iomem_resource; + insert_resource(&iomem_resource, &dev->resource[idx]); /* If ROM, mark as shadowed in PROM */ if (idx == PCI_ROM_RESOURCE) dev->resource[idx].flags |= IORESOURCE_ROM_BIOS_COPY; Index: linux/arch/mips/Kconfig =================================================================== --- linux.orig/arch/mips/Kconfig +++ linux/arch/mips/Kconfig @@ -250,7 +250,7 @@ config LASAT select R5000_CPU_SCACHE select SYS_HAS_CPU_R5000 select SYS_SUPPORTS_32BIT_KERNEL - select SYS_SUPPORTS_64BIT_KERNEL if EXPERIMENTAL + select SYS_SUPPORTS_64BIT_KERNEL if BROKEN select SYS_SUPPORTS_LITTLE_ENDIAN select GENERIC_HARDIRQS_NO__DO_IRQ @@ -395,6 +395,7 @@ config MOMENCO_JAGUAR_ATX config MOMENCO_OCELOT bool "Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -542,6 +543,8 @@ config QEMU select SYS_SUPPORTS_LITTLE_ENDIAN select ARCH_SPARSEMEM_ENABLE select GENERIC_HARDIRQS_NO__DO_IRQ + select NR_CPUS_DEFAULT_1 + select SYS_SUPPORTS_SMP help Qemu is a software emulator which among other architectures also can simulate a MIPS32 4Kc system. This patch adds support for the @@ -836,6 +839,7 @@ source "arch/mips/philips/pnx8550/common endmenu + config RWSEM_GENERIC_SPINLOCK bool default y @@ -843,6 +847,10 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool +config ASM_SEMAPHORES + bool + default y + config ARCH_HAS_ILOG2_U32 bool default n @@ -901,6 +909,9 @@ config DMA_NONCOHERENT config DMA_NEED_PCI_MAP_STATE bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool "Early printk" if EMBEDDED && DEBUG_KERNEL depends on SYS_HAS_EARLY_PRINTK @@ -1559,6 +1570,7 @@ config MIPS_MT_SMP select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_SRS select MIPS_MT + select NR_CPUS_DEFAULT_2 select SMP select SYS_SUPPORTS_SMP help @@ -1573,7 +1585,6 @@ config MIPS_MT_SMTC select CPU_MIPSR2_IRQ_VI select CPU_MIPSR2_SRS select MIPS_MT - select NR_CPUS_DEFAULT_2 select NR_CPUS_DEFAULT_8 select SMP select SYS_SUPPORTS_SMP @@ -1805,6 +1816,9 @@ config SMP config SYS_SUPPORTS_SMP bool +config NR_CPUS_DEFAULT_1 + bool + config NR_CPUS_DEFAULT_2 bool @@ -1825,8 +1839,9 @@ config NR_CPUS_DEFAULT_64 config NR_CPUS int "Maximum number of CPUs (2-64)" - range 2 64 + range 1 64 if NR_CPUS_DEFAULT_1 depends on SMP + default "1" if NR_CPUS_DEFAULT_1 default "2" if NR_CPUS_DEFAULT_2 default "4" if NR_CPUS_DEFAULT_4 default "8" if NR_CPUS_DEFAULT_8 @@ -1837,10 +1852,13 @@ config NR_CPUS This allows you to specify the maximum number of CPUs which this kernel will support. The maximum supported value is 32 for 32-bit kernel and 64 for 64-bit kernels; the minimum value which makes - sense is 2. + sense is 1 for Qemu (useful only for kernel debugging purposes) + and 2 for all others. This is purely to save memory - each supported CPU adds - approximately eight kilobytes to the kernel image. + approximately eight kilobytes to the kernel image. For best + performance you should round up your number of processors to the next + power of two. # # Timer Interrupt Frequency Configuration @@ -1959,12 +1977,17 @@ config SECCOMP If unsure, say Y. Only embedded should say N here.
-endmenu - -config RWSEM_GENERIC_SPINLOCK +config GENERIC_TIME bool default y +source "kernel/time/Kconfig" + +config CPU_SPEED + int "CPU speed used for clocksource/clockevent calculations" + default 600 +endmenu + config LOCKDEP_SUPPORT bool default y Index: linux/arch/mips/ddb5xxx/ddb5477/irq.c =================================================================== --- linux.orig/arch/mips/ddb5xxx/ddb5477/irq.c +++ linux/arch/mips/ddb5xxx/ddb5477/irq.c @@ -194,7 +194,7 @@ static void vrc5477_irq_dispatch(void) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_cause() & read_c0_status(); + unsigned int pending = read_c0_cause() & read_c0_status() & ST0_IM; if (pending & STATUSF_IP7) do_IRQ(CPU_IRQ_BASE + 7); Index: linux/arch/mips/emma2rh/markeins/irq.c =================================================================== --- linux.orig/arch/mips/emma2rh/markeins/irq.c +++ linux/arch/mips/emma2rh/markeins/irq.c @@ -115,7 +115,7 @@ void __init arch_init_irq(void) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; if (pending & STATUSF_IP7) do_IRQ(CPU_IRQ_BASE + 7); Index: linux/arch/mips/gt64120/ev64120/irq.c =================================================================== --- linux.orig/arch/mips/gt64120/ev64120/irq.c +++ linux/arch/mips/gt64120/ev64120/irq.c @@ -48,7 +48,7 @@ asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; if (pending & STATUSF_IP4) /* int2 hardware line (timer) */ do_IRQ(4); Index: linux/arch/mips/gt64120/wrppmc/irq.c =================================================================== --- linux.orig/arch/mips/gt64120/wrppmc/irq.c +++ linux/arch/mips/gt64120/wrppmc/irq.c @@ -32,7 +32,7 @@ asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; if (pending & STATUSF_IP7) do_IRQ(WRPPMC_MIPS_TIMER_IRQ); /* CPU Compare/Count internal timer */ Index: linux/arch/mips/jazz/irq.c =================================================================== --- linux.orig/arch/mips/jazz/irq.c +++ linux/arch/mips/jazz/irq.c @@ -122,7 +122,7 @@ static void ll_local_dev(void) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_cause() & read_c0_status() & ST0_IM; + unsigned int pending = read_c0_cause() & read_c0_status(); if (pending & IE_IRQ5) write_c0_compare(0); Index: linux/arch/mips/kernel/Makefile =================================================================== --- linux.orig/arch/mips/kernel/Makefile +++ linux/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o topology.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -14,6 +14,8 @@ binfmt_irix-objs := irixelf.o irixinv.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_MODULES) += mips_ksyms.o module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux/arch/mips/kernel/asm-offsets.c 
=================================================================== --- linux.orig/arch/mips/kernel/asm-offsets.c +++ linux/arch/mips/kernel/asm-offsets.c @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include Index: linux/arch/mips/kernel/entry.S =================================================================== --- linux.orig/arch/mips/kernel/entry.S +++ linux/arch/mips/kernel/entry.S @@ -21,6 +21,8 @@ #endif #ifndef CONFIG_PREEMPT #define resume_kernel restore_all #else #define __ret_from_irq ret_from_exception @@ -41,7 +43,7 @@ FEXPORT(__ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -51,7 +53,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -61,7 +65,9 @@ need_resched: LONG_L t0, PT_STATUS(sp) # Interrupts off? andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -69,7 +75,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -136,19 +142,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -164,7 +172,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux/arch/mips/kernel/genex.S =================================================================== --- linux.orig/arch/mips/kernel/genex.S +++ linux/arch/mips/kernel/genex.S @@ -181,13 +181,13 @@ NESTED(except_vec_vi, 0, sp) * during service by SMTC kernel, we also want to * pass the IM value to be cleared.
*/ -EXPORT(except_vec_vi_mori) +FEXPORT(except_vec_vi_mori) ori a0, $0, 0 #endif /* CONFIG_MIPS_MT_SMTC */ -EXPORT(except_vec_vi_lui) +FEXPORT(except_vec_vi_lui) lui v0, 0 /* Patched */ j except_vec_vi_handler -EXPORT(except_vec_vi_ori) +FEXPORT(except_vec_vi_ori) ori v0, 0 /* Patched */ .set pop END(except_vec_vi) @@ -220,7 +220,17 @@ NESTED(except_vec_vi_handler, 0, sp) _ehb #endif /* CONFIG_MIPS_MT_SMTC */ CLI +#ifdef CONFIG_TRACE_IRQFLAGS + move s0, v0 +#ifdef CONFIG_MIPS_MT_SMTC + move s1, a0 +#endif TRACE_IRQS_OFF +#ifdef CONFIG_MIPS_MT_SMTC + move a0, s1 +#endif + move v0, s0 +#endif LONG_L s0, TI_REGS($28) LONG_S sp, TI_REGS($28) Index: linux/arch/mips/kernel/i8259.c =================================================================== --- linux.orig/arch/mips/kernel/i8259.c +++ linux/arch/mips/kernel/i8259.c @@ -29,9 +29,9 @@ */ static int i8259A_auto_eoi = -1; -DEFINE_SPINLOCK(i8259A_lock); /* some platforms call this... */ void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", Index: linux/arch/mips/kernel/kspd.c =================================================================== --- linux.orig/arch/mips/kernel/kspd.c +++ linux/arch/mips/kernel/kspd.c @@ -191,6 +191,8 @@ void sp_work_handle_request(void) struct mtsp_syscall_generic generic; struct mtsp_syscall_ret ret; struct kspd_notifications *n; + unsigned long written; + mm_segment_t old_fs; struct timeval tv; struct timezone tz; int cmd; @@ -201,7 +203,11 @@ void sp_work_handle_request(void) ret.retval = -1; - if (!rtlx_read(RTLX_CHANNEL_SYSIO, &sc, sizeof(struct mtsp_syscall), 0)) { + old_fs = get_fs(); + set_fs(KERNEL_DS); + + if (!rtlx_read(RTLX_CHANNEL_SYSIO, &sc, sizeof(struct mtsp_syscall))) { + set_fs(old_fs); printk(KERN_ERR "Expected request but nothing to read\n"); return; } @@ -209,7 +215,8 @@ void sp_work_handle_request(void) size = sc.size; if (size) { - if (!rtlx_read(RTLX_CHANNEL_SYSIO, &generic, size, 0)) { + if (!rtlx_read(RTLX_CHANNEL_SYSIO, &generic, size)) { + set_fs(old_fs); printk(KERN_ERR "Expected request but nothing to read\n"); return; } @@ -282,8 +289,11 @@ void sp_work_handle_request(void) if (vpe_getuid(SP_VPE)) sp_setfsuidgid( 0, 0); - if ((rtlx_write(RTLX_CHANNEL_SYSIO, &ret, sizeof(struct mtsp_syscall_ret), 0)) - < sizeof(struct mtsp_syscall_ret)) + old_fs = get_fs(); + set_fs(KERNEL_DS); + written = rtlx_write(RTLX_CHANNEL_SYSIO, &ret, sizeof(ret)); + set_fs(old_fs); + if (written < sizeof(ret)) printk("KSPD: sp_work_handle_request failed to send to SP\n"); } Index: linux/arch/mips/kernel/linux32.c =================================================================== --- linux.orig/arch/mips/kernel/linux32.c +++ linux/arch/mips/kernel/linux32.c @@ -311,6 +311,8 @@ asmlinkage int sys32_sched_rr_get_interv return ret; } +#ifdef CONFIG_SYSVIPC + asmlinkage long sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth) { @@ -368,6 +370,16 @@ sys32_ipc (u32 call, int first, int seco return err; } +#else + +asmlinkage long +sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth) +{ + return -ENOSYS; +} + +#endif /* CONFIG_SYSVIPC */ + #ifdef CONFIG_MIPS32_N32 asmlinkage long sysn32_semctl(int semid, int semnum, int cmd, u32 arg) { Index: linux/arch/mips/kernel/module.c =================================================================== --- linux.orig/arch/mips/kernel/module.c +++ linux/arch/mips/kernel/module.c @@ -40,7 +40,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; 
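Note: the kspd.c hunks above wrap the rtlx calls in set_fs(KERNEL_DS), since rtlx_read()/rtlx_write() now copy through the user-access helpers while kspd passes kernel buffers. The pattern, as a minimal sketch (example_kernel_read is a hypothetical wrapper; rtlx_read is the function patched below):

	#include <asm/uaccess.h>

	static ssize_t example_kernel_read(int chan, void *kbuf, size_t len)
	{
		mm_segment_t old_fs = get_fs();
		ssize_t ret;

		set_fs(KERNEL_DS);	/* let copy_to_user() accept a kernel pointer */
		ret = rtlx_read(chan, (void __user *)kbuf, len);
		set_fs(old_fs);		/* restore the caller's address limit */

		return ret;
	}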
static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux/arch/mips/kernel/process.c =================================================================== --- linux.orig/arch/mips/kernel/process.c +++ linux/arch/mips/kernel/process.c @@ -50,7 +50,7 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_SMTC_IDLE_HOOK_DEBUG extern void smtc_idle_loop_hook(void); @@ -59,9 +59,11 @@ ATTRIB_NORET void cpu_idle(void) if (cpu_wait) (*cpu_wait)(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux/arch/mips/kernel/r2300_switch.S =================================================================== --- linux.orig/arch/mips/kernel/r2300_switch.S +++ linux/arch/mips/kernel/r2300_switch.S @@ -49,8 +49,7 @@ LEAF(resume) #ifndef CONFIG_CPU_HAS_LLSC sw zero, ll_bit #endif - mfc0 t1, CP0_STATUS - sw t1, THREAD_STATUS(a0) + mfc0 t2, CP0_STATUS cpu_save_nonscratch a0 sw ra, THREAD_REG31(a0) @@ -60,8 +59,8 @@ LEAF(resume) lw t3, TASK_THREAD_INFO(a0) lw t0, TI_FLAGS(t3) li t1, _TIF_USEDFPU - and t2, t0, t1 - beqz t2, 1f + and t1, t0 + beqz t1, 1f nor t1, zero, t1 and t0, t0, t1 @@ -74,10 +73,13 @@ LEAF(resume) li t1, ~ST0_CU1 and t0, t0, t1 sw t0, ST_OFF(t3) + /* clear thread_struct CU1 bit */ + and t2, t1 fpu_save_single a0, t0 # clobbers t0 1: + sw t2, THREAD_STATUS(a0) /* * The order of restoring the registers takes care of the race * updating $28, $29 and kernelsp without disabling ints. Index: linux/arch/mips/kernel/r4k_fpu.S =================================================================== --- linux.orig/arch/mips/kernel/r4k_fpu.S +++ linux/arch/mips/kernel/r4k_fpu.S @@ -114,14 +114,6 @@ LEAF(_save_fp_context32) */ LEAF(_restore_fp_context) EX lw t0, SC_FPC_CSR(a0) - - /* Fail if the CSR has exceptions pending */ - srl t1, t0, 5 - and t1, t0 - andi t1, 0x1f << 7 - bnez t1, fault - nop - #ifdef CONFIG_64BIT EX ldc1 $f1, SC_FPREGS+8(a0) EX ldc1 $f3, SC_FPREGS+24(a0) @@ -165,14 +157,6 @@ LEAF(_restore_fp_context) LEAF(_restore_fp_context32) /* Restore an o32 sigcontext. 
*/ EX lw t0, SC32_FPC_CSR(a0) - - /* Fail if the CSR has exceptions pending */ - srl t1, t0, 5 - and t1, t0 - andi t1, 0x1f << 7 - bnez t1, fault - nop - EX ldc1 $f0, SC32_FPREGS+0(a0) EX ldc1 $f2, SC32_FPREGS+16(a0) EX ldc1 $f4, SC32_FPREGS+32(a0) Index: linux/arch/mips/kernel/r4k_switch.S =================================================================== --- linux.orig/arch/mips/kernel/r4k_switch.S +++ linux/arch/mips/kernel/r4k_switch.S @@ -48,8 +48,7 @@ #ifndef CONFIG_CPU_HAS_LLSC sw zero, ll_bit #endif - mfc0 t1, CP0_STATUS - LONG_S t1, THREAD_STATUS(a0) + mfc0 t2, CP0_STATUS cpu_save_nonscratch a0 LONG_S ra, THREAD_REG31(a0) @@ -59,8 +58,8 @@ PTR_L t3, TASK_THREAD_INFO(a0) LONG_L t0, TI_FLAGS(t3) li t1, _TIF_USEDFPU - and t2, t0, t1 - beqz t2, 1f + and t1, t0 + beqz t1, 1f nor t1, zero, t1 and t0, t0, t1 @@ -73,10 +72,13 @@ li t1, ~ST0_CU1 and t0, t0, t1 LONG_S t0, ST_OFF(t3) + /* clear thread_struct CU1 bit */ + and t2, t1 fpu_save_double a0 t0 t1 # c0_status passed in t0 # clobbers t1 1: + LONG_S t2, THREAD_STATUS(a0) /* * The order of restoring the registers takes care of the race Index: linux/arch/mips/kernel/rtlx.c =================================================================== --- linux.orig/arch/mips/kernel/rtlx.c +++ linux/arch/mips/kernel/rtlx.c @@ -54,6 +54,7 @@ static struct chan_waitqueues { wait_queue_head_t rt_queue; wait_queue_head_t lx_queue; atomic_t in_open; + struct mutex mutex; } channel_wqs[RTLX_CHANNELS]; static struct irqaction irq; @@ -146,7 +147,7 @@ static void stopping(int vpe) int rtlx_open(int index, int can_sleep) { - volatile struct rtlx_info **p; + struct rtlx_info **p; struct rtlx_channel *chan; enum rtlx_state state; int ret = 0; @@ -179,13 +180,24 @@ int rtlx_open(int index, int can_sleep) } } + smp_rmb(); if (*p == NULL) { if (can_sleep) { - __wait_event_interruptible(channel_wqs[index].lx_queue, - *p != NULL, - ret); - if (ret) + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait(&channel_wqs[index].lx_queue, &wait, TASK_INTERRUPTIBLE); + smp_rmb(); + if (*p != NULL) + break; + if (!signal_pending(current)) { + schedule(); + continue; + } + ret = -ERESTARTSYS; goto out_fail; + } + finish_wait(&channel_wqs[index].lx_queue, &wait); } else { printk(" *vpe_get_shared is NULL.
" "Has an SP program been loaded?\n"); @@ -277,56 +289,52 @@ unsigned int rtlx_write_poll(int index) return write_spacefree(chan->rt_read, chan->rt_write, chan->buffer_size); } -static inline void copy_to(void *dst, void *src, size_t count, int user) -{ - if (user) - copy_to_user(dst, src, count); - else - memcpy(dst, src, count); -} - -static inline void copy_from(void *dst, void *src, size_t count, int user) -{ - if (user) - copy_from_user(dst, src, count); - else - memcpy(dst, src, count); -} - -ssize_t rtlx_read(int index, void *buff, size_t count, int user) +ssize_t rtlx_read(int index, void __user *buff, size_t count) { - size_t fl = 0L; + size_t lx_write, fl = 0L; struct rtlx_channel *lx; + unsigned long failed; if (rtlx == NULL) return -ENOSYS; lx = &rtlx->channel[index]; + mutex_lock(&channel_wqs[index].mutex); + smp_rmb(); + lx_write = lx->lx_write; + /* find out how much in total */ count = min(count, - (size_t)(lx->lx_write + lx->buffer_size - lx->lx_read) + (size_t)(lx_write + lx->buffer_size - lx->lx_read) % lx->buffer_size); /* then how much from the read pointer onwards */ - fl = min( count, (size_t)lx->buffer_size - lx->lx_read); + fl = min(count, (size_t)lx->buffer_size - lx->lx_read); - copy_to(buff, &lx->lx_buffer[lx->lx_read], fl, user); + failed = copy_to_user(buff, lx->lx_buffer + lx->lx_read, fl); + if (failed) + goto out; /* and if there is anything left at the beginning of the buffer */ - if ( count - fl ) - copy_to (buff + fl, lx->lx_buffer, count - fl, user); + if (count - fl) + failed = copy_to_user(buff + fl, lx->lx_buffer, count - fl); + +out: + count -= failed; - /* update the index */ - lx->lx_read += count; - lx->lx_read %= lx->buffer_size; + smp_wmb(); + lx->lx_read = (lx->lx_read + count) % lx->buffer_size; + smp_wmb(); + mutex_unlock(&channel_wqs[index].mutex); return count; } -ssize_t rtlx_write(int index, void *buffer, size_t count, int user) +ssize_t rtlx_write(int index, const void __user *buffer, size_t count) { struct rtlx_channel *rt; + size_t rt_read; + unsigned long failed; size_t fl; if (rtlx == NULL) @@ -334,24 +342,35 @@ ssize_t rtlx_write(int index, void *buff rt = &rtlx->channel[index]; + mutex_lock(&channel_wqs[index].mutex); + smp_rmb(); + rt_read = rt->rt_read; + /* total number of bytes to copy */ count = min(count, - (size_t)write_spacefree(rt->rt_read, rt->rt_write, - rt->buffer_size)); + (size_t)write_spacefree(rt_read, rt->rt_write, rt->buffer_size)); /* first bit from write pointer to the end of the buffer, or count */ fl = min(count, (size_t) rt->buffer_size - rt->rt_write); - copy_from (&rt->rt_buffer[rt->rt_write], buffer, fl, user); + failed = copy_from_user(rt->rt_buffer + rt->rt_write, buffer, fl); + if (failed) + goto out; /* if there's any left copy to the beginning of the buffer */ - if( count - fl ) - copy_from (rt->rt_buffer, buffer + fl, count - fl, user); + if (count - fl) { + failed = copy_from_user(rt->rt_buffer, buffer + fl, count - fl); + } - rt->rt_write += count; - rt->rt_write %= rt->buffer_size; +out: + count -= failed; - return(count); + smp_wmb(); + rt->rt_write = (rt->rt_write + count) % rt->buffer_size; + smp_wmb(); + mutex_unlock(&channel_wqs[index].mutex); + + return count; } @@ -403,7 +422,7 @@ static ssize_t file_read(struct file *fi return 0; // -EAGAIN makes cat whinge } - return rtlx_read(minor, buffer, count, 1); + return rtlx_read(minor, buffer, count); } static ssize_t file_write(struct file *file, const char __user * buffer, @@ -429,7 +448,7 @@ static ssize_t file_write(struct file *f return ret; }
- return rtlx_write(minor, (void *)buffer, count, 1); + return rtlx_write(minor, buffer, count); } static const struct file_operations rtlx_fops = { @@ -468,6 +487,7 @@ static int rtlx_module_init(void) init_waitqueue_head(&channel_wqs[i].rt_queue); init_waitqueue_head(&channel_wqs[i].lx_queue); atomic_set(&channel_wqs[i].in_open, 0); + mutex_init(&channel_wqs[i].mutex); dev = device_create(mt_class, NULL, MKDEV(major, i), "%s%d", module_name, i); Index: linux/arch/mips/kernel/scall32-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall32-o32.S +++ linux/arch/mips/kernel/scall32-o32.S @@ -73,7 +73,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-64.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-64.S +++ linux/arch/mips/kernel/scall64-64.S @@ -72,7 +72,7 @@ NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-n32.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-n32.S +++ linux/arch/mips/kernel/scall64-n32.S @@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-o32.S +++ linux/arch/mips/kernel/scall64-o32.S @@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux/arch/mips/kernel/semaphore.c =================================================================== --- linux.orig/arch/mips/kernel/semaphore.c +++ linux/arch/mips/kernel/semaphore.c @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. 
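Note: the rtlx_read()/rtlx_write() rework above is easier to follow with the ring-buffer arithmetic spelled out. A self-contained sketch of the index math (helper names are illustrative, not part of the patch):

	#include <stddef.h>

	/* bytes available to read: with size = 8, read = 6, write = 2 the
	 * data wraps (bytes 6, 7, 0, 1), so (2 + 8 - 6) % 8 = 4 */
	static size_t ring_available(size_t read, size_t write, size_t size)
	{
		return (write + size - read) % size;
	}

	/* first contiguous chunk from the read pointer to the buffer end;
	 * in the example above: min(4, 8 - 6) = 2, the rest wraps around */
	static size_t ring_first_chunk(size_t count, size_t read, size_t size)
	{
		size_t tail = size - read;

		return count < tail ? count : tail;
	}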
*/ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/mips/kernel/signal-common.h =================================================================== --- linux.orig/arch/mips/kernel/signal-common.h +++ linux/arch/mips/kernel/signal-common.h @@ -31,4 +31,7 @@ extern void __user *get_sigframe(struct */ extern int install_sigtramp(unsigned int __user *tramp, unsigned int syscall); +/* Check and clear pending FPU exceptions in saved CSR */ +extern int fpcsr_pending(unsigned int __user *fpcsr); + #endif /* __SIGNAL_COMMON_H */ Index: linux/arch/mips/kernel/signal.c =================================================================== --- linux.orig/arch/mips/kernel/signal.c +++ linux/arch/mips/kernel/signal.c @@ -82,6 +82,7 @@ int setup_sigcontext(struct pt_regs *reg { int err = 0; int i; + unsigned int used_math; err |= __put_user(regs->cp0_epc, &sc->sc_pc); @@ -104,26 +105,53 @@ int setup_sigcontext(struct pt_regs *reg err |= __put_user(rddsp(DSP_MASK), &sc->sc_dsp); } - err |= __put_user(!!used_math(), &sc->sc_used_math); + used_math = !!used_math(); + err |= __put_user(used_math, &sc->sc_used_math); - if (used_math()) { + if (used_math) { /* * Save FPU state to signal context. Signal handler * will "inherit" current FPU state. 
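A note on the DEFINE_RAW_SPINLOCK conversions that recur throughout this patch (here in __sem_update_count()): on PREEMPT_RT a plain spinlock_t becomes a sleeping, priority-inheriting lock, so locks taken from contexts that must not sleep are switched to raw_spinlock_t, which keeps the old spinning behaviour. A sketch of the idea (example_lock and example_atomic_update are hypothetical stand-ins):

	static DEFINE_RAW_SPINLOCK(example_lock);

	static void example_atomic_update(atomic_t *v, int incr)
	{
		unsigned long flags;

		/* stays a true spinlock even on -rt, so this is IRQ-safe */
		spin_lock_irqsave(&example_lock, flags);
		atomic_set(v, atomic_read(v) + incr);
		spin_unlock_irqrestore(&example_lock, flags);
	}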
*/ - preempt_disable(); - - if (!is_fpu_owner()) { - own_fpu(); - restore_fp(current); - } + own_fpu(1); + enable_fp_in_kernel(); err |= save_fp_context(sc); - - preempt_enable(); + disable_fp_in_kernel(); } return err; } +int fpcsr_pending(unsigned int __user *fpcsr) +{ + int err, sig = 0; + unsigned int csr, enabled; + + err = __get_user(csr, fpcsr); + enabled = FPU_CSR_UNI_X | ((csr & FPU_CSR_ALL_E) << 5); + /* + * If the signal handler set some FPU exceptions, clear it and + * send SIGFPE. + */ + if (csr & enabled) { + csr &= ~enabled; + err |= __put_user(csr, fpcsr); + sig = SIGFPE; + } + return err ?: sig; +} + +static int +check_and_restore_fp_context(struct sigcontext __user *sc) +{ + int err, sig; + + err = sig = fpcsr_pending(&sc->sc_fpc_csr); + if (err > 0) + err = 0; + err |= restore_fp_context(sc); + return err ?: sig; +} + int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { unsigned int used_math; @@ -157,19 +185,18 @@ int restore_sigcontext(struct pt_regs *r err |= __get_user(used_math, &sc->sc_used_math); conditional_used_math(used_math); - preempt_disable(); - - if (used_math()) { + if (used_math) { /* restore fpu context if we have used it before */ - own_fpu(); - err |= restore_fp_context(sc); + own_fpu(0); + enable_fp_in_kernel(); + if (!err) + err = check_and_restore_fp_context(sc); + disable_fp_in_kernel(); } else { /* signal handler may have used FPU. Give it up. */ - lose_fpu(); + lose_fpu(0); } - preempt_enable(); - return err; } @@ -332,6 +359,7 @@ asmlinkage void sys_sigreturn(nabi_no_re { struct sigframe __user *frame; sigset_t blocked; + int sig; frame = (struct sigframe __user *) regs.regs[29]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -345,8 +373,11 @@ asmlinkage void sys_sigreturn(nabi_no_re recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - if (restore_sigcontext(&regs, &frame->sf_sc)) + sig = restore_sigcontext(&regs, &frame->sf_sc); + if (sig < 0) goto badframe; + else if (sig) + force_sig(sig, current); /* * Don't let your children do this ... @@ -368,6 +399,7 @@ asmlinkage void sys_rt_sigreturn(nabi_no struct rt_sigframe __user *frame; sigset_t set; stack_t st; + int sig; frame = (struct rt_sigframe __user *) regs.regs[29]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -381,8 +413,11 @@ asmlinkage void sys_rt_sigreturn(nabi_no recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - if (restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext)) + sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); + if (sig < 0) goto badframe; + else if (sig) + force_sig(sig, current); if (__copy_from_user(&st, &frame->rs_uc.uc_stack, sizeof(st))) goto badframe; @@ -561,6 +596,10 @@ static void do_signal(struct pt_regs *re siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode.
Just return without doing anything Index: linux/arch/mips/kernel/signal32.c =================================================================== --- linux.orig/arch/mips/kernel/signal32.c +++ linux/arch/mips/kernel/signal32.c @@ -181,6 +181,7 @@ static int setup_sigcontext32(struct pt_ { int err = 0; int i; + u32 used_math; err |= __put_user(regs->cp0_epc, &sc->sc_pc); @@ -200,26 +201,34 @@ static int setup_sigcontext32(struct pt_ err |= __put_user(mflo3(), &sc->sc_lo3); } - err |= __put_user(!!used_math(), &sc->sc_used_math); + used_math = !!used_math(); + err |= __put_user(used_math, &sc->sc_used_math); - if (used_math()) { + if (used_math) { /* * Save FPU state to signal context. Signal handler * will "inherit" current FPU state. */ - preempt_disable(); - - if (!is_fpu_owner()) { - own_fpu(); - restore_fp(current); - } + own_fpu(1); + enable_fp_in_kernel(); err |= save_fp_context32(sc); - - preempt_enable(); + disable_fp_in_kernel(); } return err; } +static int +check_and_restore_fp_context32(struct sigcontext32 __user *sc) +{ + int err, sig; + + err = sig = fpcsr_pending(&sc->sc_fpc_csr); + if (err > 0) + err = 0; + err |= restore_fp_context32(sc); + return err ?: sig; +} + static int restore_sigcontext32(struct pt_regs *regs, struct sigcontext32 __user *sc) { @@ -250,19 +259,18 @@ static int restore_sigcontext32(struct p err |= __get_user(used_math, &sc->sc_used_math); conditional_used_math(used_math); - preempt_disable(); - - if (used_math()) { + if (used_math) { /* restore fpu context if we have used it before */ - own_fpu(); - err |= restore_fp_context32(sc); + own_fpu(0); + enable_fp_in_kernel(); + if (!err) + err = check_and_restore_fp_context32(sc); + disable_fp_in_kernel(); } else { /* signal handler may have used FPU. Give it up. */ - lose_fpu(); + lose_fpu(0); } - preempt_enable(); - return err; } @@ -508,6 +516,7 @@ asmlinkage void sys32_sigreturn(nabi_no_ { struct sigframe32 __user *frame; sigset_t blocked; + int sig; frame = (struct sigframe32 __user *) regs.regs[29]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -521,8 +530,11 @@ asmlinkage void sys32_sigreturn(nabi_no_ recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - if (restore_sigcontext32(&regs, &frame->sf_sc)) + sig = restore_sigcontext32(&regs, &frame->sf_sc); + if (sig < 0) goto badframe; + else if (sig) + force_sig(sig, current); /* * Don't let your children do this ... @@ -545,6 +557,7 @@ asmlinkage void sys32_rt_sigreturn(nabi_ sigset_t set; stack_t st; s32 sp; + int sig; frame = (struct rt_sigframe32 __user *) regs.regs[29]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -558,8 +571,11 @@ asmlinkage void sys32_rt_sigreturn(nabi_ recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - if (restore_sigcontext32(&regs, &frame->rs_uc.uc_mcontext)) + sig = restore_sigcontext32(&regs, &frame->rs_uc.uc_mcontext); + if (sig < 0) goto badframe; + else if (sig) + force_sig(sig, current); /* The ucontext contains a stack32_t, so we must convert!
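Note: the fpcsr_pending() helper introduced above relies on the layout of the MIPS FCSR: the enable bits sit at 11:7 and the matching cause bits at 16:12, so shifting the enables left by 5 aligns each enable with its cause bit, while FPU_CSR_UNI_X (cause bit 17, unimplemented operation) is always trapped. A worked sketch (constant values follow the asm/fpu.h definitions, assumed here):

	/* with only the overflow enable set (bit 9), the mask gains cause
	 * bit 14; a saved CSR with that cause bit set yields SIGFPE */
	static unsigned int example_pending_mask(unsigned int csr)
	{
		return FPU_CSR_UNI_X | ((csr & FPU_CSR_ALL_E) << 5);
	}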
*/ if (__get_user(sp, &frame->rs_uc.uc_stack.ss_sp)) @@ -669,6 +685,10 @@ static int setup_rt_frame_32(struct k_si if (err) goto give_sigsegv; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * Arguments to signal handler: * Index: linux/arch/mips/kernel/signal_n32.c =================================================================== --- linux.orig/arch/mips/kernel/signal_n32.c +++ linux/arch/mips/kernel/signal_n32.c @@ -127,6 +127,7 @@ asmlinkage void sysn32_rt_sigreturn(nabi sigset_t set; stack_t st; s32 sp; + int sig; frame = (struct rt_sigframe_n32 __user *) regs.regs[29]; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -140,8 +141,11 @@ asmlinkage void sysn32_rt_sigreturn(nabi recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); - if (restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext)) + sig = restore_sigcontext(&regs, &frame->rs_uc.uc_mcontext); + if (sig < 0) goto badframe; + else if (sig) + force_sig(sig, current); /* The ucontext contains a stack32_t, so we must convert! */ if (__get_user(sp, &frame->rs_uc.uc_stack.ss_sp)) Index: linux/arch/mips/kernel/smp.c =================================================================== --- linux.orig/arch/mips/kernel/smp.c +++ linux/arch/mips/kernel/smp.c @@ -98,7 +98,22 @@ asmlinkage void start_secondary(void) cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. + */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -286,6 +301,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -343,6 +360,7 @@ static inline void smp_on_each_tlb(void void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, (void *)mm); @@ -352,6 +370,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -375,6 +394,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -388,6 +409,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -418,6 +440,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -430,6 +454,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux/arch/mips/kernel/time.c =================================================================== --- linux.orig/arch/mips/kernel/time.c +++
linux/arch/mips/kernel/time.c @@ -10,6 +10,11 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. + * + * This implementation of High Res Timers uses two timers. One is the system + * timer. The second is used for the high res timers. The high res timers + * require the CPU to have count/compare registers. The mips_set_next_event() + * function schedules the next high res timer interrupt. */ #include #include @@ -23,6 +28,7 @@ #include #include #include +#include #include #include @@ -47,7 +53,27 @@ /* * forward reference */ -DEFINE_SPINLOCK(rtc_lock); +DEFINE_RAW_SPINLOCK(rtc_lock); + +/* any missed timer interrupts */ +int missed_timer_count; + +#ifdef CONFIG_HIGH_RES_TIMERS +static void mips_set_next_event(unsigned long evt); +static void mips_set_mode(int mode, void *priv); + +static struct clock_event lapic_clockevent = { + .name = "mips clockevent interface", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_next_event = mips_set_next_event, +}; +#endif /* * By default we provide the null RTC ops @@ -56,6 +82,129 @@ static unsigned long null_rtc_get_time(v { return mktime(2000, 1, 1, 0, 0, 0); } +#ifdef CONFIG_SMP +/* + * We have to synchronize the master CPU with all the slave CPUs + */ +static atomic_t cpus_started; +static atomic_t cpus_ready; +static atomic_t cpus_count; +/* + * Master processor inits + */ +static void sync_cpus_init(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + atomic_set(&cpus_ready, v); + mb(); +} + +/* + * Called by the master processor + */ +static void sync_cpus_master(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + /* Wait here till all other CPUs are now ready */ + while (atomic_read(&cpus_count) != (num_online_cpus() -1) ) + mb(); + atomic_set(&cpus_ready, v); + mb(); +} +/* + * Called by the slave processors + */ +static void sync_cpus_slave(int v) +{ + /* Check if the master has been through this */ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); + while (atomic_read(&cpus_ready) != v) + mb(); +} +/* + * Called by the slave CPUs when done syncing the count register + * with the master processor + */ +static void sync_cpus_slave_exit(int v) +{ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); +} + +#define LOOPS 100 +static u32 c0_count[NR_CPUS]; /* Count register per CPU */ +static u32 c[NR_CPUS][LOOPS + 1]; /* Count register per CPU per loop for syncing */ + +/* + * Slave processors execute this via IPI + */ +static void sync_c0_count_slave(void *info) +{ + int cpus = 1, loop, prev_count = 0, cpu = smp_processor_id(); + unsigned long flags; + u32 diff_count; /* CPU count registers are 32-bit */ + local_irq_save(flags); + + for(loop = 0; loop <= LOOPS; loop++) { + /* Sync with the Master processor */ + sync_cpus_slave(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + sync_cpus_slave(cpus++); + diff_count = c0_count[0] - c0_count[cpu]; + diff_count += prev_count; + diff_count += read_c0_count(); + write_c0_count(diff_count); + prev_count = (prev_count >> 1) + + ((int)(c0_count[0] - c0_count[cpu]) >> 1); + } + + /* Slave processor is done syncing count register with Master */ + sync_cpus_slave_exit(cpus++); + printk("SMP: Slave processor %d done syncing 
count \n", cpu); + local_irq_restore(flags); +} + +/* + * Master kicks off the syncing process + */ +void sync_c0_count_master(void) +{ + int cpus = 0, loop, cpu = smp_processor_id(); + unsigned long flags; + + printk("SMP: Starting to sync the c0 count register ... \n"); + sync_cpus_init(cpus++); + + /* Kick off the slave processors to also start the syncing process */ + smp_call_function(sync_c0_count_slave, NULL, 0, 0); + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Wait for all the CPUs here */ + sync_cpus_master(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + /* Do syncing once more */ + sync_cpus_master(cpus++); + } + sync_cpus_master(cpus++); + local_irq_restore(flags); + + printk("SMP: Syncing process completed across CPUs ... \n"); +} +#endif /* CONFIG_SMP */ static int null_rtc_set_time(unsigned long sec) { @@ -66,19 +215,30 @@ unsigned long (*rtc_mips_get_time)(void) int (*rtc_mips_set_time)(unsigned long) = null_rtc_set_time; int (*rtc_mips_set_mmss)(unsigned long); - /* how many counter cycles in a jiffy */ static unsigned long cycles_per_jiffy __read_mostly; +static unsigned long hrt_cycles_per_jiffy __read_mostly; + + /* expirelo is the count value for next CPU timer interrupt */ static unsigned int expirelo; - /* * Null timer ack for systems not needing one (e.g. i8254). */ static void null_timer_ack(void) { /* nothing */ } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Set the next event + */ +static void mips_set_next_event(unsigned long evt) +{ + write_c0_compare(read_c0_count() + evt); +} +#endif + /* * Null high precision timer functions for systems lacking one. */ @@ -95,13 +255,13 @@ static void c0_timer_ack(void) unsigned int count; /* Ack this timer interrupt and set the next one. */ - expirelo += cycles_per_jiffy; + expirelo += hrt_cycles_per_jiffy; write_c0_compare(expirelo); - /* Check to see if we have missed any timer interrupts. */ - while (((count = read_c0_count()) - expirelo) < 0x7fffffff) { - /* missed_timer_count++; */ - expirelo = count + cycles_per_jiffy; + count = read_c0_count(); + if ((count - expirelo) < 0x7fffffff) { + /* missed_timer_count++; */ + expirelo = count + hrt_cycles_per_jiffy; write_c0_compare(expirelo); } } @@ -127,6 +287,29 @@ void (*mips_timer_ack)(void); /* last time when xtime and rtc are sync'ed up */ static long last_rtc_update; +unsigned long read_persistent_clock(void) +{ + unsigned long sec; + sec = rtc_mips_get_time(); + return sec; +} + +void sync_persistent_clock(struct timespec ts) +{ + if (ntp_synced() && + xtime.tv_sec > last_rtc_update + 660 && + (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && + (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { + if (rtc_mips_set_mmss(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } +} + /* * local_timer_interrupt() does profiling and process accounting * on a per-CPU basis. @@ -160,7 +343,7 @@ irqreturn_t timer_interrupt(int irq, voi /* * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. rtc_mips_set_time() has to be + * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be called as close as possible to 500 ms before the new second starts.
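For reference, the c0 count/compare programming that c0_timer_ack() above performs, with the missed-interrupt test made explicit; the unsigned subtraction deliberately exploits 32-bit wraparound (a sketch mirroring the patched code, not additional code):

	static void example_ack_and_rearm(unsigned int cycles)
	{
		unsigned int count;

		expirelo += cycles;		/* next deadline */
		write_c0_compare(expirelo);

		count = read_c0_count();
		/* (count - expirelo) < 0x7fffffff iff count is already past
		 * expirelo, i.e. the interrupt just programmed was missed */
		if ((count - expirelo) < 0x7fffffff) {
			expirelo = count + cycles;
			write_c0_compare(expirelo);
		}
	}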
*/ if (ntp_synced() && @@ -199,6 +382,15 @@ int (*perf_irq)(void) = null_perf_irq; EXPORT_SYMBOL(null_perf_irq); EXPORT_SYMBOL(perf_irq); +#ifdef CONFIG_HIGH_RES_TIMERS +void event_timer_handler(struct pt_regs *regs) +{ + c0_timer_ack(); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +} +#endif + asmlinkage void ll_timer_interrupt(int irq) { int r2 = cpu_has_mips_r2; @@ -212,6 +404,15 @@ asmlinkage void ll_timer_interrupt(int i * performance counter interrupt was pending, so we have to run the * performance counter interrupt handler anyway. */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Run the event handler + */ + if (!r2 || (read_c0_cause() & (1 << 26))) + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +#endif + if (!r2 || (read_c0_cause() & (1 << 26))) if (perf_irq()) goto out; @@ -244,7 +445,7 @@ asmlinkage void ll_local_timer_interrupt * b) (optional) calibrate and set the mips_hpt_frequency * (only needed if you intended to use cpu counter as timer interrupt * source) - * 2) setup xtime based on rtc_mips_get_time(). + * 2) setup xtime based on rtc_get_time(). * 3) calculate a couple of cached variables for later usage * 4) plat_timer_setup() - * a) (optional) over-write any choices made above by time_init(). @@ -258,7 +459,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_NODELAY | IRQF_DISABLED, .name = "timer", }; @@ -335,6 +536,9 @@ static void __init init_mips_clocksource void __init time_init(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + u64 temp; +#endif if (board_time_init) board_time_init(); @@ -378,6 +582,12 @@ void __init time_init(void) if (!mips_hpt_frequency) mips_hpt_frequency = calibrate_hpt(); +#ifdef CONFIG_HIGH_RES_TIMERS + hrt_cycles_per_jiffy = ( (CONFIG_CPU_SPEED * 1000000) + HZ / 2) / HZ; +#else + hrt_cycles_per_jiffy = cycles_per_jiffy; +#endif + /* Report the high precision timer rate for a reference. */ printk("Using %u.%03u MHz high precision timer.\n", ((mips_hpt_frequency + 500) / 1000) / 1000, Index: linux/arch/mips/kernel/traps.c =================================================================== --- linux.orig/arch/mips/kernel/traps.c +++ linux/arch/mips/kernel/traps.c @@ -309,7 +309,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { @@ -610,16 +610,6 @@ asmlinkage void do_fpe(struct pt_regs *r if (fcr31 & FPU_CSR_UNI_X) { int sig; - preempt_disable(); - -#ifdef CONFIG_PREEMPT - if (!is_fpu_owner()) { - /* We might lose fpu before disabling preempt... */ - own_fpu(); - BUG_ON(!used_math()); - restore_fp(current); - } -#endif /* * Unimplemented operation exception. If we've got the full * software emulator on-board, let's use it... @@ -630,18 +620,12 @@ asmlinkage void do_fpe(struct pt_regs *r * register operands before invoking the emulator, which seems * a bit extreme for what should be an infrequent event. */ - save_fp(current); /* Ensure 'resume' not overwrite saved fp context again. */ - lose_fpu(); - - preempt_enable(); + lose_fpu(1); /* Run the emulator */ sig = fpu_emulator_cop1Handler (regs, &current->thread.fpu, 1); - preempt_disable(); - - own_fpu(); /* Using the FPU again. */ /* * We can't allow the emulated instruction to leave any of * the cause bit set in $fcr31.
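The do_fpe() rework in the surrounding hunks reduces the emulation path to a save/emulate/restore sequence under the new own_fpu(int restore)/lose_fpu(int save) interface. In outline (a sketch of the patched flow, not additional code):

	lose_fpu(1);	/* save the live FPU state, give up the hardware FPU */

	sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 1);

	/* the emulated instruction must not leave cause bits set */
	current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X;

	own_fpu(1);	/* reacquire the FPU and restore the saved state */

	if (sig)
		force_sig(sig, current);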
@@ -649,9 +633,7 @@ asmlinkage void do_fpe(struct pt_regs *r current->thread.fpu.fcr31 &= ~FPU_CSR_ALL_X; /* Restore the hardware register state */ - restore_fp(current); - - preempt_enable(); + own_fpu(1); /* Using the FPU again. */ /* If something went wrong, signal */ if (sig) @@ -775,12 +757,11 @@ asmlinkage void do_cpu(struct pt_regs *r { unsigned int cpid; - die_if_kernel("do_cpu invoked from kernel context!", regs); - cpid = (regs->cp0_cause >> CAUSEB_CE) & 3; switch (cpid) { case 0: + die_if_kernel("do_cpu invoked from kernel context!", regs); if (!cpu_has_llsc) if (!simulate_llsc(regs)) return; @@ -791,21 +772,30 @@ asmlinkage void do_cpu(struct pt_regs *r break; case 1: - preempt_disable(); - - own_fpu(); - if (used_math()) { /* Using the FPU again. */ - restore_fp(current); - } else { /* First time FPU user. */ + if (!test_thread_flag(TIF_ALLOW_FP_IN_KERNEL)) + die_if_kernel("do_cpu invoked from kernel context!", + regs); + if (used_math()) /* Using the FPU again. */ + own_fpu(1); + else { /* First time FPU user. */ init_fpu(); set_used_math(); } - if (cpu_has_fpu) { - preempt_enable(); + if (raw_cpu_has_fpu) { + if (test_thread_flag(TIF_ALLOW_FP_IN_KERNEL)) { + local_irq_disable(); + if (cpu_has_fpu) + regs->cp0_status |= ST0_CU1; + /* + * We must return without enabling + * interrupts to ensure we keep FPU + * ownership until resume. + */ + return; + } } else { int sig; - preempt_enable(); sig = fpu_emulator_cop1Handler(regs, &current->thread.fpu, 0); if (sig) @@ -1259,26 +1249,26 @@ static inline void mips_srs_init(void) /* * This is used by native signal handling */ -asmlinkage int (*save_fp_context)(struct sigcontext *sc); -asmlinkage int (*restore_fp_context)(struct sigcontext *sc); +asmlinkage int (*save_fp_context)(struct sigcontext __user *sc); +asmlinkage int (*restore_fp_context)(struct sigcontext __user *sc); -extern asmlinkage int _save_fp_context(struct sigcontext *sc); -extern asmlinkage int _restore_fp_context(struct sigcontext *sc); +extern asmlinkage int _save_fp_context(struct sigcontext __user *sc); +extern asmlinkage int _restore_fp_context(struct sigcontext __user *sc); -extern asmlinkage int fpu_emulator_save_context(struct sigcontext *sc); -extern asmlinkage int fpu_emulator_restore_context(struct sigcontext *sc); +extern asmlinkage int fpu_emulator_save_context(struct sigcontext __user *sc); +extern asmlinkage int fpu_emulator_restore_context(struct sigcontext __user *sc); #ifdef CONFIG_SMP -static int smp_save_fp_context(struct sigcontext *sc) +static int smp_save_fp_context(struct sigcontext __user *sc) { - return cpu_has_fpu + return raw_cpu_has_fpu ? _save_fp_context(sc) : fpu_emulator_save_context(sc); } -static int smp_restore_fp_context(struct sigcontext *sc) +static int smp_restore_fp_context(struct sigcontext __user *sc) { - return cpu_has_fpu + return raw_cpu_has_fpu ?
_restore_fp_context(sc) : fpu_emulator_restore_context(sc); } @@ -1306,14 +1296,14 @@ static inline void signal_init(void) /* * This is used by 32-bit signal stuff on the 64-bit kernel */ -asmlinkage int (*save_fp_context32)(struct sigcontext32 *sc); -asmlinkage int (*restore_fp_context32)(struct sigcontext32 *sc); +asmlinkage int (*save_fp_context32)(struct sigcontext32 __user *sc); +asmlinkage int (*restore_fp_context32)(struct sigcontext32 __user *sc); -extern asmlinkage int _save_fp_context32(struct sigcontext32 *sc); -extern asmlinkage int _restore_fp_context32(struct sigcontext32 *sc); +extern asmlinkage int _save_fp_context32(struct sigcontext32 __user *sc); +extern asmlinkage int _restore_fp_context32(struct sigcontext32 __user *sc); -extern asmlinkage int fpu_emulator_save_context32(struct sigcontext32 *sc); -extern asmlinkage int fpu_emulator_restore_context32(struct sigcontext32 *sc); +extern asmlinkage int fpu_emulator_save_context32(struct sigcontext32 __user *sc); +extern asmlinkage int fpu_emulator_restore_context32(struct sigcontext32 __user *sc); static inline void signal32_init(void) { Index: linux/arch/mips/math-emu/kernel_linkage.c =================================================================== --- linux.orig/arch/mips/math-emu/kernel_linkage.c +++ linux/arch/mips/math-emu/kernel_linkage.c @@ -51,7 +51,7 @@ void fpu_emulator_init_fpu(void) * with appropriate macros from uaccess.h */ -int fpu_emulator_save_context(struct sigcontext *sc) +int fpu_emulator_save_context(struct sigcontext __user *sc) { int i; int err = 0; @@ -65,7 +65,7 @@ int fpu_emulator_save_context(struct sig return err; } -int fpu_emulator_restore_context(struct sigcontext *sc) +int fpu_emulator_restore_context(struct sigcontext __user *sc) { int i; int err = 0; @@ -84,7 +84,7 @@ int fpu_emulator_restore_context(struct * This is the o32 version */ -int fpu_emulator_save_context32(struct sigcontext32 *sc) +int fpu_emulator_save_context32(struct sigcontext32 __user *sc) { int i; int err = 0; @@ -98,7 +98,7 @@ int fpu_emulator_save_context32(struct s return err; } -int fpu_emulator_restore_context32(struct sigcontext32 *sc) +int fpu_emulator_restore_context32(struct sigcontext32 __user *sc) { int i; int err = 0; Index: linux/arch/mips/mips-boards/generic/init.c =================================================================== --- linux.orig/arch/mips/mips-boards/generic/init.c +++ linux/arch/mips/mips-boards/generic/init.c @@ -145,7 +145,7 @@ static void __init console_config(void) char parity = '\0', bits = '\0', flow = '\0'; char *s; - if ((strstr(prom_getcmdline(), "console=ttyS")) == NULL) { + if ((strstr(prom_getcmdline(), "console=")) == NULL) { s = prom_getenv("modetty0"); if (s) { while (*s >= '0' && *s <= '9') Index: linux/arch/mips/mm/fault.c =================================================================== --- linux.orig/arch/mips/mm/fault.c +++ linux/arch/mips/mm/fault.c @@ -69,7 +69,7 @@ asmlinkage void do_page_fault(struct pt_ * If we're in an interrupt or have no user * context, we must not take the fault.. 
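The do_page_fault() hunk that follows adds current->pagefault_disabled to the bail-out test: once most of the kernel is preemptible under -rt, in_atomic() alone no longer identifies every context that cannot service a fault. The flag is managed by the -rt pagefault_disable()/pagefault_enable() pair, roughly as follows (a sketch under that assumption):

	pagefault_disable();	/* current->pagefault_disabled++ on -rt */
	ret = __copy_from_user_inatomic(dst, src, len);	/* a fault aborts
							   the copy instead
							   of sleeping */
	pagefault_enable();	/* faults are handled normally again */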
*/ - if (in_atomic() || !mm) + if (in_atomic() || !mm || current->pagefault_disabled) goto bad_area_nosemaphore; down_read(&mm->mmap_sem); Index: linux/arch/mips/mm/init.c =================================================================== --- linux.orig/arch/mips/mm/init.c +++ linux/arch/mips/mm/init.c @@ -59,7 +59,7 @@ #endif /* CONFIG_MIPS_MT_SMTC */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * We have up to 8 empty zeroed pages so we can map one of the right colour Index: linux/arch/mips/momentum/ocelot_c/irq.c =================================================================== --- linux.orig/arch/mips/momentum/ocelot_c/irq.c +++ linux/arch/mips/momentum/ocelot_c/irq.c @@ -64,7 +64,7 @@ extern void ll_cpci_irq(void); asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_cause() & read_c0_status(); + unsigned int pending = read_c0_cause() & read_c0_status() & ST0_IM; if (pending & STATUSF_IP0) do_IRQ(0); Index: linux/arch/mips/philips/pnx8550/common/int.c =================================================================== --- linux.orig/arch/mips/philips/pnx8550/common/int.c +++ linux/arch/mips/philips/pnx8550/common/int.c @@ -83,16 +83,15 @@ static void timer_irqdispatch(int irq) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; if (pending & STATUSF_IP2) hw0_irqdispatch(2); else if (pending & STATUSF_IP7) { if (read_c0_config7() & 0x01c0) timer_irqdispatch(7); - } - - spurious_interrupt(); + } else + spurious_interrupt(); } static inline void modify_cp0_intmask(unsigned clr_mask, unsigned set_mask) Index: linux/arch/mips/qemu/q-smp.c =================================================================== --- linux.orig/arch/mips/qemu/q-smp.c +++ linux/arch/mips/qemu/q-smp.c @@ -46,3 +46,10 @@ void __init prom_prepare_cpus(unsigned i void prom_boot_secondary(int cpu, struct task_struct *idle) { } + +void __init plat_smp_setup(void) +{ +} +void __init plat_prepare_cpus(unsigned int max_cpus) +{ +} Index: linux/arch/mips/sgi-ip22/ip22-int.c =================================================================== --- linux.orig/arch/mips/sgi-ip22/ip22-int.c +++ linux/arch/mips/sgi-ip22/ip22-int.c @@ -237,7 +237,7 @@ extern void indy_8254timer_irq(void); asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause(); /* * First we check for r4k counter/timer IRQ. 
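Several plat_irq_dispatch() implementations in this patch converge on the same idiom: the pending lines are the AND of the cause IP bits with the status IM bits, and masking with ST0_IM (bits 15:8) keeps lines that are pending but masked from being dispatched. A sketch for a hypothetical board (the IRQ numbers are illustrative):

	asmlinkage void example_plat_irq_dispatch(void)
	{
		unsigned int pending = read_c0_cause() & read_c0_status() & ST0_IM;

		if (pending & STATUSF_IP7)
			do_IRQ(MIPS_CPU_IRQ_BASE + 7);	/* c0 count/compare timer */
		else if (pending & STATUSF_IP2)
			do_IRQ(MIPS_CPU_IRQ_BASE + 2);	/* cascaded controller */
		else
			spurious_interrupt();
	}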
Index: linux/arch/mips/sgi-ip32/ip32-irq.c =================================================================== --- linux.orig/arch/mips/sgi-ip32/ip32-irq.c +++ linux/arch/mips/sgi-ip32/ip32-irq.c @@ -454,7 +454,7 @@ static void ip32_irq5(void) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause(); if (likely(pending & IE_IRQ0)) ip32_irq0(); Index: linux/arch/mips/sibyte/cfe/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/cfe/smp.c +++ linux/arch/mips/sibyte/cfe/smp.c @@ -107,4 +107,8 @@ void prom_smp_finish(void) */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } Index: linux/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/irq.c +++ linux/arch/mips/sibyte/sb1250/irq.c @@ -81,7 +81,7 @@ static struct irq_chip sb1250_irq_type = /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -242,7 +242,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, @@ -352,6 +352,10 @@ void __init arch_init_irq(void) #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the rest */ change_c0_status(ST0_IM, imask); @@ -421,7 +425,7 @@ asmlinkage void plat_irq_dispatch(void) * blasting the high 32 bits. 
*/ - pending = read_c0_cause() & read_c0_status(); + pending = read_c0_cause() & read_c0_status() & ST0_IM; #ifdef CONFIG_SIBYTE_SB1250_PROF if (pending & CAUSEF_IP7) /* Cpu performance counter interrupt */ @@ -429,6 +433,10 @@ asmlinkage void plat_irq_dispatch(void) else #endif +#ifdef CONFIG_HIGH_RES_TIMERS + if (pending & CAUSEF_IP7) + event_timer_handler(regs); +#endif if (pending & CAUSEF_IP4) sb1250_timer_interrupt(); Index: linux/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/smp.c +++ linux/arch/mips/sibyte/sb1250/smp.c @@ -59,7 +59,7 @@ void sb1250_smp_finish(void) { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux/arch/mips/sibyte/swarm/setup.c =================================================================== --- linux.orig/arch/mips/sibyte/swarm/setup.c +++ linux/arch/mips/sibyte/swarm/setup.c @@ -131,6 +131,12 @@ void __init plat_mem_setup(void) rtc_mips_set_time = m41t81_set_time; } +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" Index: linux/arch/mips/sni/pcimt.c =================================================================== --- linux.orig/arch/mips/sni/pcimt.c +++ linux/arch/mips/sni/pcimt.c @@ -333,7 +333,7 @@ static void pcimt_hwint3(void) static void sni_pcimt_hwint(void) { - u32 pending = (read_c0_cause() & read_c0_status()); + u32 pending = read_c0_cause() & read_c0_status(); if (pending & C_IRQ5) do_IRQ (MIPS_CPU_IRQ_BASE + 7); Index: linux/arch/mips/sni/pcit.c =================================================================== --- linux.orig/arch/mips/sni/pcit.c +++ linux/arch/mips/sni/pcit.c @@ -271,7 +271,7 @@ static void pcit_hwint0(void) static void sni_pcit_hwint(void) { - u32 pending = (read_c0_cause() & read_c0_status()); + u32 pending = read_c0_cause() & read_c0_status(); if (pending & C_IRQ1) pcit_hwint1(); @@ -285,7 +285,7 @@ static void sni_pcit_hwint(void) static void sni_pcit_hwint_cplus(void) { - u32 pending = (read_c0_cause() & read_c0_status()); + u32 pending = read_c0_cause() & read_c0_status(); if (pending & C_IRQ0) pcit_hwint0(); Index: linux/arch/mips/tx4927/common/tx4927_irq.c =================================================================== --- linux.orig/arch/mips/tx4927/common/tx4927_irq.c +++ linux/arch/mips/tx4927/common/tx4927_irq.c @@ -416,7 +416,7 @@ static int tx4927_irq_nested(void) asmlinkage void plat_irq_dispatch(void) { - unsigned int pending = read_c0_status() & read_c0_cause(); + unsigned int pending = read_c0_status() & read_c0_cause() & ST0_IM; if (pending & STATUSF_IP7) /* cpu timer */ do_IRQ(TX4927_IRQ_CPU_TIMER); Index: linux/arch/powerpc/Kconfig =================================================================== --- linux.orig/arch/powerpc/Kconfig +++ linux/arch/powerpc/Kconfig @@ -26,18 +26,15 @@ config MMU bool default y -config GENERIC_HARDIRQS +config GENERIC_TIME bool default y -config IRQ_PER_CPU +config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM +config IRQ_PER_CPU bool default y @@ -764,6 +761,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" 
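The CONFIG_CPU_SPEED value threaded through the MIPS timer hunks above is in MHz; for the default of 600 with HZ = 1000 the derived constants work out as below (a worked sketch, not new kernel code):

	#define EXAMPLE_CPU_SPEED 600	/* MHz, mirrors CONFIG_CPU_SPEED */
	#define EXAMPLE_HZ 1000

	/* mips_hpt_frequency   = 600 * 1000000              = 600000000 Hz
	 * hrt_cycles_per_jiffy = (600000000 + 500) / 1000   = 600000 cycles;
	 * the "+ HZ / 2" term rounds to the nearest whole cycle count */
	static unsigned long example_cycles_per_jiffy(void)
	{
		return ((EXAMPLE_CPU_SPEED * 1000000UL) + EXAMPLE_HZ / 2) / EXAMPLE_HZ;
	}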
# We optimistically allocate largepages from the VM, so make the limit Index: linux/arch/powerpc/Kconfig.debug =================================================================== --- linux.orig/arch/powerpc/Kconfig.debug +++ linux/arch/powerpc/Kconfig.debug @@ -2,6 +2,10 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL Index: linux/arch/powerpc/boot/Makefile =================================================================== --- linux.orig/arch/powerpc/boot/Makefile +++ linux/arch/powerpc/boot/Makefile @@ -33,6 +33,14 @@ endif BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := inffast.c inflate.c inftrees.c zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -51,7 +59,7 @@ obj-wlib := $(addsuffix .o, $(basename $ obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat)))) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ Index: linux/arch/powerpc/kernel/Makefile =================================================================== --- linux.orig/arch/powerpc/kernel/Makefile +++ linux/arch/powerpc/kernel/Makefile @@ -10,10 +10,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_ppc970.o \ Index: linux/arch/powerpc/kernel/entry_32.S =================================================================== --- linux.orig/arch/powerpc/kernel/entry_32.S +++ linux/arch/powerpc/kernel/entry_32.S @@ -638,7 +638,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -978,3 +978,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. 
The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux/arch/powerpc/kernel/entry_64.S =================================================================== --- linux.orig/arch/powerpc/kernel/entry_64.S +++ linux/arch/powerpc/kernel/entry_64.S @@ -449,7 +449,8 @@ _GLOBAL(ret_from_except_lite) #ifdef CONFIG_PREEMPT clrrdi r9,r1,THREAD_SHIFT /* current_thread_info() */ - li r0,_TIF_NEED_RESCHED /* bits to check */ + li r0,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) + /* bits to check */ ld r3,_MSR(r1) ld r4,TI_FLAGS(r9) /* Move MSR_PR bit in r3 to _TIF_SIGPENDING position in r0 */ @@ -572,17 +573,22 @@ do_work: rotldi r10,r10,16 mtmsrd r10,1 ld r4,TI_FLAGS(r9) - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne 1b b restore user_work: #endif + /* here we are preempting the current task */ + li r0,1 + stb r0,PACASOFTIRQEN(r13) + stb r0,PACAHARDIRQEN(r13) + /* Enable interrupts */ ori r10,r10,MSR_EE mtmsrd r10,1 - andi. r0,r4,_TIF_NEED_RESCHED + andi. r0,r4,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq 1f bl .schedule b .ret_from_except_lite Index: linux/arch/powerpc/kernel/head_64.S =================================================================== --- linux.orig/arch/powerpc/kernel/head_64.S +++ linux/arch/powerpc/kernel/head_64.S @@ -1385,7 +1385,7 @@ END_FW_FTR_SECTION_IFCLR(FW_FEATURE_ISER * handles any interrupts pending at this point. */ ld r3,SOFTE(r1) - bl .local_irq_restore + bl .raw_local_irq_restore b 11f /* Here we have a page fault that hash_page can't handle. 
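In C terms, the -pg preamble documented above amounts to the compiler injecting a profiler call at every function entry before the callee's stack frame exists, which is exactly why _mcount() has to spill the argument registers itself. A userspace model of the gating logic only; mcount_enabled and __trace() mirror the symbols the assembly references, the bodies here are stand-ins:

#include <stdio.h>

static int mcount_enabled = 1;		/* toggled at run time in the patch */

static void __trace(unsigned long eip, unsigned long parent_eip)
{
	printf("call %#lx from %#lx\n", eip, parent_eip);
}

/* What each -pg-instrumented function effectively executes on entry. */
static void mcount_model(unsigned long eip, unsigned long parent_eip)
{
	if (mcount_enabled)
		__trace(eip, parent_eip);
}

int main(void)
{
	mcount_model(0xc00034c8UL, 0xc0001234UL);
	mcount_enabled = 0;	/* tracing off: the preamble becomes a no-op */
	mcount_model(0xc00034c8UL, 0xc0001234UL);
	return 0;
}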
*/ Index: linux/arch/powerpc/kernel/idle.c =================================================================== --- linux.orig/arch/powerpc/kernel/idle.c +++ linux/arch/powerpc/kernel/idle.c @@ -56,7 +56,8 @@ void cpu_idle(void) set_thread_flag(TIF_POLLING_NRFLAG); while (1) { - while (!need_resched() && !cpu_should_die()) { + while (!need_resched() && !need_resched_delayed() && + !cpu_should_die()) { ppc64_runlatch_off(); if (ppc_md.power_save) { @@ -89,7 +90,7 @@ void cpu_idle(void) ppc64_runlatch_on(); if (cpu_should_die()) cpu_die(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); } Index: linux/arch/powerpc/kernel/irq.c =================================================================== --- linux.orig/arch/powerpc/kernel/irq.c +++ linux/arch/powerpc/kernel/irq.c @@ -93,8 +93,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; static inline unsigned long get_hard_enabled(void) @@ -113,7 +111,7 @@ static inline void set_soft_enabled(unsi : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -void local_irq_restore(unsigned long en) +void raw_local_irq_restore(unsigned long en) { /* * get_paca()->soft_enabled = en; @@ -394,7 +392,7 @@ EXPORT_SYMBOL(do_softirq); #ifdef CONFIG_PPC_MERGE static LIST_HEAD(irq_hosts); -static spinlock_t irq_big_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_RAW_SPINLOCK(irq_big_lock); static DEFINE_PER_CPU(unsigned int, irq_radix_reader); static unsigned int irq_radix_writer; struct irq_map_entry irq_map[NR_IRQS]; Index: linux/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux/arch/powerpc/kernel/ppc_ksyms.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -51,7 +50,7 @@ #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(local_irq_restore); +EXPORT_SYMBOL(raw_local_irq_restore); #endif #ifdef CONFIG_PPC32 @@ -179,7 +178,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux/arch/powerpc/kernel/rtas.c =================================================================== --- linux.orig/arch/powerpc/kernel/rtas.c +++ linux/arch/powerpc/kernel/rtas.c @@ -36,7 +36,7 @@ #include struct rtas_t rtas = { - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(lock) }; EXPORT_SYMBOL(rtas); Index: linux/arch/powerpc/kernel/semaphore.c =================================================================== --- linux.orig/arch/powerpc/kernel/semaphore.c +++ linux/arch/powerpc/kernel/semaphore.c @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we 
have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/powerpc/kernel/smp.c =================================================================== --- linux.orig/arch/powerpc/kernel/smp.c +++ linux/arch/powerpc/kernel/smp.c @@ -126,6 +126,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -162,7 +172,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/powerpc/kernel/time.c =================================================================== --- linux.orig/arch/powerpc/kernel/time.c +++ linux/arch/powerpc/kernel/time.c @@ -74,6 +74,9 @@ #endif #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + /* keep track of when we need to update the rtc */ time_t last_rtc_update; #ifdef CONFIG_PPC_ISERIES @@ -116,8 +119,6 @@ EXPORT_SYMBOL_GPL(rtc_lock); u64 tb_to_ns_scale; unsigned tb_to_ns_shift; -struct gettimeofday_struct do_gtod; - extern struct timezone sys_tz; static long timezone_offset; @@ -375,162 +376,8 @@ static __inline__ void timer_check_rtc(v } } -/* - * This version of gettimeofday has microsecond resolution. - */ -static inline void __do_gettimeofday(struct timeval *tv) -{ - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; - - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) - */ - temp_varp = do_gtod.varp; - - /* Sampling the time base must be done after loading - * do_gtod.varp in order to avoid racing with update_gtod. 
- */ - data_barrier(temp_varp); - tb_ticks = get_tb() - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -void do_gettimeofday(struct timeval *tv) -{ - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_jiffy); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000; - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; - } - __do_gettimeofday(tv); -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. - */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) -{ - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. - */ - vdso_data->tb_orig_stamp = new_tb_stamp; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) 
and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - u64 tlen, t2x; - u64 tb, xsec_old, xsec_new; - struct gettimeofday_vars *varp; - - if (__USE_RTC()) - return; - tlen = current_tick_length(); - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) - return; - if (tlen != last_tick_len) { - t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); - last_tick_len = tlen; - } else - t2x = do_gtod.varp->tb_to_xs; - new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Make sure time doesn't go backwards for userspace gettimeofday. - */ - tb = get_tb(); - varp = do_gtod.varp; - xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) - + varp->stamp_xsec; - xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; - if (xsec_new < xsec_old) - new_stamp_xsec += xsec_old - xsec_new; - - update_gtod(cur_tb, new_stamp_xsec, t2x); -} - #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -578,11 +425,7 @@ static void iSeries_tb_recal(void) tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -665,7 +508,7 @@ void timer_interrupt(struct pt_regs * re if (per_cpu(last_jiffy, cpu) >= tb_next_jiffy) { tb_last_jiffy = tb_next_jiffy; do_timer(1); - timer_recalc_offset(tb_last_jiffy); + /*timer_recalc_offset(tb_last_jiffy);*/ timer_check_rtc(); } write_sequnlock(&xtime_lock); @@ -753,77 +596,6 @@ unsigned long long sched_clock(void) return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; - unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (firmware_has_feature(FW_FEATURE_ISERIES) && first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. 
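The update_gtod() comment being deleted above describes a retry protocol the vdso keeps relying on even after this rework: bump tb_update_count, publish the new values, bump again, and have readers retry whenever their two counter samples differ or are odd (an update in flight). A minimal single-writer model of that protocol; the variable names follow the comment, and the kernel's SMP barriers are reduced to compiler barriers for brevity:

#include <stdio.h>

static volatile unsigned seq;		/* tb_update_count in the comment */
static volatile unsigned long stamp_xsec, tb_to_xs;

#define barrier() __asm__ __volatile__("" ::: "memory")

static void writer(unsigned long s, unsigned long x)
{
	seq++;				/* odd: update in flight */
	barrier();
	stamp_xsec = s;
	tb_to_xs = x;
	barrier();
	seq++;				/* even again: values consistent */
}

static void reader(unsigned long *s, unsigned long *x)
{
	unsigned start;

	do {
		start = seq;
		barrier();
		*s = stamp_xsec;
		*x = tb_to_xs;
		barrier();
	} while (start != seq || (start & 1));	/* retry on change or odd */
}

int main(void)
{
	unsigned long s, x;

	writer(123, 456);
	reader(&s, &x);
	printf("stamp_xsec=%lu tb_to_xs=%lu\n", s, x);
	return 0;
}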
- */ - tb_delta = tb_ticks_since(tb_last_jiffy); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int __init get_freq(char *name, int cells, unsigned long *val) { struct device_node *cpu; @@ -921,6 +693,7 @@ void __init time_init(void) tb_ticks_per_jiffy = ppc_tb_freq / HZ; tb_ticks_per_sec = ppc_tb_freq; tb_ticks_per_usec = ppc_tb_freq / 1000000; + cpu_khz = ppc_tb_freq / 1000; tb_to_us = mulhwu_scale_factor(ppc_tb_freq, 1000000); calc_cputime_factors(); @@ -989,20 +762,6 @@ void __init time_init(void) xtime.tv_sec = tm; xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - - vdso_data->tb_orig_stamp = tb_last_jiffy; - vdso_data->tb_update_count = 0; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; @@ -1015,7 +774,6 @@ void __init time_init(void) set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -1160,3 +918,36 @@ void div128_by_32(u64 dividend_high, u64 dr->result_low = ((u64)y << 32) + z; } + +/* PowerPC clocksource code */ + +#include + +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + + +/* XXX - this should be calculated or properly externed! 
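clocksource_timebase above leaves .mult zero and lets the registration helper just below compute it from tb_ticks_per_sec. The conversion the clocksource core then performs is plain fixed-point: ns = (cycles * mult) >> shift, with mult chosen as roughly (NSEC_PER_SEC << shift) / hz. A standalone check of that arithmetic, assuming a 100 MHz time base for illustration:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* Mirrors the computation done by the kernel's clocksource_hz2mult(). */
static uint32_t hz2mult(uint32_t hz, uint32_t shift)
{
	uint64_t tmp = NSEC_PER_SEC << shift;
	tmp += hz / 2;			/* round to nearest */
	return (uint32_t)(tmp / hz);
}

int main(void)
{
	uint32_t shift = 22, hz = 100000000;	/* assumed 100 MHz timebase */
	uint32_t mult = hz2mult(hz, shift);
	uint64_t cycles = hz;			/* one second's worth of ticks */

	printf("mult=%u -> %llu ns\n", mult,
	       (unsigned long long)((cycles * mult) >> shift));
	return 0;	/* prints ~1000000000 ns, i.e. one second */
}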
*/ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return clocksource_register(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); Index: linux/arch/powerpc/kernel/traps.c =================================================================== --- linux.orig/arch/powerpc/kernel/traps.c +++ linux/arch/powerpc/kernel/traps.c @@ -90,7 +90,7 @@ EXPORT_SYMBOL(unregister_die_notifier); * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -159,6 +159,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux/arch/powerpc/lib/locks.c =================================================================== --- linux.orig/arch/powerpc/lib/locks.c +++ linux/arch/powerpc/lib/locks.c @@ -25,7 +25,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -78,7 +78,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux/arch/powerpc/mm/fault.c =================================================================== --- linux.orig/arch/powerpc/mm/fault.c +++ linux/arch/powerpc/mm/fault.c @@ -149,8 +149,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; @@ -196,7 +196,7 @@ int __kprobes do_page_fault(struct pt_re } #endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - if (in_atomic() || mm == NULL) { + if (in_atomic() || mm == NULL || current->pagefault_disabled) { if (!user_mode(regs)) return SIGSEGV; /* in_atomic() in user mode is really bad, Index: linux/arch/powerpc/mm/hash_native_64.c =================================================================== --- linux.orig/arch/powerpc/mm/hash_native_64.c +++ linux/arch/powerpc/mm/hash_native_64.c @@ -35,7 +35,7 @@ #define HPTE_LOCK_BIT 3 -static DEFINE_SPINLOCK(native_tlbie_lock); +static DEFINE_RAW_SPINLOCK(native_tlbie_lock); static inline void __tlbie(unsigned long va, unsigned int psize) { Index: linux/arch/powerpc/mm/init_32.c =================================================================== --- linux.orig/arch/powerpc/mm/init_32.c +++ linux/arch/powerpc/mm/init_32.c @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/powerpc/mm/tlb_64.c =================================================================== --- linux.orig/arch/powerpc/mm/tlb_64.c +++ linux/arch/powerpc/mm/tlb_64.c @@ -37,7 +37,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, p /* This is declared as we are using the more or less generic * include/asm-powerpc/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; @@ -94,8 +94,11 @@ static void pte_free_submit(struct pte_f void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf) { - /* This is safe since tlb_gather_mmu has disabled preemption */ - cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id()); + /* + * This is safe since tlb_gather_mmu has disabled preemption. + * tlb->cpu is set by tlb_gather_mmu as well. 
+ */ + cpumask_t local_cpumask = cpumask_of_cpu(tlb->cpu); struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur); if (atomic_read(&tlb->mm->mm_users) < 2 || Index: linux/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/cell/smp.c +++ linux/arch/powerpc/platforms/cell/smp.c @@ -133,7 +133,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/smp.c +++ linux/arch/powerpc/platforms/chrp/smp.c @@ -45,7 +45,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/time.c +++ linux/arch/powerpc/platforms/chrp/time.c @@ -27,7 +27,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux/arch/powerpc/platforms/iseries/irq.c =================================================================== --- linux.orig/arch/powerpc/platforms/iseries/irq.c +++ linux/arch/powerpc/platforms/iseries/irq.c @@ -279,6 +279,7 @@ static struct irq_chip iseries_pic = { .shutdown = iseries_shutdown_IRQ, .unmask = iseries_enable_IRQ, .mask = iseries_disable_IRQ, + .ack = iseries_end_IRQ, .eoi = iseries_end_IRQ }; Index: linux/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/iseries/setup.c +++ linux/arch/powerpc/platforms/iseries/setup.c @@ -564,12 +564,14 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/feature.c +++ linux/arch/powerpc/platforms/powermac/feature.c @@ -59,7 +59,7 @@ extern struct device_node *k2_skiplist[2 * We use a single global lock to protect accesses. 
Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/nvram.c +++ linux/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); Index: linux/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/pic.c +++ linux/arch/powerpc/platforms/powermac/pic.c @@ -63,7 +63,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; Index: linux/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/setup.c +++ linux/arch/powerpc/platforms/pseries/setup.c @@ -437,7 +437,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -448,7 +449,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/smp.c +++ linux/arch/powerpc/platforms/pseries/smp.c @@ -154,7 +154,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux/arch/powerpc/platforms/pseries/xics.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/xics.c +++ linux/arch/powerpc/platforms/pseries/xics.c @@ -456,6 +456,7 @@ static struct irq_chip xics_pic_direct = .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, + .ack = xics_eoi_direct, .eoi = xics_eoi_direct, .set_affinity = xics_set_affinity }; @@ -466,6 +467,7 @@ static struct irq_chip xics_pic_lpar = { .startup = xics_startup, .mask = xics_mask_irq, .unmask = xics_unmask_irq, + .ack = xics_eoi_lpar, .eoi = xics_eoi_lpar, .set_affinity = xics_set_affinity }; Index: linux/arch/powerpc/sysdev/mpic.c =================================================================== --- linux.orig/arch/powerpc/sysdev/mpic.c +++ linux/arch/powerpc/sysdev/mpic.c @@ -776,6 +776,7 @@ static int mpic_set_irq_type(unsigned in static struct irq_chip mpic_irq_chip = { .mask = mpic_mask_irq, .unmask = mpic_unmask_irq, + .ack = mpic_end_irq, .eoi = 
mpic_end_irq, .set_type = mpic_set_irq_type, }; Index: linux/arch/powerpc/xmon/xmon.c =================================================================== --- linux.orig/arch/powerpc/xmon/xmon.c +++ linux/arch/powerpc/xmon/xmon.c @@ -340,6 +340,7 @@ static int xmon_core(struct pt_regs *reg unsigned long timeout; #endif + preempt_disable(); msr = mfmsr(); mtmsr(msr & ~MSR_EE); /* disable interrupts */ @@ -517,6 +518,7 @@ static int xmon_core(struct pt_regs *reg insert_cpu_bpts(); mtmsr(msr); /* restore interrupt enable */ + preempt_enable(); return cmd != 'X' && cmd != EOF; } Index: linux/arch/ppc/8260_io/enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/enet.c +++ linux/arch/ppc/8260_io/enet.c @@ -116,7 +116,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/fcc_enet.c +++ linux/arch/ppc/8260_io/fcc_enet.c @@ -376,7 +376,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/8xx_io/commproc.c =================================================================== --- linux.orig/arch/ppc/8xx_io/commproc.c +++ linux/arch/ppc/8xx_io/commproc.c @@ -355,7 +355,7 @@ cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
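The iseries, xics and mpic chips above each gain an .ack hook that simply reuses the existing end-of-interrupt routine. One plausible reading is that the generic flow handlers these interrupts go through on -rt invoke chip->ack unconditionally, so aliasing it to .eoi avoids a NULL dispatch without changing hardware behaviour. The sketch below models only the aliasing itself, with a stripped-down chip type rather than the real struct irq_chip:

#include <stdio.h>

struct chip {
	void (*ack)(unsigned int irq);
	void (*eoi)(unsigned int irq);
};

static void end_irq(unsigned int irq)
{
	printf("EOI irq %u\n", irq);
}

/* As in the xics/mpic hunks: .ack reuses the end-of-interrupt hook. */
static struct chip example_chip = {
	.ack = end_irq,
	.eoi = end_irq,
};

int main(void)
{
	if (example_chip.ack)		/* a flow handler's early ack */
		example_chip.ack(9);
	return 0;
}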
Index: linux/arch/ppc/8xx_io/enet.c =================================================================== --- linux.orig/arch/ppc/8xx_io/enet.c +++ linux/arch/ppc/8xx_io/enet.c @@ -143,7 +143,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8xx_io/fec.c =================================================================== --- linux.orig/arch/ppc/8xx_io/fec.c +++ linux/arch/ppc/8xx_io/fec.c @@ -164,7 +164,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/Kconfig =================================================================== --- linux.orig/arch/ppc/Kconfig +++ linux/arch/ppc/Kconfig @@ -12,13 +12,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default y @@ -988,6 +981,18 @@ config ARCH_POPULATES_NODE_MAP source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux/arch/ppc/boot/Makefile =================================================================== --- linux.orig/arch/ppc/boot/Makefile +++ linux/arch/ppc/boot/Makefile @@ -14,6 +14,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux/arch/ppc/kernel/entry.S =================================================================== --- linux.orig/arch/ppc/kernel/entry.S +++ linux/arch/ppc/kernel/entry.S @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING beq restore_user Index: linux/arch/ppc/kernel/semaphore.c =================================================================== --- linux.orig/arch/ppc/kernel/semaphore.c +++ linux/arch/ppc/kernel/semaphore.c @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
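compat_sem_is_locked(), added in both semaphore.c hunks, encodes the count rule the surrounding comments state: a down() decrements count, but only a decrement that starts from a value greater than zero actually acquires the semaphore, and a contended semaphore drives count negative. A tiny single-threaded model of that rule (the real code uses atomics and a wait queue, of course):

#include <stdio.h>

static int count = 1;

static int try_down(void)
{
	int old = count--;
	return old > 0;		/* acquired only from a positive count */
}

int main(void)
{
	printf("first  down: %s (count=%d)\n",
	       try_down() ? "got it" : "waits", count);
	printf("second down: %s (count=%d)\n",
	       try_down() ? "got it" : "waits", count);
	printf("is_locked: %d\n", count < 0);	/* compat_sem_is_locked() test */
	return 0;
}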
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux/arch/ppc/kernel/smp.c =================================================================== --- linux.orig/arch/ppc/kernel/smp.c +++ linux/arch/ppc/kernel/smp.c @@ -137,6 +137,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -161,7 +171,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/ppc/kernel/time.c =================================================================== --- linux.orig/arch/ppc/kernel/time.c +++ linux/arch/ppc/kernel/time.c @@ -66,6 +66,9 @@ #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; @@ -102,7 +105,7 @@ static inline int tb_delta(unsigned *jif } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux/arch/ppc/kernel/traps.c =================================================================== --- linux.orig/arch/ppc/kernel/traps.c +++ linux/arch/ppc/kernel/traps.c @@ -72,7 +72,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -107,6 +107,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux/arch/ppc/lib/locks.c =================================================================== --- linux.orig/arch/ppc/lib/locks.c +++ linux/arch/ppc/lib/locks.c @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) 
lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. */ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux/arch/ppc/mm/fault.c =================================================================== --- linux.orig/arch/ppc/mm/fault.c +++ linux/arch/ppc/mm/fault.c @@ -89,7 +89,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault. 
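The wholesale rename in locks.c above, together with the many DEFINE_RAW_SPINLOCK conversions earlier (die_lock, timebase_lock, feature_lock, ...), follows the PREEMPT_RT split: spinlock_t becomes a sleeping lock, and only raw_spinlock_t keeps the original busy-waiting behaviour for contexts that can never sleep. A compilable caricature of the raw side's contract; the real implementations are rtmutexes and architecture code, not this:

#include <stdio.h>

/* Caricature only: the real types are rtmutex- and arch-based. */
struct raw_spinlock { volatile int slock; };	/* always spins */

static void raw_spin_lock(struct raw_spinlock *l)
{
	while (__sync_lock_test_and_set(&l->slock, 1))
		;	/* busy-wait: safe even with interrupts off */
}

static void raw_spin_unlock(struct raw_spinlock *l)
{
	__sync_lock_release(&l->slock);
}

int main(void)
{
	struct raw_spinlock l = { 0 };

	raw_spin_lock(&l);	/* e.g. die_lock, timebase_lock above */
	puts("critical section");
	raw_spin_unlock(&l);
	return 0;
}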
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; Index: linux/arch/ppc/mm/init.c =================================================================== --- linux.orig/arch/ppc/mm/init.c +++ linux/arch/ppc/mm/init.c @@ -55,7 +55,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/ppc/platforms/apus_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/apus_setup.c +++ linux/arch/ppc/platforms/apus_setup.c @@ -275,6 +275,7 @@ void apus_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; Index: linux/arch/ppc/platforms/ev64260.c =================================================================== --- linux.orig/arch/ppc/platforms/ev64260.c +++ linux/arch/ppc/platforms/ev64260.c @@ -550,6 +550,7 @@ ev64260_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/ppc/platforms/hdpu.c =================================================================== --- linux.orig/arch/ppc/platforms/hdpu.c +++ linux/arch/ppc/platforms/hdpu.c @@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux/arch/ppc/platforms/powerpmc250.c =================================================================== --- linux.orig/arch/ppc/platforms/powerpmc250.c +++ linux/arch/ppc/platforms/powerpmc250.c @@ -163,6 +163,7 @@ powerpmc250_calibrate_decr(void) tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void Index: linux/arch/ppc/platforms/prep_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/prep_setup.c +++ linux/arch/ppc/platforms/prep_setup.c @@ -940,6 +940,7 @@ prep_calibrate_decr(void) (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: linux/arch/ppc/platforms/prpmc750.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc750.c +++ linux/arch/ppc/platforms/prpmc750.c @@ -268,6 +268,7 @@ static void __init prpmc750_calibrate_de tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) Index: linux/arch/ppc/platforms/prpmc800.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc800.c +++ linux/arch/ppc/platforms/prpmc800.c @@ -327,6 +327,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = 
mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -367,6 +368,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) Index: linux/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux.orig/arch/ppc/platforms/sbc82xx.c +++ linux/arch/ppc/platforms/sbc82xx.c @@ -65,7 +65,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux/arch/ppc/platforms/spruce.c =================================================================== --- linux.orig/arch/ppc/platforms/spruce.c +++ linux/arch/ppc/platforms/spruce.c @@ -147,6 +147,7 @@ spruce_calibrate_decr(void) freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int Index: linux/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux.orig/arch/ppc/syslib/cpm2_common.c +++ linux/arch/ppc/syslib/cpm2_common.c @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... */ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux/arch/ppc/syslib/ibm44x_common.c =================================================================== --- linux.orig/arch/ppc/syslib/ibm44x_common.c +++ linux/arch/ppc/syslib/ibm44x_common.c @@ -63,6 +63,7 @@ void __init ibm44x_calibrate_decr(unsign { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/m8260_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8260_setup.c +++ linux/arch/ppc/syslib/m8260_setup.c @@ -79,6 +79,7 @@ m8260_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that Index: linux/arch/ppc/syslib/m8xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8xx_setup.c +++ linux/arch/ppc/syslib/m8xx_setup.c @@ -218,6 +218,7 @@ void __init m8xx_calibrate_decr(void) printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. 
This used * to be done elsewhere, but other changes caused it to get Index: linux/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/mpc52xx_setup.c +++ linux/arch/ppc/syslib/mpc52xx_setup.c @@ -215,6 +215,7 @@ mpc52xx_calibrate_decr(void) tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } Index: linux/arch/ppc/syslib/ocp.c =================================================================== --- linux.orig/arch/ppc/syslib/ocp.c +++ linux/arch/ppc/syslib/ocp.c @@ -44,11 +44,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: linux/arch/ppc/syslib/open_pic.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic.c +++ linux/arch/ppc/syslib/open_pic.c @@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux/arch/ppc/syslib/open_pic2.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic2.c +++ linux/arch/ppc/syslib/open_pic2.c @@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc4xx_setup.c +++ linux/arch/ppc/syslib/ppc4xx_setup.c @@ -172,6 +172,7 @@ ppc4xx_calibrate_decr(void) freq = bip->bi_tbfreq; tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. 
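The ppc4xx comment above estimates ~2925 years before a 64-bit time base wraps at 200 MHz, and the figure checks out: 2^64 ticks divided by 2*10^8 ticks per second is about 9.2*10^10 seconds. A two-line verification, assuming 365-day years:

#include <stdio.h>

int main(void)
{
	double secs = 18446744073709551616.0 / 200e6;	/* 2^64 ticks @ 200 MHz */
	printf("%.0f years\n", secs / (365.0 * 24 * 3600));
	return 0;	/* prints 2925 */
}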
Index: linux/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc85xx_setup.c +++ linux/arch/ppc/syslib/ppc85xx_setup.c @@ -57,6 +57,7 @@ mpc85xx_calibrate_decr(void) divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/todc_time.c =================================================================== --- linux.orig/arch/ppc/syslib/todc_time.c +++ linux/arch/ppc/syslib/todc_time.c @@ -506,6 +506,7 @@ todc_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/s390/kernel/compat_wrapper.S =================================================================== --- linux.orig/arch/s390/kernel/compat_wrapper.S +++ linux/arch/s390/kernel/compat_wrapper.S @@ -1665,3 +1665,20 @@ sys_getcpu_wrapper: llgtr %r3,%r3 # unsigned * llgtr %r4,%r4 # struct getcpu_cache * jg sys_getcpu + + .globl compat_sys_epoll_pwait_wrapper +compat_sys_epoll_pwait_wrapper: + lgfr %r2,%r2 # int + llgtr %r3,%r3 # struct compat_epoll_event * + lgfr %r4,%r4 # int + lgfr %r5,%r5 # int + llgtr %r6,%r6 # compat_sigset_t * + llgf %r0,164(%r15) # compat_size_t + stg %r0,160(%r15) + jg compat_sys_epoll_pwait + + .globl compat_sys_utimes_wrapper +compat_sys_utimes_wrapper: + llgtr %r2,%r2 # char * + llgtr %r3,%r3 # struct compat_timeval * + jg compat_sys_utimes Index: linux/arch/s390/kernel/debug.c =================================================================== --- linux.orig/arch/s390/kernel/debug.c +++ linux/arch/s390/kernel/debug.c @@ -268,7 +268,7 @@ debug_info_alloc(char *name, int pages_p rc->level = level; rc->buf_size = buf_size; rc->entry_size = sizeof(debug_entry_t) + buf_size; - strlcpy(rc->name, name, sizeof(rc->name)-1); + strlcpy(rc->name, name, sizeof(rc->name)); memset(rc->views, 0, DEBUG_MAX_VIEWS * sizeof(struct debug_view *)); memset(rc->debugfs_entries, 0 ,DEBUG_MAX_VIEWS * sizeof(struct dentry*)); Index: linux/arch/s390/kernel/early.c =================================================================== --- linux.orig/arch/s390/kernel/early.c +++ linux/arch/s390/kernel/early.c @@ -141,9 +141,9 @@ static noinline __init void detect_machi machine_flags |= 4; } +#ifdef CONFIG_64BIT static noinline __init int memory_fast_detect(void) { - unsigned long val0 = 0; unsigned long val1 = 0xc; int ret = -ENOSYS; @@ -161,9 +161,15 @@ static noinline __init int memory_fast_d if (ret || val0 != val1) return -ENOSYS; - memory_chunk[0].size = val0; + memory_chunk[0].size = val0 + 1; return 0; } +#else +static inline int memory_fast_detect(void) +{ + return -ENOSYS; +} +#endif #define ADDR2G (1UL << 31) Index: linux/arch/s390/kernel/ipl.c =================================================================== --- linux.orig/arch/s390/kernel/ipl.c +++ linux/arch/s390/kernel/ipl.c @@ -839,7 +839,7 @@ static int __init reipl_ccw_init(void) } reipl_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; reipl_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; - reipl_block_ccw->hdr.blk0_len = sizeof(reipl_block_ccw->ipl_info.ccw); + reipl_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; reipl_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; /* check if read scp info worked and set loadparm */ if (SCCB_VALID) @@ -880,8 +880,7 @@ static int __init reipl_fcp_init(void) } else { reipl_block_fcp->hdr.len = 
IPL_PARM_BLK_FCP_LEN; reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - reipl_block_fcp->hdr.blk0_len = - sizeof(reipl_block_fcp->ipl_info.fcp); + reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; reipl_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_IPL; } @@ -930,7 +929,7 @@ static int __init dump_ccw_init(void) } dump_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_ccw->hdr.blk0_len = sizeof(reipl_block_ccw->ipl_info.ccw); + dump_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; dump_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; dump_capabilities |= IPL_TYPE_CCW; return 0; @@ -954,7 +953,7 @@ static int __init dump_fcp_init(void) } dump_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_fcp->hdr.blk0_len = sizeof(dump_block_fcp->ipl_info.fcp); + dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; dump_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_DUMP; dump_capabilities |= IPL_TYPE_FCP; Index: linux/arch/s390/kernel/syscalls.S =================================================================== --- linux.orig/arch/s390/kernel/syscalls.S +++ linux/arch/s390/kernel/syscalls.S @@ -320,4 +320,5 @@ SYSCALL(sys_tee,sys_tee,sys_tee_wrapper) SYSCALL(sys_vmsplice,sys_vmsplice,compat_sys_vmsplice_wrapper) NI_SYSCALL /* 310 sys_move_pages */ SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper) -SYSCALL(sys_epoll_pwait,sys_epoll_pwait,sys_ni_syscall) +SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper) +SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper) Index: linux/arch/sh/drivers/pci/pci-auto.c =================================================================== --- linux.orig/arch/sh/drivers/pci/pci-auto.c +++ linux/arch/sh/drivers/pci/pci-auto.c @@ -214,6 +214,12 @@ retry: continue; } + if (bar_value < *lower_limit || (bar_value + bar_size) >= *upper_limit) { + DBG(" unavailable -- skipping, value %x size %x\n", + bar_value, bar_size); + continue; + } + #ifdef CONFIG_PCI_AUTO_UPDATE_RESOURCES /* Write it out and update our limit */ early_write_config_dword(hose, top_bus, current_bus, pci_devfn, Index: linux/arch/sh/kernel/cpu/init.c =================================================================== --- linux.orig/arch/sh/kernel/cpu/init.c +++ linux/arch/sh/kernel/cpu/init.c @@ -3,7 +3,7 @@ * * CPU init code * - * Copyright (C) 2002 - 2006 Paul Mundt + * Copyright (C) 2002 - 2007 Paul Mundt * Copyright (C) 2003 Richard Curnow * * This file is subject to the terms and conditions of the GNU General Public @@ -48,8 +48,19 @@ static void __init cache_init(void) { unsigned long ccr, flags; - if (current_cpu_data.type == CPU_SH_NONE) - panic("Unknown CPU"); + /* First setup the rest of the I-cache info */ + current_cpu_data.icache.entry_mask = current_cpu_data.icache.way_incr - + current_cpu_data.icache.linesz; + + current_cpu_data.icache.way_size = current_cpu_data.icache.sets * + current_cpu_data.icache.linesz; + + /* And the D-cache too */ + current_cpu_data.dcache.entry_mask = current_cpu_data.dcache.way_incr - + current_cpu_data.dcache.linesz; + + current_cpu_data.dcache.way_size = current_cpu_data.dcache.sets * + current_cpu_data.dcache.linesz; jump_to_P2(); ccr = ctrl_inl(CCR); @@ -200,6 +211,9 @@ asmlinkage void __init sh_cpu_init(void) /* First, probe the CPU */ detect_cpu_and_cache_system(); + if (current_cpu_data.type == CPU_SH_NONE) + 
panic("Unknown CPU"); + /* Init the cache */ cache_init(); Index: linux/arch/sh/kernel/cpu/sh2/entry.S =================================================================== --- linux.orig/arch/sh/kernel/cpu/sh2/entry.S +++ linux/arch/sh/kernel/cpu/sh2/entry.S @@ -165,6 +165,7 @@ ENTRY(exception_handler) interrupt_entry: mov r9,r4 + mov r15,r5 mov.l 6f,r9 mov.l 7f,r8 jmp @r8 Index: linux/arch/sh/kernel/cpu/sh3/entry.S =================================================================== --- linux.orig/arch/sh/kernel/cpu/sh3/entry.S +++ linux/arch/sh/kernel/cpu/sh3/entry.S @@ -514,13 +514,16 @@ skip_save: interrupt_exception: mov.l 1f, r9 + mov.l 2f, r4 + mov.l @r4, r4 jmp @r9 - nop + mov r15, r5 rts nop .align 2 1: .long do_IRQ +2: .long INTEVT .align 2 ENTRY(exception_none) Index: linux/arch/sh/kernel/cpu/sh4/probe.c =================================================================== --- linux.orig/arch/sh/kernel/cpu/sh4/probe.c +++ linux/arch/sh/kernel/cpu/sh4/probe.c @@ -195,13 +195,6 @@ int __init detect_cpu_and_cache_system(v } - /* Setup the rest of the I-cache info */ - current_cpu_data.icache.entry_mask = current_cpu_data.icache.way_incr - - current_cpu_data.icache.linesz; - - current_cpu_data.icache.way_size = current_cpu_data.icache.sets * - current_cpu_data.icache.linesz; - /* And the rest of the D-cache */ if (current_cpu_data.dcache.ways > 1) { size = sizes[(cvr >> 16) & 0xf]; @@ -209,12 +202,6 @@ int __init detect_cpu_and_cache_system(v current_cpu_data.dcache.sets = (size >> 6); } - current_cpu_data.dcache.entry_mask = current_cpu_data.dcache.way_incr - - current_cpu_data.dcache.linesz; - - current_cpu_data.dcache.way_size = current_cpu_data.dcache.sets * - current_cpu_data.dcache.linesz; - /* * Setup the L2 cache desc * Index: linux/arch/sh/kernel/irq.c =================================================================== --- linux.orig/arch/sh/kernel/irq.c +++ linux/arch/sh/kernel/irq.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -82,13 +81,9 @@ static union irq_ctx *hardirq_ctx[NR_CPU static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; #endif -asmlinkage int do_IRQ(unsigned long r4, unsigned long r5, - unsigned long r6, unsigned long r7, - struct pt_regs __regs) +asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) { - struct pt_regs *regs = RELOC_HIDE(&__regs, 0); struct pt_regs *old_regs = set_irq_regs(regs); - int irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; #endif @@ -111,13 +106,7 @@ asmlinkage int do_IRQ(unsigned long r4, } #endif -#ifdef CONFIG_CPU_HAS_INTEVT - irq = evt2irq(ctrl_inl(INTEVT)); -#else - irq = r4; -#endif - - irq = irq_demux(irq); + irq = irq_demux(evt2irq(irq)); #ifdef CONFIG_4KSTACKS curctx = (union irq_ctx *)current_thread_info(); Index: linux/arch/sparc64/Kconfig =================================================================== --- linux.orig/arch/sparc64/Kconfig +++ linux/arch/sparc64/Kconfig @@ -136,18 +136,6 @@ config SMP If you don't know what to do here, say N. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. - - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
- config NR_CPUS int "Maximum number of CPUs (2-64)" range 2 64 @@ -399,6 +387,8 @@ config SCHED_SMT when dealing with UltraSPARC cpus at a cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/Kconfig.preempt" + config CMDLINE_BOOL bool "Default bootloader kernel arguments" Index: linux/arch/sparc64/defconfig =================================================================== --- linux.orig/arch/sparc64/defconfig +++ linux/arch/sparc64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.21-rc2 -# Wed Feb 28 09:50:51 2007 +# Linux kernel version: 2.6.21-rc4 +# Sat Mar 17 14:18:44 2007 # CONFIG_SPARC=y CONFIG_SPARC64=y @@ -50,6 +50,7 @@ CONFIG_POSIX_MQUEUE=y # CONFIG_IKCONFIG is not set CONFIG_SYSFS_DEPRECATED=y CONFIG_RELAY=y +# CONFIG_BLK_DEV_INITRD is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set @@ -108,7 +109,6 @@ CONFIG_GENERIC_HARDIRQS=y # General machine setup # # CONFIG_SMP is not set -# CONFIG_PREEMPT is not set CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_TABLE=m # CONFIG_CPU_FREQ_DEBUG is not set @@ -165,6 +165,9 @@ CONFIG_BINFMT_ELF32=y CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m CONFIG_SOLARIS_EMUL=y +# CONFIG_PREEMPT_NONE is not set +CONFIG_PREEMPT_VOLUNTARY=y +# CONFIG_PREEMPT is not set # CONFIG_CMDLINE_BOOL is not set # @@ -340,7 +343,6 @@ CONFIG_BLK_DEV_NBD=m # CONFIG_BLK_DEV_SX8 is not set # CONFIG_BLK_DEV_UB is not set # CONFIG_BLK_DEV_RAM is not set -# CONFIG_BLK_DEV_INITRD is not set CONFIG_CDROM_PKTCDVD=m CONFIG_CDROM_PKTCDVD_BUFFERS=8 CONFIG_CDROM_PKTCDVD_WCACHE=y Index: linux/arch/sparc64/kernel/ktlb.S =================================================================== --- linux.orig/arch/sparc64/kernel/ktlb.S +++ linux/arch/sparc64/kernel/ktlb.S @@ -138,9 +138,15 @@ kvmap_dtlb_4v: brgez,pn %g4, kvmap_dtlb_nonlinear nop +#ifdef CONFIG_DEBUG_PAGEALLOC + /* Index through the base page size TSB even for linear + * mappings when using page allocation debugging. + */ + KERN_TSB_LOOKUP_TL1(%g4, %g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load) +#else /* Correct TAG_TARGET is already in %g6, check 4mb TSB. */ KERN_TSB4M_LOOKUP_TL1(%g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load) - +#endif /* TSB entry address left in %g1, lookup linear PTE. * Must preserve %g1 and %g6 (TAG). */ Index: linux/arch/sparc64/kernel/tsb.S =================================================================== --- linux.orig/arch/sparc64/kernel/tsb.S +++ linux/arch/sparc64/kernel/tsb.S @@ -546,6 +546,7 @@ NGtsb_init: subcc %o1, 0x100, %o1 bne,pt %xcc, 1b add %o0, 0x100, %o0 + membar #Sync retl wr %g2, 0x0, %asi .size NGtsb_init, .-NGtsb_init Index: linux/arch/sparc64/lib/NGbzero.S =================================================================== --- linux.orig/arch/sparc64/lib/NGbzero.S +++ linux/arch/sparc64/lib/NGbzero.S @@ -88,6 +88,7 @@ NGbzero_loop: bne,pt %xcc, NGbzero_loop add %o0, 64, %o0 + membar #Sync wr %o4, 0x0, %asi brz,pn %o1, NGbzero_done NGbzero_medium: Index: linux/arch/sparc64/lib/NGmemcpy.S =================================================================== --- linux.orig/arch/sparc64/lib/NGmemcpy.S +++ linux/arch/sparc64/lib/NGmemcpy.S @@ -247,6 +247,8 @@ FUNC_NAME: /* %o0=dst, %o1=src, %o2=len /* fall through */ 60: + membar #Sync + /* %o2 contains any final bytes still needed to be copied * over. If anything is left, we copy it one byte at a time. 
*/ Index: linux/arch/sparc64/lib/NGpage.S =================================================================== --- linux.orig/arch/sparc64/lib/NGpage.S +++ linux/arch/sparc64/lib/NGpage.S @@ -41,6 +41,7 @@ NGcopy_user_page: /* %o0=dest, %o1=src, subcc %g7, 64, %g7 bne,pt %xcc, 1b add %o0, 32, %o0 + membar #Sync retl nop @@ -63,6 +64,7 @@ NGclear_user_page: /* %o0=dest, %o1=vadd subcc %g7, 64, %g7 bne,pt %xcc, 1b add %o0, 32, %o0 + membar #Sync retl nop Index: linux/arch/sparc64/mm/init.c =================================================================== --- linux.orig/arch/sparc64/mm/init.c +++ linux/arch/sparc64/mm/init.c @@ -59,8 +59,10 @@ unsigned long kern_linear_pte_xor[2] __r */ unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)]; +#ifndef CONFIG_DEBUG_PAGEALLOC /* A special kernel TSB for 4MB and 256MB linear mappings. */ struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; +#endif #define MAX_BANKS 32 @@ -1301,7 +1303,12 @@ static void __init tsb_phys_patch(void) } /* Don't mark as init, we give this to the Hypervisor. */ -static struct hv_tsb_descr ktsb_descr[2]; +#ifndef CONFIG_DEBUG_PAGEALLOC +#define NUM_KTSB_DESCR 2 +#else +#define NUM_KTSB_DESCR 1 +#endif +static struct hv_tsb_descr ktsb_descr[NUM_KTSB_DESCR]; extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; static void __init sun4v_ktsb_init(void) @@ -1340,6 +1347,7 @@ static void __init sun4v_ktsb_init(void) ktsb_descr[0].tsb_base = ktsb_pa; ktsb_descr[0].resv = 0; +#ifndef CONFIG_DEBUG_PAGEALLOC /* Second KTSB for 4MB/256MB mappings. */ ktsb_pa = (kern_base + ((unsigned long)&swapper_4m_tsb[0] - KERNBASE)); @@ -1352,6 +1360,7 @@ static void __init sun4v_ktsb_init(void) ktsb_descr[1].ctx_idx = 0; ktsb_descr[1].tsb_base = ktsb_pa; ktsb_descr[1].resv = 0; +#endif } void __cpuinit sun4v_ktsb_register(void) @@ -1364,7 +1373,7 @@ void __cpuinit sun4v_ktsb_register(void) pa = kern_base + ((unsigned long)&ktsb_descr[0] - KERNBASE); func = HV_FAST_MMU_TSB_CTX0; - arg0 = 2; + arg0 = NUM_KTSB_DESCR; arg1 = pa; __asm__ __volatile__("ta %6" : "=&r" (func), "=&r" (arg0), "=&r" (arg1) @@ -1393,7 +1402,9 @@ void __init paging_init(void) /* Invalidate both kernel TSBs. 
*/ memset(swapper_tsb, 0x40, sizeof(swapper_tsb)); +#ifndef CONFIG_DEBUG_PAGEALLOC memset(swapper_4m_tsb, 0x40, sizeof(swapper_4m_tsb)); +#endif if (tlb_type == hypervisor) sun4v_pgprot_init(); @@ -1725,8 +1736,13 @@ static void __init sun4u_pgprot_init(voi pg_iobits = (_PAGE_VALID | _PAGE_PRESENT_4U | __DIRTY_BITS_4U | __ACCESS_BITS_4U | _PAGE_E_4U); +#ifdef CONFIG_DEBUG_PAGEALLOC + kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4U) ^ + 0xfffff80000000000; +#else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4U) ^ 0xfffff80000000000; +#endif kern_linear_pte_xor[0] |= (_PAGE_CP_4U | _PAGE_CV_4U | _PAGE_P_4U | _PAGE_W_4U); @@ -1769,13 +1785,23 @@ static void __init sun4v_pgprot_init(voi _PAGE_E = _PAGE_E_4V; _PAGE_CACHE = _PAGE_CACHE_4V; +#ifdef CONFIG_DEBUG_PAGEALLOC + kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ + 0xfffff80000000000; +#else kern_linear_pte_xor[0] = (_PAGE_VALID | _PAGE_SZ4MB_4V) ^ 0xfffff80000000000; +#endif kern_linear_pte_xor[0] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); +#ifdef CONFIG_DEBUG_PAGEALLOC + kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZBITS_4V) ^ + 0xfffff80000000000; +#else kern_linear_pte_xor[1] = (_PAGE_VALID | _PAGE_SZ256MB_4V) ^ 0xfffff80000000000; +#endif kern_linear_pte_xor[1] |= (_PAGE_CP_4V | _PAGE_CV_4V | _PAGE_P_4V | _PAGE_W_4V); Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -24,10 +24,22 @@ config X86 bool default y +config PARAVIRT + bool + default y + config GENERIC_TIME bool default y +config GENERIC_CLOCKEVENTS_BROADCAST + bool + default y + +config GENERIC_CLOCKEVENTS + bool + default y + config GENERIC_TIME_VSYSCALL bool default y @@ -62,13 +74,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y @@ -330,6 +335,8 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. +source "kernel/time/Kconfig" + source "kernel/Kconfig.preempt" config NUMA @@ -343,6 +350,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA && PCI Index: linux/arch/x86_64/Makefile =================================================================== --- linux.orig/arch/x86_64/Makefile +++ linux/arch/x86_64/Makefile @@ -45,7 +45,9 @@ cflags-kernel-$(CONFIG_REORDER) += -ffun # actually it makes the kernel smaller too. cflags-y += -fno-reorder-blocks cflags-y += -Wno-sign-compare +ifneq ($(CONFIG_UNWIND_INFO),y) cflags-y += -fno-asynchronous-unwind-tables +endif ifneq ($(CONFIG_DEBUG_INFO),y) # -fweb shrinks the kernel a bit, but the difference is very small # it also messes up debugging, so don't use it for now. 
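
The x86-64 timer rework below converts both the PIT and the local APIC timer to the clockevents framework enabled by the GENERIC_CLOCKEVENTS options above. The pattern is the same for any tick device: compute a 32.32 fixed-point mult from the timer input frequency, derive the min/max programmable deltas, fill in a struct clock_event_device and register it; the timer interrupt then simply calls evt->event_handler(evt), and the framework decides whether that handler drives the periodic tick or the one-shot/NO_HZ code. A minimal sketch of such a driver follows, assuming a hypothetical one-shot-capable counter clocked at MY_TIMER_FREQ with made-up program_oneshot()/program_periodic()/stop_counter() hardware helpers -- only the clockevents calls themselves are real:

	#include <linux/kernel.h>
	#include <linux/clockchips.h>
	#include <linux/init.h>
	#include <linux/smp.h>

	#define MY_TIMER_FREQ	1000000UL	/* assumed: 1 MHz input clock */

	static int my_next_event(unsigned long delta,
				 struct clock_event_device *evt)
	{
		program_oneshot(delta);		/* hypothetical hw helper */
		return 0;
	}

	static void my_set_mode(enum clock_event_mode mode,
				struct clock_event_device *evt)
	{
		switch (mode) {
		case CLOCK_EVT_MODE_PERIODIC:
			/* reload value for a HZ-rate tick (hypothetical helper) */
			program_periodic(MY_TIMER_FREQ / HZ);
			break;
		case CLOCK_EVT_MODE_ONESHOT:
			/* armed later through ->set_next_event() */
			break;
		case CLOCK_EVT_MODE_SHUTDOWN:
		case CLOCK_EVT_MODE_UNUSED:
			stop_counter();		/* hypothetical hw helper */
			break;
		}
	}

	static struct clock_event_device my_clockevent = {
		.name		= "my_timer",
		.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
		.shift		= 32,
		.rating		= 100,
		.irq		= -1,
		.set_mode	= my_set_mode,
		.set_next_event	= my_next_event,
	};

	static void __init my_timer_init(void)
	{
		/* ticks per nanosecond, scaled up by 2^32: */
		my_clockevent.mult = div_sc(MY_TIMER_FREQ, NSEC_PER_SEC, 32);
		my_clockevent.max_delta_ns =
			clockevent_delta2ns(0x7fffffff, &my_clockevent);
		my_clockevent.min_delta_ns =
			clockevent_delta2ns(0xf, &my_clockevent);
		my_clockevent.cpumask = cpumask_of_cpu(smp_processor_id());
		clockevents_register_device(&my_clockevent);
	}

This mirrors what pit_clockevent and lapic_clockevent do further down; the delta limits are hardware-specific (the PIT code below uses 0x7FFF for its 16-bit counter, the lapic code 0x7FFFFF).
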
Index: linux/arch/x86_64/defconfig =================================================================== --- linux.orig/arch/x86_64/defconfig +++ linux/arch/x86_64/defconfig @@ -1,11 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.20-git8 -# Tue Feb 13 11:25:16 2007 +# Linux kernel version: 2.6.21-rc3 +# Wed Mar 7 15:29:47 2007 # CONFIG_X86_64=y CONFIG_64BIT=y CONFIG_X86=y +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_ZONE_DMA32=y CONFIG_LOCKDEP_SUPPORT=y CONFIG_STACKTRACE_SUPPORT=y @@ -43,6 +45,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y CONFIG_SYSVIPC=y # CONFIG_IPC_NS is not set +CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set @@ -53,6 +56,7 @@ CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set +CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y @@ -114,11 +118,11 @@ CONFIG_X86_PC=y # CONFIG_X86_VSMP is not set # CONFIG_MK8 is not set # CONFIG_MPSC is not set -CONFIG_MCORE2=y -# CONFIG_GENERIC_CPU is not set -CONFIG_X86_L1_CACHE_BYTES=64 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_INTERNODE_CACHE_BYTES=64 +# CONFIG_MCORE2 is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_L1_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -207,10 +211,8 @@ CONFIG_ACPI_PROCFS=y CONFIG_ACPI_AC=y CONFIG_ACPI_BATTERY=y CONFIG_ACPI_BUTTON=y -# CONFIG_ACPI_HOTKEY is not set CONFIG_ACPI_FAN=y # CONFIG_ACPI_DOCK is not set -# CONFIG_ACPI_BAY is not set CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_THERMAL=y @@ -319,7 +321,7 @@ CONFIG_IP_PNP_DHCP=y # CONFIG_INET_ESP is not set # CONFIG_INET_IPCOMP is not set # CONFIG_INET_XFRM_TUNNEL is not set -# CONFIG_INET_TUNNEL is not set +CONFIG_INET_TUNNEL=y # CONFIG_INET_XFRM_MODE_TRANSPORT is not set # CONFIG_INET_XFRM_MODE_TUNNEL is not set # CONFIG_INET_XFRM_MODE_BEET is not set @@ -421,7 +423,13 @@ CONFIG_FW_LOADER=y # # Plug and Play support # -# CONFIG_PNP is not set +CONFIG_PNP=y +# CONFIG_PNP_DEBUG is not set + +# +# Protocols +# +CONFIG_PNPACPI=y # # Block devices @@ -441,7 +449,6 @@ CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_COUNT=16 CONFIG_BLK_DEV_RAM_SIZE=4096 CONFIG_BLK_DEV_RAM_BLOCKSIZE=1024 -CONFIG_BLK_DEV_INITRD=y # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set @@ -451,6 +458,7 @@ CONFIG_BLK_DEV_INITRD=y # CONFIG_IBM_ASM is not set # CONFIG_SGI_IOC4 is not set # CONFIG_TIFM_CORE is not set +# CONFIG_SONY_LAPTOP is not set # # ATA/ATAPI/MFM/RLL support @@ -477,6 +485,7 @@ CONFIG_BLK_DEV_IDEACPI=y # CONFIG_IDE_GENERIC=y # CONFIG_BLK_DEV_CMD640 is not set +# CONFIG_BLK_DEV_IDEPNP is not set CONFIG_BLK_DEV_IDEPCI=y # CONFIG_IDEPCI_SHARE_IRQ is not set # CONFIG_BLK_DEV_OFFBOARD is not set @@ -623,6 +632,7 @@ CONFIG_SATA_VIA=y # CONFIG_SATA_VITESSE is not set # CONFIG_SATA_INIC162X is not set CONFIG_SATA_INTEL_COMBINED=y +CONFIG_SATA_ACPI=y # CONFIG_PATA_ALI is not set # CONFIG_PATA_AMD is not set # CONFIG_PATA_ARTOP is not set @@ -726,6 +736,7 @@ CONFIG_NETDEVICES=y # CONFIG_BONDING is not set # CONFIG_EQUALIZER is not set CONFIG_TUN=y +# CONFIG_NET_SB1000 is not set # # ARCnet devices @@ -920,6 +931,7 @@ CONFIG_HW_CONSOLE=y CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # 
CONFIG_SERIAL_8250_EXTENDED is not set @@ -1001,6 +1013,7 @@ CONFIG_I2C_ISA=m # CONFIG_I2C_NFORCE2 is not set # CONFIG_I2C_OCORES is not set # CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_PASEMI is not set # CONFIG_I2C_PROSAVAGE is not set # CONFIG_I2C_SAVAGE4 is not set # CONFIG_I2C_SIS5595 is not set @@ -1047,6 +1060,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADM1021 is not set # CONFIG_SENSORS_ADM1025 is not set # CONFIG_SENSORS_ADM1026 is not set +# CONFIG_SENSORS_ADM1029 is not set # CONFIG_SENSORS_ADM1031 is not set # CONFIG_SENSORS_ADM9240 is not set # CONFIG_SENSORS_K8TEMP is not set @@ -1090,6 +1104,11 @@ CONFIG_SENSORS_SMSC47B397=m # CONFIG_HWMON_DEBUG_CHIP is not set # +# Multifunction device drivers +# +# CONFIG_MFD_SM501 is not set + +# # Multimedia devices # # CONFIG_VIDEO_DEV is not set @@ -1103,7 +1122,7 @@ CONFIG_SENSORS_SMSC47B397=m # # Graphics support # -# CONFIG_FIRMWARE_EDID is not set +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # CONFIG_FB is not set # @@ -1114,7 +1133,6 @@ CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=256 CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y -# CONFIG_BACKLIGHT_LCD_SUPPORT is not set # # Sound @@ -1130,9 +1148,8 @@ CONFIG_SOUND=y # Open Sound System # CONFIG_SOUND_PRIME=y -CONFIG_OBSOLETE_OSS=y +# CONFIG_OBSOLETE_OSS is not set # CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_ES1371 is not set CONFIG_SOUND_ICH=y # CONFIG_SOUND_TRIDENT is not set # CONFIG_SOUND_MSNDCLAS is not set @@ -1263,6 +1280,7 @@ CONFIG_USB_MON=y # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set +# CONFIG_USB_BERRY_CHARGE is not set # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set @@ -1273,6 +1291,7 @@ CONFIG_USB_MON=y # CONFIG_USB_SISUSBVGA is not set # CONFIG_USB_LD is not set # CONFIG_USB_TRANCEVIBRATOR is not set +# CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set # @@ -1538,6 +1557,7 @@ CONFIG_DEBUG_KERNEL=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_SCHEDSTATS is not set +# CONFIG_TIMER_STATS is not set # CONFIG_DEBUG_SLAB is not set # CONFIG_DEBUG_RT_MUTEXES is not set # CONFIG_RT_MUTEX_TESTER is not set @@ -1553,9 +1573,12 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_LIST is not set # CONFIG_FRAME_POINTER is not set +CONFIG_UNWIND_INFO=y +CONFIG_STACK_UNWIND=y # CONFIG_FORCED_INLINING is not set # CONFIG_RCU_TORTURE_TEST is not set # CONFIG_LKDTM is not set +# CONFIG_FAULT_INJECTION is not set # CONFIG_DEBUG_RODATA is not set # CONFIG_IOMMU_DEBUG is not set CONFIG_DEBUG_STACKOVERFLOW=y Index: linux/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux.orig/arch/x86_64/ia32/ia32entry.S +++ linux/arch/x86_64/ia32/ia32entry.S @@ -120,7 +120,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -229,7 +231,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -323,8 +327,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -394,7 +400,7 
@@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -560,7 +566,7 @@ ia32_sys_call_table: .quad sys_sched_yield .quad sys_sched_get_priority_max .quad sys_sched_get_priority_min /* 160 */ - .quad sys_sched_rr_get_interval + .quad sys32_sched_rr_get_interval .quad compat_sys_nanosleep .quad sys_mremap .quad sys_setresuid16 @@ -719,4 +725,7 @@ ia32_sys_call_table: .quad compat_sys_move_pages .quad sys_getcpu .quad sys_epoll_pwait +#ifdef CONFIG_EVENT_TRACE + .globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux/arch/x86_64/kernel/Makefile =================================================================== --- linux.orig/arch/x86_64/kernel/Makefile +++ linux/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o trap ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o paravirt.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o therm_throt.o Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,26 @@ static cpumask_t timer_interrupt_broadca /* Using APIC to generate smp_local_timer_interrupt? */ int using_apic_timer __read_mostly = 0; + +static unsigned int calibration_result; + +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); + +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + static void apic_pm_activate(void); void enable_NMI_through_LVT0 (void * dummy) @@ -428,7 +449,6 @@ void __cpuinit setup_local_APIC (void) oldvalue, value); } - nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } @@ -734,12 +754,15 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value; int cpu = smp_processor_id(); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) lvtt_value |= APIC_LVT_MASKED; @@ -754,35 +777,23 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -static void setup_APIC_timer(unsigned int clocks) +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) { unsigned long flags; local_irq_save(flags); - - /* wait for 
irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); /* Turn off PIT interrupt if we use APIC timer as main timer. Only works with the PM timer right now TBD fix it for HPET too. */ @@ -793,9 +804,21 @@ static void setup_APIC_timer(unsigned in stop_timer_interrupt(); apic_runs_main_timer++; } + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_MODE_PERIODIC); local_irq_restore(flags); } + +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq @@ -815,12 +838,13 @@ static int __init calibrate_APIC_clock(v { int apic, apic_start, tsc, tsc_start; int result; + u64 wallclock_nsecs; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -828,6 +852,8 @@ static int __init calibrate_APIC_clock(v pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); result = (apic_start - apic) * 1000L / 5; + printk("using pmtimer for lapic calibration\n"); + wallclock_nsecs = 5000000; } else #endif { @@ -841,6 +867,8 @@ static int __init calibrate_APIC_clock(v result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + wallclock_nsecs = ((u64)tsc - (u64)tsc_start) * 1000000 / (u64)cpu_khz; + } printk("result %d\n", result); @@ -848,11 +876,22 @@ static int __init calibrate_APIC_clock(v printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); + + + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(apic_start - apic, wallclock_nsecs, 32); + + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -869,7 +908,7 @@ void __init setup_boot_APIC_clock (void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -877,7 +916,7 @@ void __init setup_boot_APIC_clock (void) void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? 
--RR */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -924,6 +963,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { disable_APIC_timer(); cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).features &= + ~CLOCK_EVT_FEAT_ONESHOT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -978,7 +1024,7 @@ void setup_APIC_extened_lvt(unsigned cha void smp_local_timer_interrupt(void) { - profile_tick(CPU_PROFILING); +// profile_tick(CPU_PROFILING); #ifdef CONFIG_SMP update_process_times(user_mode(get_irq_regs())); #endif @@ -1008,6 +1054,8 @@ void smp_apic_timer_interrupt(struct pt_ { struct pt_regs *old_regs = set_irq_regs(regs); + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. */ @@ -1025,7 +1073,7 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(); + evt->event_handler(evt); irq_exit(); set_irq_regs(old_regs); } Index: linux/arch/x86_64/kernel/cpufreq/Kconfig =================================================================== --- linux.orig/arch/x86_64/kernel/cpufreq/Kconfig +++ linux/arch/x86_64/kernel/cpufreq/Kconfig @@ -75,6 +75,7 @@ config X86_ACPI_CPUFREQ_PROC_INTF config X86_P4_CLOCKMOD tristate "Intel Pentium 4 clock modulation" depends on EMBEDDED + select CPU_FREQ_TABLE help This adds the clock modulation driver for Intel Pentium 4 / XEON processors. When enabled it will lower CPU temperature by skipping Index: linux/arch/x86_64/kernel/crash.c =================================================================== --- linux.orig/arch/x86_64/kernel/crash.c +++ linux/arch/x86_64/kernel/crash.c @@ -62,11 +62,6 @@ static int crash_nmi_callback(struct not return 1; } -static void smp_send_nmi_allbutself(void) -{ - send_IPI_allbutself(NMI_VECTOR); -} - /* * This code is a best effort heuristic to get the * other cpus to stop executing. 
So races with Index: linux/arch/x86_64/kernel/e820.c =================================================================== --- linux.orig/arch/x86_64/kernel/e820.c +++ linux/arch/x86_64/kernel/e820.c @@ -662,7 +662,7 @@ static int __init parse_memmap_opt(char } early_param("memmap", parse_memmap_opt); -void finish_e820_parsing(void) +void __init finish_e820_parsing(void) { if (userdef) { printk(KERN_INFO "user-defined physical RAM map:\n"); Index: linux/arch/x86_64/kernel/early-quirks.c =================================================================== --- linux.orig/arch/x86_64/kernel/early-quirks.c +++ linux/arch/x86_64/kernel/early-quirks.c @@ -16,7 +16,7 @@ #include #include -static void via_bugs(void) +static void __init via_bugs(void) { #ifdef CONFIG_IOMMU if ((end_pfn > MAX_DMA32_PFN || force_iommu) && @@ -36,7 +36,7 @@ static int __init nvidia_hpet_check(stru } #endif -static void nvidia_bugs(void) +static void __init nvidia_bugs(void) { #ifdef CONFIG_ACPI /* @@ -62,7 +62,7 @@ static void nvidia_bugs(void) } -static void ati_bugs(void) +static void __init ati_bugs(void) { if (timer_over_8254 == 1) { timer_over_8254 = 0; @@ -71,28 +71,15 @@ static void ati_bugs(void) } } -static void intel_bugs(void) -{ - u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); - -#ifdef CONFIG_SMP - if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || - device == PCI_DEVICE_ID_INTEL_E7520_MCH || - device == PCI_DEVICE_ID_INTEL_E7525_MCH) - quirk_intel_irqbalance(); -#endif -} - struct chipset { u16 vendor; void (*f)(void); }; -static struct chipset early_qrk[] = { +static struct __initdata chipset early_qrk[] = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, - { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; Index: linux/arch/x86_64/kernel/early_printk.c =================================================================== --- linux.orig/arch/x86_64/kernel/early_printk.c +++ linux/arch/x86_64/kernel/early_printk.c @@ -203,7 +203,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) 
{ - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux/arch/x86_64/kernel/entry.S =================================================================== --- linux.orig/arch/x86_64/kernel/entry.S +++ linux/arch/x86_64/kernel/entry.S @@ -53,6 +53,47 @@ .code64 +#ifdef CONFIG_EVENT_TRACE + +ENTRY(mcount) + cmpl $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -234,7 +275,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -267,8 +310,8 @@ sysret_check: /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -291,7 +334,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -316,7 +359,9 @@ tracesys: cmova %rcx,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ @@ -344,8 +389,8 @@ int_with_check: /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -380,7 +425,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -584,8 +629,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -611,7 +656,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check @@ -1158,3 +1203,36 @@ ENTRY(call_softirq) ret CFI_ENDPROC ENDPROC(call_softirq) + +#ifdef CONFIG_STACK_UNWIND +ENTRY(arch_unwind_init_running) + CFI_STARTPROC + movq %r15, R15(%rdi) + movq %r14, R14(%rdi) + xchgq %rsi, %rdx + movq %r13, R13(%rdi) + movq %r12, R12(%rdi) + xorl %eax, %eax + movq %rbp, RBP(%rdi) + movq %rbx, RBX(%rdi) + movq (%rsp), %rcx + movq %rax, R11(%rdi) + movq %rax, R10(%rdi) + movq %rax, R9(%rdi) + movq %rax, R8(%rdi) + movq %rax, RAX(%rdi) + movq %rax, RCX(%rdi) + movq %rax, RDX(%rdi) + movq %rax, RSI(%rdi) + movq %rax, RDI(%rdi) + movq %rax, ORIG_RAX(%rdi) + movq %rcx, RIP(%rdi) + leaq 8(%rsp), %rcx + movq $__KERNEL_CS, CS(%rdi) + movq %rax, EFLAGS(%rdi) + movq %rcx, RSP(%rdi) + movq $__KERNEL_DS, SS(%rdi) + jmpq *%rdx + CFI_ENDPROC +ENDPROC(arch_unwind_init_running) +#endif Index: linux/arch/x86_64/kernel/genapic.c =================================================================== --- linux.orig/arch/x86_64/kernel/genapic.c +++ linux/arch/x86_64/kernel/genapic.c @@ -11,120 +11,41 @@ #include #include #include +#include #include #include #include -#include #include #include -#if defined(CONFIG_ACPI) +#ifdef CONFIG_ACPI #include #endif /* which logical CPU number maps to which CPU (physical APIC ID) */ -u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly + = { [0 ... NR_CPUS-1] = BAD_APICID }; EXPORT_SYMBOL(x86_cpu_to_apicid); -u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -extern struct genapic apic_cluster; -extern struct genapic apic_flat; -extern struct genapic apic_physflat; +u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; -struct genapic *genapic = &apic_flat; -struct genapic *genapic_force; +struct genapic __read_mostly *genapic = &apic_flat; /* - * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + * Choose the APIC routing mode: */ -void __init clustered_apic_check(void) +void __init setup_apic_routing(void) { - long i; - u8 clusters, max_cluster; - u8 id; - u8 cluster_cnt[NUM_APIC_CLUSTERS]; - int max_apic = 0; - - /* genapic selection can be forced because of certain quirks. - */ - if (genapic_force) { - genapic = genapic_force; - goto print; - } - -#if defined(CONFIG_ACPI) - /* - * Some x86_64 machines use physical APIC mode regardless of how many - * procs/clusters are present (x86_64 ES7000 is an example). 
- */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) - if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { - genapic = &apic_cluster; - goto print; - } -#endif - - memset(cluster_cnt, 0, sizeof(cluster_cnt)); - for (i = 0; i < NR_CPUS; i++) { - id = bios_cpu_apicid[i]; - if (id == BAD_APICID) - continue; - if (id > max_apic) - max_apic = id; - cluster_cnt[APIC_CLUSTERID(id)]++; - } - - /* Don't use clustered mode on AMD platforms. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - genapic = &apic_physflat; -#ifndef CONFIG_HOTPLUG_CPU - /* In the CPU hotplug case we cannot use broadcast mode - because that opens a race when a CPU is removed. - Stay at physflat mode in this case. - It is bad to do this unconditionally though. Once - we have ACPI platform support for CPU hotplug - we should detect hotplug capablity from ACPI tables and - only do this when really needed. -AK */ - if (max_apic <= 8) - genapic = &apic_flat; -#endif - goto print; - } - - clusters = 0; - max_cluster = 0; - - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (cluster_cnt[i] > 0) { - ++clusters; - if (cluster_cnt[i] > max_cluster) - max_cluster = cluster_cnt[i]; - } - } - - /* - * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, - * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical - * else physical mode. - * (We don't use lowest priority delivery + HW APIC IRQ steering, so - * can ignore the clustered logical case and go straight to physical.) - */ - if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { -#ifdef CONFIG_HOTPLUG_CPU - /* Don't use APIC shortcuts in CPU hotplug to avoid races */ - genapic = &apic_physflat; -#else + if (cpus_weight(cpu_possible_map) <= 8) genapic = &apic_flat; -#endif - } else - genapic = &apic_cluster; + else + genapic = &apic_physflat; -print: printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); } -/* Same for both flat and clustered. */ +/* Same for both flat and physical. 
*/ void send_IPI_self(int vector) { Index: linux/arch/x86_64/kernel/head64.c =================================================================== --- linux.orig/arch/x86_64/kernel/head64.c +++ linux/arch/x86_64/kernel/head64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -51,7 +52,7 @@ static void __init copy_bootdata(char *r memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { int i; Index: linux/arch/x86_64/kernel/hpet.c =================================================================== --- linux.orig/arch/x86_64/kernel/hpet.c +++ linux/arch/x86_64/kernel/hpet.c @@ -18,7 +18,7 @@ /* FSEC = 10^-15 NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000 -int nohpet __initdata; +int nohpet __initdata = 1; unsigned long hpet_address; unsigned long hpet_period; /* fsecs / HPET clock */ @@ -35,7 +35,7 @@ static __init int late_hpet_init(void) unsigned int ntimer; if (!hpet_address) - return 0; + return 0; memset(&hd, 0, sizeof(hd)); @@ -112,7 +112,7 @@ int hpet_timer_stop_set_go(unsigned long return 0; } -static cycle_t read_hpet(void) +static cycle_t notrace read_hpet(void) { return (cycle_t)hpet_readl(HPET_COUNTER); } @@ -436,7 +436,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; Index: linux/arch/x86_64/kernel/i8259.c =================================================================== --- linux.orig/arch/x86_64/kernel/i8259.c +++ linux/arch/x86_64/kernel/i8259.c @@ -97,8 +97,8 @@ static void (*interrupt[NR_VECTORS - FIR */ static int i8259A_auto_eoi; -DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +DEFINE_RAW_SPINLOCK(i8259A_lock); static struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -396,7 +396,8 @@ device_initcall(i8259A_init_sysfs); * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; + DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [0 ... 
IRQ0_VECTOR - 1] = -1, [IRQ0_VECTOR] = 0, Index: linux/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/io_apic.c +++ linux/arch/x86_64/kernel/io_apic.c @@ -91,8 +91,8 @@ int timer_over_8254 __initdata = 1; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -179,6 +179,9 @@ static inline void io_apic_sync(unsigned reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -323,10 +326,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -778,9 +779,10 @@ static void ioapic_register_intr(int irq if (trigger) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, @@ -1657,7 +1659,6 @@ static inline void check_timer(void) */ unmask_IO_APIC_irq(0); if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); setup_nmi(); @@ -1683,7 +1684,6 @@ static inline void check_timer(void) setup_ExtINT_IRQ0_pin(apic2, pin2, cfg->vector); if (timer_irq_works()) { apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); } Index: linux/arch/x86_64/kernel/irq.c =================================================================== --- linux.orig/arch/x86_64/kernel/irq.c +++ linux/arch/x86_64/kernel/irq.c @@ -111,10 +111,18 @@ asmlinkage unsigned int do_IRQ(struct pt unsigned vector = ~regs->orig_rax; unsigned irq; + irq_show_regs_callback(smp_processor_id(), regs); + exit_idle(); irq_enter(); irq = __get_cpu_var(vector_irq)[vector]; +#ifdef CONFIG_EVENT_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); + #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif Index: linux/arch/x86_64/kernel/mpparse.c =================================================================== --- linux.orig/arch/x86_64/kernel/mpparse.c +++ linux/arch/x86_64/kernel/mpparse.c @@ -60,9 +60,9 @@ unsigned long mp_lapic_addr = 0; /* Processor that is doing the boot up */ unsigned int boot_cpu_id = -1U; /* Internal processor count */ -unsigned int num_processors __initdata = 0; +unsigned int num_processors __cpuinitdata = 0; -unsigned disabled_cpus __initdata; +unsigned disabled_cpus __cpuinitdata; /* Bitmask of physically existing CPUs */ physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE; @@ -300,7 +300,7 @@ static int __init smp_read_mpc(struct mp } } } - clustered_apic_check(); + setup_apic_routing(); if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return 
num_processors; Index: linux/arch/x86_64/kernel/nmi.c =================================================================== --- linux.orig/arch/x86_64/kernel/nmi.c +++ linux/arch/x86_64/kernel/nmi.c @@ -21,9 +21,11 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -183,7 +185,7 @@ static __cpuinit inline int nmi_known_cp } /* Run after command line and cpu_init init, but before all other checks */ -void nmi_watchdog_default(void) +static inline void nmi_watchdog_default(void) { if (nmi_watchdog != NMI_DEFAULT) return; @@ -199,7 +201,9 @@ static int endflag __initdata = 0; */ static __init void nmi_cpu_busy(void *data) { +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -281,7 +285,7 @@ int __init check_nmi_watchdog (void) if (nmi_watchdog == NMI_LOCAL_APIC) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); - nmi_hz = 1; + nmi_hz = 10000; if (wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR0) nmi_hz = adjust_for_32bit_ctr(nmi_hz); } @@ -814,15 +818,56 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (system_state == SYSTEM_BOOTING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + smp_send_nmi_allbutself(); + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + + for_each_online_cpu(i) { + while (nmi_show_regs[i] == 1) + barrier(); + } +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +notrace void irq_show_regs_callback(int cpu, struct pt_regs *regs) +{ + if (!nmi_show_regs[cpu]) + return; + + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); + printk("apic_timer_irqs: %d\n", read_pda(apic_timer_irqs)); + show_regs(regs); + spin_unlock(&nmi_print_lock); +} + int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; int cpu = smp_processor_id(); - struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); + struct nmi_watchdog_ctlblk *wd = &per_cpu(nmi_watchdog_ctlblk, cpu); u64 dummy; int rc=0; + irq_show_regs_callback(cpu, regs); + __profile_tick(CPU_PROFILING, regs); + /* check for other users first */ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { @@ -831,6 +876,7 @@ int __kprobes nmi_watchdog_tick(struct p } sum = read_pda(apic_timer_irqs); + if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -859,9 +905,20 @@ int __kprobes nmi_watchdog_tick(struct p * wait a few IRQs (5 seconds) before doing the oops ... 
*/ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for_each_online_cpu(i) { + if (i == cpu) + continue; + nmi_show_regs[i] = 1; + while (nmi_show_regs[i] == 1) + cpu_relax(); + } + + die_nmi("NMI Watchdog detected LOCKUP on CPU %d\n", regs, panic_on_timeout); + } } else { __get_cpu_var(last_irq_sum) = sum; local_set(&__get_cpu_var(alert_counter), 0); @@ -999,6 +1056,13 @@ void __trigger_all_cpu_backtrace(void) } } +void smp_send_nmi_allbutself(void) +{ +#ifdef CONFIG_SMP + send_IPI_allbutself(NMI_VECTOR); +#endif +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); Index: linux/arch/x86_64/kernel/paravirt.c =================================================================== --- /dev/null +++ linux/arch/x86_64/kernel/paravirt.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +struct paravirt_ops paravirt_ops; + +static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state); +static DEFINE_PER_CPU(struct kvm_cr3_cache, cr3_cache); + +/* + * This is the vm-syscall address - to be patched by the host to + * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model: + */ +asm ( + " .globl hypercall_addr \n" + " .align 4 \n" + " hypercall_addr: \n" + " movl $-38, %eax \n" + " ret \n" +); + +extern unsigned char hypercall_addr[6]; + +static void test_hypercall(void) +{ + int ret; + unsigned long dummy = 123456; + + ret = hypercall(1, __NR_hypercall_test, dummy); + pr_debug("hypercall test #1, ret: 0x%x\n", ret); +} + +int kvm_guest_register_para(int cpu) +{ + struct kvm_vcpu_para_state *para_state = &per_cpu(para_state, cpu); + struct kvm_cr3_cache *cr3_cache = &per_cpu(cr3_cache, cpu); + + printk(KERN_DEBUG "kvm guest on VCPU#%d: trying to register para_state %p\n", + cpu, para_state); + /* + * Try to write to a magic MSR (which is invalid on any real CPU), + * and thus signal to KVM that we wish to enter paravirtualized + * mode: + */ + para_state->guest_version = KVM_PARA_API_VERSION; + para_state->host_version = -1; + para_state->size = sizeof(*para_state); + para_state->ret = -1; + para_state->hypercall_gpa = __pa(hypercall_addr); + cr3_cache->entry_count = KVM_CR3_CACHE_SIZE; + para_state->cr3_cache_gpa = __pa(cr3_cache); + + if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) { + printk(KERN_INFO "KVM guest: WRMSR probe failed.\n"); + return 0; + } + + printk(KERN_DEBUG "kvm guest: host returned %d\n", para_state->ret); + printk(KERN_DEBUG "kvm guest: host version: %d\n", para_state->host_version); + printk(KERN_DEBUG "kvm guest: cr3 cache size: %d\n", cr3_cache->entry_count); + printk(KERN_DEBUG "kvm guest: syscall entry: %02x %02x %02x %02x\n", + hypercall_addr[0], hypercall_addr[1], + hypercall_addr[2], hypercall_addr[3]); + if (para_state->ret) { + printk(KERN_ERR "kvm guest: host refused registration.\n"); + return 0; + } + test_hypercall(); + + return 1; +} + +asmlinkage void __init kvm_probe(void) +{ + int paravirt; + + printk(KERN_DEBUG "KVM paravirtualization probe\n"); + + paravirt = kvm_guest_register_para(smp_processor_id()); + if (!paravirt) { + printk(KERN_INFO "Not a KVM para-guest\n"); + return; + } + + printk(KERN_INFO "KVM para-guest: OK\n"); + + paravirt_ops.paravirt_enabled = 1; +} Index: linux/arch/x86_64/kernel/pci-gart.c =================================================================== ---
linux.orig/arch/x86_64/kernel/pci-gart.c +++ linux/arch/x86_64/kernel/pci-gart.c @@ -675,7 +675,7 @@ void __init gart_iommu_init(void) dma_ops = &gart_dma_ops; } -void gart_parse_options(char *p) +void __init gart_parse_options(char *p) { int arg; Index: linux/arch/x86_64/kernel/process.c =================================================================== --- linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -115,7 +116,7 @@ static void default_idle(void) */ smp_mb(); local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { /* Enables interrupts one instruction before HLT. x86 special cases this so there is no race. */ safe_halt(); @@ -201,7 +202,8 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + tick_nohz_stop_sched_tick(); + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -227,9 +229,12 @@ void cpu_idle (void) __exit_idle(); } - preempt_enable_no_resched(); - schedule(); + tick_nohz_restart_sched_tick(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -245,10 +250,10 @@ void cpu_idle (void) */ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __mwait(eax, ecx); } } @@ -256,10 +261,10 @@ void mwait_idle_with_hints(unsigned long /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) __sti_mwait(0, 0); else local_irq_enable(); @@ -365,7 +370,7 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; @@ -373,6 +378,7 @@ void exit_thread(void) /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); @@ -382,14 +388,17 @@ void exit_thread(void) void flush_thread(void) { struct task_struct *tsk = current; - struct thread_info *t = current_thread_info(); - if (t->flags & _TIF_ABI_PENDING) { - t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32); - if (t->flags & _TIF_IA32) + if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { + clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); + if (test_tsk_thread_flag(tsk, TIF_IA32)) { + clear_tsk_thread_flag(tsk, TIF_IA32); + } else { + set_tsk_thread_flag(tsk, TIF_IA32); current_thread_info()->status |= TS_COMPAT; + } } - t->flags &= ~_TIF_DEBUG; + clear_tsk_thread_flag(tsk, TIF_DEBUG); tsk->thread.debugreg0 = 0; tsk->thread.debugreg1 = 0; Index: linux/arch/x86_64/kernel/reboot.c =================================================================== --- linux.orig/arch/x86_64/kernel/reboot.c +++ linux/arch/x86_64/kernel/reboot.c @@ -114,8 +114,23 @@ void machine_shutdown(void) void machine_emergency_restart(void) { + unsigned long ecx = 
cpuid_ecx(1); int i; + /* + * Disable any possibly active VT context (if VT supported): + */ + if (test_bit(5, &ecx)) { /* has VT support */ + asm volatile ( + "1: .byte 0x0f, 0x01, 0xc4 \n" /* vmxoff */ + "2: \n" + ".section __ex_table,\"a\" \n" + " .align 8 \n" + " .quad 1b,2b \n" + ".previous \n" + ); + } + /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -114,7 +114,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -188,7 +188,7 @@ unsigned long kernel_eflags; * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); Index: linux/arch/x86_64/kernel/signal.c =================================================================== --- linux.orig/arch/x86_64/kernel/signal.c +++ linux/arch/x86_64/kernel/signal.c @@ -396,6 +396,13 @@ static void do_signal(struct pt_regs *re int signr; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/x86_64/kernel/smp.c =================================================================== --- linux.orig/arch/x86_64/kernel/smp.c +++ linux/arch/x86_64/kernel/smp.c @@ -57,7 +57,7 @@ union smp_flush_state { struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -296,10 +296,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux/arch/x86_64/kernel/smpboot.c =================================================================== --- linux.orig/arch/x86_64/kernel/smpboot.c +++ linux/arch/x86_64/kernel/smpboot.c @@ -60,7 +60,6 @@ #include #include #include -#include /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -321,7 +320,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP @@ -336,8 +335,8 @@ void __cpuinit start_secondary(void) barrier(); /* - * Check TSC sync first: - */ + * Check TSC sync first: + */ check_tsc_sync_target(); Dprintk("cpu %d: setting up apic clock\n", smp_processor_id()); @@ -869,7 +868,6 @@ static int __init smp_sanity_check(unsig */ void __init smp_prepare_cpus(unsigned int max_cpus) { - nmi_watchdog_default(); current_cpu_data = boot_cpu_data; current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); @@ -965,13 +963,6 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); - - if (num_online_cpus() > 8 && genapic == &apic_flat) { - printk(KERN_WARNING - "flat APIC routing can't be used with > 8 cpus\n"); - BUG(); - } - err = 0; return err; Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -42,16 +42,19 @@ #include #include #include +#include +#include #include extern void i8254_timer_resume(void); extern int using_apic_timer; +extern struct clock_event_device pit_clockevent; static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -146,49 +149,7 @@ static void set_rtc_mmss(unsigned long n void main_timer_handler(void) { - static unsigned long rtc_update = 0; -/* - * Here we are in the timer irq handler. We have irqs locally disabled (so we - * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running - * on the other CPU, so we need a lock. We also need to lock the vsyscall - * variables, because both do_timer() and us change them -arca+vojtech - */ - - write_seqlock(&xtime_lock); - -/* - * Do the timer stuff. - */ - - do_timer(1); -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - - if (!using_apic_timer) - smp_local_timer_interrupt(); - -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. 
- */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - - write_sequnlock(&xtime_lock); + pit_clockevent.event_handler(&pit_clockevent); } static irqreturn_t timer_interrupt(int irq, void *dev_id) @@ -296,6 +257,75 @@ static void __init __pit_init(int val, u spin_unlock_irqrestore(&i8253_lock, flags); } +static void init_pit_timer(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + case CLOCK_EVT_MODE_UNUSED: + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + disable_irq(0); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static int pit_next_event(unsigned long delta, struct clock_event_device *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); + + return 0; +} + +struct clock_event_device pit_clockevent = { + .name = "pit", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, + .irq = 0, +}; + +void setup_pit_timer(void) +{ + /* + * Start pit with the boot cpu mask and make it global after the + * IO_APIC has been initialized. 
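+ * Rough arithmetic (assuming the generic clockevents helpers): with
+ * shift = 32, div_sc() computes mult = (CLOCK_TICK_RATE << 32) / NSEC_PER_SEC,
+ * so a nanosecond delta converts to PIT ticks as ticks = (ns * mult) >> 32;
+ * clockevent_delta2ns() applies the inverse to the 0x7FFF and 0xF tick
+ * limits below.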
+ */ + pit_clockevent.cpumask = cpumask_of_cpu(0); + + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + clockevents_register_device(&pit_clockevent); +} + void __init pit_init(void) { __pit_init(LATCH, 0x34); /* binary, mode 2, LSB/MSB, ch 0 */ @@ -320,7 +350,7 @@ void __init stop_timer_interrupt(void) } static struct irqaction irq0 = { - timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; void __init time_init(void) @@ -336,6 +366,8 @@ void __init time_init(void) if (hpet_arch_init()) hpet_address = 0; + setup_pit_timer(); + if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; @@ -386,30 +418,11 @@ static int timer_suspend(struct sys_devi static int timer_resume(struct sys_device *dev) { - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } if (hpet_address) hpet_reenable(); else i8254_timer_resume(); - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } Index: linux/arch/x86_64/kernel/traps.c =================================================================== --- linux.orig/arch/x86_64/kernel/traps.c +++ linux/arch/x86_64/kernel/traps.c @@ -71,19 +71,19 @@ asmlinkage void alignment_check(void); asmlinkage void machine_check(void); asmlinkage void spurious_interrupt_bug(void); -ATOMIC_NOTIFIER_HEAD(die_chain); +RAW_NOTIFIER_HEAD(die_chain); EXPORT_SYMBOL(die_chain); int register_die_notifier(struct notifier_block *nb) { vmalloc_sync_all(); - return atomic_notifier_chain_register(&die_chain, nb); + return raw_notifier_chain_register(&die_chain, nb); } EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ int unregister_die_notifier(struct notifier_block *nb) { - return atomic_notifier_chain_unregister(&die_chain, nb); + return raw_notifier_chain_unregister(&die_chain, nb); } EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ @@ -110,6 +110,11 @@ static inline void preempt_conditional_c } int kstack_depth_to_print = 12; +#ifdef CONFIG_STACK_UNWIND +static int call_trace = 1; +#else +#define call_trace (-1) +#endif #ifdef CONFIG_KALLSYMS void printk_address(unsigned long address) @@ -212,6 +217,32 @@ static unsigned long *in_exception_stack return NULL; } +struct ops_and_data { + struct stacktrace_ops *ops; + void *data; +}; + +static int dump_trace_unwind(struct unwind_frame_info *info, void *context) +{ + struct ops_and_data *oad = (struct ops_and_data *)context; + int n = 0; + unsigned long sp = UNW_SP(info); + + if (arch_unw_user_mode(info)) + return -1; + while (unwind(info) == 0 && UNW_PC(info)) { + n++; + oad->ops->address(oad->data, UNW_PC(info)); + if (arch_unw_user_mode(info)) + break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); + } + return 
n; +} + #define MSG(txt) ops->warning(data, txt) /* @@ -231,7 +262,7 @@ void dump_trace(struct task_struct *tsk, unsigned long *stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); + const unsigned cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -239,6 +270,40 @@ void dump_trace(struct task_struct *tsk, if (!tsk) tsk = current; + if (call_trace >= 0) { + int unw_ret = 0; + struct unwind_frame_info info; + struct ops_and_data oad = { .ops = ops, .data = data }; + + if (regs) { + if (unwind_init_frame_info(&info, tsk, regs) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } else if (tsk == current) + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); + else { + if (unwind_init_blocked(&info, tsk) == 0) + unw_ret = dump_trace_unwind(&info, &oad); + } + if (unw_ret > 0) { + if (call_trace == 1 && !arch_unw_user_mode(&info)) { + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s", + UNW_PC(&info)); + if ((long)UNW_SP(&info) < 0) { + MSG("Leftover inexact backtrace:"); + stack = (unsigned long *)UNW_SP(&info); + if (!stack) + goto out; + } else + MSG("Full inexact backtrace again:"); + } else if (call_trace >= 1) + goto out; + else + MSG("Full inexact backtrace again:"); + } else + MSG("Inexact backtrace:"); + } if (!stack) { unsigned long dummy; stack = &dummy; @@ -322,7 +387,8 @@ void dump_trace(struct task_struct *tsk, tinfo = task_thread_info(tsk); HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK - put_cpu(); +out: + ; } EXPORT_SYMBOL(dump_trace); @@ -359,9 +425,15 @@ static struct stacktrace_ops print_trace void show_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long *stack) { + if (pause_on_oops_head) + for(;;); printk("\nCall Trace:\n"); dump_trace(tsk, regs, stack, &print_trace_ops, NULL); printk("\n"); + if (pause_on_oops_tail) + for(;;); + debug_show_held_locks(tsk); + print_traces(tsk); } static void @@ -369,7 +441,7 @@ _show_stack(struct task_struct *tsk, str { unsigned long *stack; int i; - const int cpu = smp_processor_id(); + const int cpu = raw_smp_processor_id(); unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); @@ -477,19 +549,20 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; static unsigned int die_nest_count; unsigned __kprobes long oops_begin(void) { - int cpu = smp_processor_id(); unsigned long flags; + int cpu; oops_enter(); /* racy, but better than risking deadlock. */ local_irq_save(flags); + cpu = smp_processor_id(); if (!spin_trylock(&die_lock)) { if (cpu == die_owner) /* nested oops. 
should stop eventually */; @@ -1122,3 +1195,21 @@ static int __init kstack_setup(char *s) return 0; } early_param("kstack", kstack_setup); + +#ifdef CONFIG_STACK_UNWIND +static int __init call_trace_setup(char *s) +{ + if (!s) + return -EINVAL; + if (strcmp(s, "old") == 0) + call_trace = -1; + else if (strcmp(s, "both") == 0) + call_trace = 0; + else if (strcmp(s, "newfallback") == 0) + call_trace = 1; + else if (strcmp(s, "new") == 0) + call_trace = 2; + return 0; +} +early_param("call_trace", call_trace_setup); +#endif Index: linux/arch/x86_64/kernel/tsc.c =================================================================== --- linux.orig/arch/x86_64/kernel/tsc.c +++ linux/arch/x86_64/kernel/tsc.c @@ -130,7 +130,11 @@ static int __init cpufreq_tsc(void) return 0; } -core_initcall(cpufreq_tsc); +/* + * init_cpufreq_transition_notifier_list() should execute first, + * which is a core_initcall, so mark this one core_initcall_sync: + */ +core_initcall_sync(cpufreq_tsc); #endif @@ -149,19 +153,27 @@ __cpuinit int unsynchronized_tsc(void) if (apic_is_clustered_box()) return 1; #endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + /* Most intel systems have synchronized TSCs except for + multi node systems */ + #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) return 1; #endif - return 0; + return 0; + + case X86_VENDOR_AMD: + /* ??? C states */ + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; + break; } - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; } int __init notsc_setup(char *s) @@ -174,13 +186,13 @@ __setup("notsc", notsc_setup); /* clock source code: */ -static cycle_t read_tsc(void) +static notrace cycle_t read_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; } -static cycle_t __vsyscall_fn vread_tsc(void) +static notrace cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)get_cycles_sync(); return ret; Index: linux/arch/x86_64/kernel/tsc_sync.c =================================================================== --- linux.orig/arch/x86_64/kernel/tsc_sync.c +++ linux/arch/x86_64/kernel/tsc_sync.c @@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count * we want to have the fastest, inlined, non-debug version * of a critical section, to be able to prove TSC time-warps: */ -static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; +static __cpuinitdata __raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -97,6 +97,7 @@ static __cpuinit void check_tsc_warp(voi */ void __cpuinit check_tsc_sync_source(int cpu) { + unsigned long flags; int cpus = 2; /* @@ -117,8 +118,11 @@ void __cpuinit check_tsc_sync_source(int /* * Wait for the target to arrive: */ + local_save_flags(flags); + local_irq_enable(); while (atomic_read(&start_count) != cpus-1) cpu_relax(); + local_irq_restore(flags); /* * Trigger the target to continue into the measurement too: */ Index: linux/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux/arch/x86_64/kernel/vmlinux.lds.S 
@@ -219,7 +219,9 @@ SECTIONS /* Sections to be discarded */ /DISCARD/ : { *(.exitcall.exit) +#ifndef CONFIG_UNWIND_INFO *(.eh_frame) +#endif } STABS_DEBUG Index: linux/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.orig/arch/x86_64/kernel/vsyscall.c +++ linux/arch/x86_64/kernel/vsyscall.c @@ -43,11 +43,11 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace struct vsyscall_gtod_data_t { - seqlock_t lock; + raw_seqlock_t lock; int sysctl_enabled; struct timeval wall_time_tv; struct timezone sys_tz; @@ -58,7 +58,7 @@ int __vgetcpu_mode __section_vgetcpu_mod struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { - .lock = SEQLOCK_UNLOCKED, + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -107,6 +107,22 @@ static __always_inline void do_vgettimeo cycle_t now, base, mask, cycle_delta; unsigned long seq, mult, shift, nsec_delta; cycle_t (*vread)(void); + + if (likely(__vsyscall_gtod_data.sysctl_enabled == 2)) { + struct timeval tmp; + + do { + barrier(); + *tv = __vsyscall_gtod_data.wall_time_tv; + barrier(); + tmp = __vsyscall_gtod_data.wall_time_tv; + + } while (tmp.tv_usec != tv->tv_usec || + tmp.tv_sec != tv->tv_sec); + + return; + } + do { seq = read_seqbegin(&__vsyscall_gtod_data.lock); @@ -151,11 +167,19 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { + time_t secs; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - else if (t) - *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; - return __vsyscall_gtod_data.wall_time_tv.tv_sec; + + /* + * Make sure that what we return is the same number we + * write: + */ + secs = __vsyscall_gtod_data.wall_time_tv.tv_sec; + if (t) + *t = secs; + return secs; } /* Fast way to get current CPU and node. @@ -180,7 +204,7 @@ vgetcpu(unsigned *cpu, unsigned *node, s We do this here because otherwise user space would do it on its own in a likely inferior way (no access to jiffies). If you don't like it pass NULL. 
*/ - if (tcache && tcache->blob[0] == (j = __jiffies)) { + if (tcache && tcache->blob[0] == (j = jiffies)) { p = tcache->blob[1]; } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ Index: linux/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux.orig/arch/x86_64/kernel/x8664_ksyms.c +++ linux/arch/x86_64/kernel/x8664_ksyms.c @@ -11,10 +11,12 @@ EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); @@ -59,3 +61,4 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); EXPORT_SYMBOL(load_gs_index); +EXPORT_SYMBOL(_proxy_pda); Index: linux/arch/x86_64/lib/thunk.S =================================================================== --- linux.orig/arch/x86_64/lib/thunk.S +++ linux/arch/x86_64/lib/thunk.S @@ -40,11 +40,13 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax __down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS thunk trace_hardirqs_on_thunk,trace_hardirqs_on Index: linux/arch/x86_64/mm/fault.c =================================================================== --- linux.orig/arch/x86_64/mm/fault.c +++ linux/arch/x86_64/mm/fault.c @@ -381,7 +381,7 @@ asmlinkage void __kprobes do_page_fault( * If we're in an interrupt or have no user * context, we must not take the fault.. 
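 * (On PREEMPT_RT, pagefault_disable() presumably flags the task via
 * ->pagefault_disabled rather than raising the preempt count, hence
 * the extra per-task check added below.)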
*/ - if (unlikely(in_atomic() || !mm)) + if (unlikely(in_atomic() || !mm || current->pagefault_disabled)) goto bad_area_nosemaphore; again: Index: linux/arch/x86_64/mm/init.c =================================================================== --- linux.orig/arch/x86_64/mm/init.c +++ linux/arch/x86_64/mm/init.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the Index: linux/block/ll_rw_blk.c =================================================================== --- linux.orig/block/ll_rw_blk.c +++ linux/block/ll_rw_blk.c @@ -1540,7 +1540,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1563,7 +1563,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -1655,7 +1655,7 @@ static void blk_unplug_timeout(unsigned **/ void blk_start_queue(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); Index: linux/drivers/acpi/ec.c =================================================================== --- linux.orig/drivers/acpi/ec.c +++ linux/drivers/acpi/ec.c @@ -441,7 +441,16 @@ static u32 acpi_ec_gpe_handler(void *dat struct acpi_ec *ec = (struct acpi_ec *)data; atomic_inc(&ec->event_count); if (acpi_ec_mode == EC_INTR) { +#if 0 wake_up(&ec->wait); +#else + // hack ... 
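+ // (presumed reason for the hack: wake_up() would take the waitqueue
+ // spinlock, which is a sleeping lock on PREEMPT_RT and thus unsafe
+ // from this GPE handler context; waking the first waiter's task
+ // directly avoids taking that lock)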
+ if (waitqueue_active(&ec->wait)) { + struct task_struct *task = list_entry(ec->wait.task_list.next, wait_queue_t, task_list)->private; + if (task) + wake_up_process(task); + } +#endif } value = acpi_ec_read_status(ec); Index: linux/drivers/acpi/hardware/hwregs.c =================================================================== --- linux.orig/drivers/acpi/hardware/hwregs.c +++ linux/drivers/acpi/hardware/hwregs.c @@ -73,7 +73,7 @@ acpi_status acpi_hw_clear_acpi_status(vo ACPI_BITMASK_ALL_FIXED_STATUS, (u16) acpi_gbl_FADT.xpm1a_event_block.address)); - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); status = acpi_hw_register_write(ACPI_MTX_DO_NOT_LOCK, ACPI_REGISTER_PM1_STATUS, @@ -98,7 +98,7 @@ acpi_status acpi_hw_clear_acpi_status(vo status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); return_ACPI_STATUS(status); } @@ -331,7 +331,7 @@ acpi_status acpi_set_register(u32 regist return_ACPI_STATUS(AE_BAD_PARAMETER); } - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); /* Always do a register read first so we can insert the new bits */ @@ -441,7 +441,7 @@ acpi_status acpi_set_register(u32 regist unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); /* Normalize the value that was read */ @@ -481,7 +481,7 @@ acpi_hw_register_read(u8 use_lock, u32 r ACPI_FUNCTION_TRACE(hw_register_read); if (ACPI_MTX_LOCK == use_lock) { - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); } switch (register_id) { @@ -560,7 +560,7 @@ acpi_hw_register_read(u8 use_lock, u32 r unlock_and_exit: if (ACPI_MTX_LOCK == use_lock) { - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); } if (ACPI_SUCCESS(status)) { @@ -606,7 +606,7 @@ acpi_status acpi_hw_register_write(u8 us ACPI_FUNCTION_TRACE(hw_register_write); if (ACPI_MTX_LOCK == use_lock) { - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + spin_lock_irqsave(acpi_gbl_hardware_lock, lock_flags); } switch (register_id) { @@ -730,7 +730,7 @@ acpi_status acpi_hw_register_write(u8 us unlock_and_exit: if (ACPI_MTX_LOCK == use_lock) { - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + spin_unlock_irqrestore(acpi_gbl_hardware_lock, lock_flags); } return_ACPI_STATUS(status); Index: linux/drivers/acpi/osl.c =================================================================== --- linux.orig/drivers/acpi/osl.c +++ linux/drivers/acpi/osl.c @@ -704,13 +704,13 @@ void acpi_os_delete_lock(acpi_spinlock h acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -733,7 +733,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if 
(!sem) @@ -761,7 +761,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; @@ -848,7 +848,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem || (units < 1)) Index: linux/drivers/acpi/processor_idle.c =================================================================== --- linux.orig/drivers/acpi/processor_idle.c +++ linux/drivers/acpi/processor_idle.c @@ -232,7 +232,7 @@ static void acpi_safe_halt(void) * test NEED_RESCHED: */ smp_mb(); - if (!need_resched()) + if (!need_resched() && !need_resched_delayed()) safe_halt(); current_thread_info()->status |= TS_POLLING; } @@ -443,7 +443,7 @@ static void acpi_processor_idle(void) * test NEED_RESCHED: */ smp_mb(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { current_thread_info()->status |= TS_POLLING; local_irq_enable(); return; Index: linux/drivers/acpi/sleep/main.c =================================================================== --- linux.orig/drivers/acpi/sleep/main.c +++ linux/drivers/acpi/sleep/main.c @@ -35,6 +35,14 @@ static u32 acpi_suspend_states[] = { static int init_8259A_after_S1; +/* + * simulate entry into the BIOS - this way the system will not + * be turned off for real, and the kernel's resume functionality + * can be debugged while still having some system capabilities + * left. This is especially useful in combination with /sys/power/filter. + */ +int acpi_simulate_suspend_to_ram; + /** * acpi_pm_prepare - Do preliminary suspend work. * @pm_state: suspend state we're entering. @@ -91,7 +99,12 @@ static int acpi_pm_enter(suspend_state_t break; case PM_SUSPEND_MEM: - do_suspend_lowlevel(); + if (unlikely(acpi_simulate_suspend_to_ram)) { + printk(KERN_INFO "ACPI: simulating suspend-to-RAM: " + "not calling BIOS.\n"); + } else { + do_suspend_lowlevel(); + } break; case PM_SUSPEND_DISK: Index: linux/drivers/acpi/utilities/utmutex.c =================================================================== --- linux.orig/drivers/acpi/utilities/utmutex.c +++ linux/drivers/acpi/utilities/utmutex.c @@ -116,7 +116,7 @@ void acpi_ut_mutex_terminate(void) /* Delete the spinlocks */ acpi_os_delete_lock(acpi_gbl_gpe_lock); - acpi_os_delete_lock(acpi_gbl_hardware_lock); +// acpi_os_delete_lock(acpi_gbl_hardware_lock); return_VOID; } Index: linux/drivers/ata/Kconfig =================================================================== --- linux.orig/drivers/ata/Kconfig +++ linux/drivers/ata/Kconfig @@ -564,7 +564,7 @@ config PATA_IXP4XX_CF config PATA_SCC tristate "Toshiba's Cell Reference Set IDE support" - depends on PCI && PPC_IBM_CELL_BLADE + depends on PCI && PPC_CELLEB help This option enables support for the built-in IDE controller on Toshiba Cell Reference Board. 
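[ usage sketch for the acpi_simulate_suspend_to_ram flag introduced in drivers/acpi/sleep/main.c above -- assuming the usual boot-parameter wiring that accompanies it elsewhere in this patch: boot with

	acpi_simulate_suspend_to_ram=1

and then trigger a suspend:

	echo mem > /sys/power/state

the suspend path then logs "ACPI: simulating suspend-to-RAM: not calling BIOS." and runs the kernel's resume code without ever entering the BIOS, which is what makes it useful for debugging resume. ]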
Index: linux/drivers/ata/libata-core.c =================================================================== --- linux.orig/drivers/ata/libata-core.c +++ linux/drivers/ata/libata-core.c @@ -826,7 +826,7 @@ static u64 ata_id_n_sectors(const u16 *i /** * ata_id_to_dma_mode - Identify DMA mode from id block * @dev: device to identify - * @mode: mode to assume if we cannot tell + * @unknown: mode to assume if we cannot tell * * Set up the timing values for the device based upon the identify * reported values for the DMA mode. This function is used by drivers Index: linux/drivers/ata/libata-eh.c =================================================================== --- linux.orig/drivers/ata/libata-eh.c +++ linux/drivers/ata/libata-eh.c @@ -1625,8 +1625,14 @@ static int ata_eh_reset(struct ata_port rc = prereset(ap); if (rc) { if (rc == -ENOENT) { - ata_port_printk(ap, KERN_DEBUG, "port disabled. ignoring.\n"); + ata_port_printk(ap, KERN_DEBUG, + "port disabled. ignoring.\n"); ap->eh_context.i.action &= ~ATA_EH_RESET_MASK; + + for (i = 0; i < ATA_MAX_DEVICES; i++) + classes[i] = ATA_DEV_NONE; + + rc = 0; } else ata_port_printk(ap, KERN_ERR, "prereset failed (errno=%d)\n", rc); Index: linux/drivers/ata/pata_ixp4xx_cf.c =================================================================== --- linux.orig/drivers/ata/pata_ixp4xx_cf.c +++ linux/drivers/ata/pata_ixp4xx_cf.c @@ -193,7 +193,7 @@ static __devinit int ixp4xx_pata_probe(s irq = platform_get_irq(pdev, 0); if (irq) - set_irq_type(irq, IRQT_HIGH); + set_irq_type(irq, IRQT_RISING); /* Setup expansion bus chip selects */ *data->cs0_cfg = data->cs0_bits; @@ -232,7 +232,6 @@ static __devexit int ixp4xx_pata_remove( struct ata_host *host = platform_get_drvdata(dev); ata_host_detach(host); - platform_set_drvdata(dev, NULL); return 0; } Index: linux/drivers/ata/pata_winbond.c =================================================================== --- linux.orig/drivers/ata/pata_winbond.c +++ linux/drivers/ata/pata_winbond.c @@ -36,7 +36,7 @@ static int probe_winbond = 1; static int probe_winbond; #endif -static spinlock_t winbond_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(winbond_lock); static void winbond_writecfg(unsigned long port, u8 reg, u8 val) { Index: linux/drivers/ata/sata_inic162x.c =================================================================== --- linux.orig/drivers/ata/sata_inic162x.c +++ linux/drivers/ata/sata_inic162x.c @@ -672,10 +672,6 @@ static int inic_init_one(struct pci_dev if (rc) return rc; - rc = pci_request_regions(pdev, DRV_NAME); - if (rc) - return rc; - rc = pcim_iomap_regions(pdev, 0x3f, DRV_NAME); if (rc) return rc; Index: linux/drivers/ata/sata_sil24.c =================================================================== --- linux.orig/drivers/ata/sata_sil24.c +++ linux/drivers/ata/sata_sil24.c @@ -346,6 +346,7 @@ static const struct pci_device_id sil24_ { PCI_VDEVICE(CMD, 0x3124), BID_SIL3124 }, { PCI_VDEVICE(INTEL, 0x3124), BID_SIL3124 }, { PCI_VDEVICE(CMD, 0x3132), BID_SIL3132 }, + { PCI_VDEVICE(CMD, 0x0242), BID_SIL3132 }, { PCI_VDEVICE(CMD, 0x3131), BID_SIL3131 }, { PCI_VDEVICE(CMD, 0x3531), BID_SIL3131 }, Index: linux/drivers/base/power/suspend.c =================================================================== --- linux.orig/drivers/base/power/suspend.c +++ linux/drivers/base/power/suspend.c @@ -78,7 +78,7 @@ int suspend_device(struct device * dev, suspend_report_result(dev->class->suspend, error); } - if (!error && dev->bus && dev->bus->suspend && !dev->power.power_state.event) { + if (!error && 
!dev->no_suspend && dev->bus && dev->bus->suspend && !dev->power.power_state.event) { dev_dbg(dev, "%s%s\n", suspend_verb(state.event), ((state.event == PM_EVENT_SUSPEND) Index: linux/drivers/base/power/sysfs.c =================================================================== --- linux.orig/drivers/base/power/sysfs.c +++ linux/drivers/base/power/sysfs.c @@ -141,12 +141,42 @@ wake_store(struct device * dev, struct d static DEVICE_ATTR(wakeup, 0644, wake_show, wake_store); +static ssize_t can_suspend_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", dev->no_suspend ? "no" : "yes"); +} + +static ssize_t can_suspend_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t n) +{ + if (!n) + return -EINVAL; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + dev->no_suspend = 0; + break; + case 'n': + case 'N': + case '0': + dev->no_suspend = 1; + break; + } + + return n; +} +static DEVICE_ATTR(can_suspend, 0644, can_suspend_show, can_suspend_store); static struct attribute * power_attrs[] = { #ifdef CONFIG_PM_SYSFS_DEPRECATED &dev_attr_state.attr, #endif &dev_attr_wakeup.attr, + &dev_attr_can_suspend.attr, NULL, }; static struct attribute_group pm_attr_group = { Index: linux/drivers/block/Kconfig =================================================================== --- linux.orig/drivers/block/Kconfig +++ linux/drivers/block/Kconfig @@ -453,6 +453,13 @@ config ATA_OVER_ETH This driver provides Support for ATA over Ethernet block devices like the Coraid EtherDrive (R) Storage Blade. +config KVM_BLOCK + tristate "KVM virtual block device" + depends on KVM + help + This driver provides support for a virtual block device + for the KVM virtualization component. + endmenu endif Index: linux/drivers/block/Makefile =================================================================== --- linux.orig/drivers/block/Makefile +++ linux/drivers/block/Makefile @@ -28,4 +28,4 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryp obj-$(CONFIG_VIODASD) += viodasd.o obj-$(CONFIG_BLK_DEV_SX8) += sx8.o obj-$(CONFIG_BLK_DEV_UB) += ub.o - +obj-$(CONFIG_KVM_BLOCK) += kvm-block.o Index: linux/drivers/block/floppy.c =================================================================== --- linux.orig/drivers/block/floppy.c +++ linux/drivers/block/floppy.c @@ -4157,6 +4157,28 @@ static void floppy_device_release(struct complete(&device_release); } +static int floppy_suspend(struct platform_device *dev, pm_message_t state) +{ + floppy_release_irq_and_dma(); + + return 0; +} + +static int floppy_resume(struct platform_device *dev) +{ + floppy_grab_irq_and_dma(); + + return 0; +} + +static struct platform_driver floppy_driver = { + .suspend = floppy_suspend, + .resume = floppy_resume, + .driver = { + .name = "floppy", + }, +}; + static struct platform_device floppy_device[N_DRIVE]; static struct kobject *floppy_find(dev_t dev, int *part, void *data) @@ -4205,10 +4227,14 @@ static int __init floppy_init(void) if (err) goto out_put_disk; + err = platform_driver_register(&floppy_driver); + if (err) + goto out_unreg_blkdev; + floppy_queue = blk_init_queue(do_fd_request, &floppy_lock); if (!floppy_queue) { err = -ENOMEM; - goto out_unreg_blkdev; + goto out_unreg_driver; } blk_queue_max_sectors(floppy_queue, 64); @@ -4352,6 +4378,8 @@ out_flush_work: out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); blk_cleanup_queue(floppy_queue); +out_unreg_driver: + platform_driver_unregister(&floppy_driver); out_unreg_blkdev: 
unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4543,6 +4571,7 @@ void cleanup_module(void) init_completion(&device_release); blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); unregister_blkdev(FLOPPY_MAJOR, "fd"); + platform_driver_unregister(&floppy_driver); for (drive = 0; drive < N_DRIVE; drive++) { del_timer_sync(&motor_off_timer[drive]); Index: linux/drivers/block/kvm-block.c =================================================================== --- /dev/null +++ linux/drivers/block/kvm-block.c @@ -0,0 +1,437 @@ +/* + * KVM virtual block device driver for Linux + * + * (C) Copyright 2007 Intel Corporation + * + * This file is licensed to you under the terms of the GNU General Public License + * (GPL) version 2. See the COPYING file in the main directory of the kernel for + * the license text. + * + * Authors: + * Arjan van de Ven + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "kvm_block.h" + +static int kvm_block_major; + +#define DISK_IRQ 9 +#define SLOT_COUNT (PAGE_SIZE/sizeof(uint64_t)) + +struct kvm_block_dev { + struct list_head list; + uint64_t size; + spinlock_t lock; + struct tasklet_struct tasklet; + int nr; + + struct request_queue *queue; + struct gendisk *gd; + + /* + * the following two pages are shared with the hypervisor; the completion page + * is used by the hypervisor to place completed IO's in, while the submit page + * is used by this guest driver to place IOs to be done in. In both cases the + * pages are arrays of uint64_t's that contain the guest physical address of + * a struct kvm_iopage. These arrays are indexed by "slot number", which is + * the TCQ identifier and unique for each IO. struct kvm_iopage's have to + * be page aligned! + */ + uint64_t *completion_page; + uint64_t *submit_page; +}; + + +/* highest slot used.. 
to short-change scanning */ +static int highest_slot; + +static LIST_HEAD(kvm_devices); + +static int kvm_get_disk_count(void) +{ + int ret; + ret = hypercall(0, __NR_hypercall_disk_count); + return ret; +} + +static int kvm_get_disk_info(int nr, struct kvm_disk_info *info) +{ + int ret; + ret = hypercall(2, __NR_hypercall_disk_info, nr, __pa((unsigned long) info)); + return ret; +} + + +static int kvm_register_completion_page(struct kvm_block_dev *dev) +{ + int ret; + unsigned long irq = DISK_IRQ; + ret = hypercall(3, __NR_hypercall_disk_register_completion_area, dev->nr, __pa(dev->completion_page), irq); + if (ret) { + printk(KERN_ERR "kvm-block: Hypervisor registration of the completion page failed : %i / %i \n", ret, -ret); + return ret; + } + + ret = hypercall(3, __NR_hypercall_disk_register_submit_area, dev->nr, __pa(dev->submit_page), irq); + if (ret) + printk(KERN_ERR "kvm-block: Hypervisor registration of the submit page failed : %i / %i \n", ret, -ret); + return ret; +} + + +/* + * scan the completion page for completed IO's and process them + * returns the number of completed IO's + */ +static int kvm_scan_completion(struct kvm_block_dev *dev) +{ + int i; + int ret = 0; + struct kvm_iopage *iopage; + unsigned long completed; + unsigned long flags; + + for (i=0; i <= highest_slot; i++) { + if (dev->completion_page[i]) { + struct request *req; + cpu_relax(); + completed = dev->completion_page[i]; /* FIXME: atomic swap */ + dev->completion_page[i] = 0; + iopage = (struct kvm_iopage*) __va(completed); + + req = (struct request *)(unsigned long)iopage->guest_data; + + if (dev != req->rq_disk->private_data) { + printk(KERN_ERR "kvm-block: fatal hypervisor completion mismatch \n"); + BUG(); + } + spin_lock_irqsave(&dev->lock, flags); + if (!end_that_request_first(req, 1, iopage->sector_count)) { + add_disk_randomness(req->rq_disk); + blk_queue_end_tag(dev->queue, req); + end_that_request_last(req, 1); + } + spin_unlock_irqrestore(&dev->lock, flags); + free_page((unsigned long)iopage); + ret++; + } + } + /* the queue may have been stopped; if we completed things it can start again */ + if (ret) { + spin_lock_irqsave(&dev->lock, flags); + blk_start_queue(dev->queue); + spin_unlock_irqrestore(&dev->lock, flags); + } + return ret; +} + +static void kvm_tasklet(unsigned long data) +{ + struct kvm_block_dev *dev = (struct kvm_block_dev*)data; + kvm_scan_completion(dev); +} + + +/* + * synchronous IO submission path -- fallback for low memory situations + */ + +static int kvm_transfer_synchronous(struct kvm_block_dev *dev, unsigned long start_sector, unsigned long nr_sectors, uint64_t buffer, int write, struct bio *bio) +{ + unsigned long ret; + if (write) { + ret = hypercall(4, __NR_hypercall_disk_write, dev->nr, start_sector, nr_sectors, (unsigned long)buffer); + if (ret) + printk(KERN_ERR "kvm-block: hypercall write failed : %li / %li\n", ret, -ret); + } else { + ret = hypercall(4, __NR_hypercall_disk_read, dev->nr, start_sector, nr_sectors, (unsigned long)buffer); + if (ret) + printk(KERN_ERR "kvm-block: hypercall read failed: %li / %li \n", ret, -ret); + } + + return 0; +} + +static int xfer_bio_synchronous(struct kvm_block_dev *dev, struct bio *bio) +{ + int i; + struct bio_vec *vec; + sector_t sector = bio->bi_sector; + + bio_for_each_segment(vec, bio, i) { + uint64_t phys = page_to_phys(bio_iovec_idx((bio), (i))->bv_page) + bio_iovec_idx((bio), (i))->bv_offset; + + kvm_transfer_synchronous(dev, sector, bio_cur_sectors(bio), phys, bio_data_dir(bio)==WRITE, bio); + sector += 
bio_cur_sectors(bio); + } + return 0; +} + +static int xfer_request_synchronous(struct kvm_block_dev *dev, struct request *req) +{ + struct bio *bio; + int sectors = 0; + + rq_for_each_bio(bio, req) { + xfer_bio_synchronous(dev, bio); + sectors += bio->bi_size/512; + } + return sectors; +} + +/* asynchronous IO submission routines */ + +static int kvm_transfer_vector(struct kvm_block_dev *dev, unsigned long buffer, int slot) +{ + dev->submit_page[slot] = __pa(buffer); + return 0; +} + +/* + * tickle the host to look at the array of submitted pages + */ +static int kvm_trigger_scan(struct kvm_block_dev *dev) +{ + int ret; + ret = hypercall(1, __NR_hypercall_disk_trigger_scan, dev->nr); + if (ret) + printk(KERN_ERR "kvm-block: trigger error: %i / %i\n", ret, -ret); + return ret; +} + + +/* + * fill in the vector array of the io description page + */ +static int bio_vector(struct kvm_block_dev *dev, struct bio *bio, struct kvm_iopage *iopage, int *pos, int *sectors) +{ + int i; + struct bio_vec *vec; + + bio_for_each_segment(vec, bio, i) { + uint64_t phys = page_to_phys(bio_iovec_idx((bio), (i))->bv_page) + bio_iovec_idx((bio), (i))->bv_offset; + iopage->vector[*pos].data_gpa = phys; + iopage->vector[*pos].data_len = bio_cur_sectors(bio)*512; + (*pos)++; + (*sectors)+= bio_cur_sectors(bio); + } + return 0; +} + +static int xfer_request_vector(struct kvm_block_dev *dev, struct request *req, char *page) +{ + struct bio *bio = NULL; + struct kvm_iopage *iopage = (struct kvm_iopage *)page; + int pos = 0, sectors = 0; + + rq_for_each_bio(bio, req) { + bio_vector(dev, bio, iopage, &pos, &sectors); + } + iopage->sector = req->sector; + iopage->guest_data = (unsigned long)req; + iopage->sector_count = sectors; + iopage->write = (rq_data_dir(req)==WRITE); + iopage->entries = pos; + + kvm_transfer_vector(dev, (unsigned long) iopage, req->tag); + + return 0; +} + +static void kvm_block_request(request_queue_t *queue) +{ + struct request *req; + struct kvm_block_dev *dev = NULL; + int scan = 0; + + while ( (req = elv_next_request(queue)) != NULL) { + dev = req->rq_disk->private_data; + + if (!blk_fs_request(req)) { + end_request(req, 0); + printk("kvm-block: non-fs request received; aborting request\n"); + } else { + char *page = NULL; + + page = (char*)__get_free_page(GFP_ATOMIC); + if (page) { + if (blk_queue_start_tag(dev->queue, req) >= 0) { + if (req->tag > highest_slot) + highest_slot = req->tag; + xfer_request_vector(dev, req, page); + scan++; + } else + blk_stop_queue(dev->queue); + } else { + int sectors; + /* + * fall back to synchronous IO to prevent + * OOM deadlock + */ + sectors = xfer_request_synchronous(dev, req); + if (!end_that_request_first(req, 1, sectors)) { + add_disk_randomness(req->rq_disk); + blkdev_dequeue_request(req); + end_that_request_last(req, 1); + } + } + } + } + /* trigger the host to scan the submit queue */ + if (scan && dev) + kvm_trigger_scan(dev); +} + + +static int kvm_disk_open(struct inode *inode, struct file *filp) +{ + struct kvm_block_dev *dev = inode->i_bdev->bd_disk->private_data; + filp->private_data = dev; + return 0; +} + +static int kvm_disk_release(struct inode *inode, struct file *filp) +{ +/* struct kvm_block_dev *dev = inode->i_bdev->bd_disk->private_data; */ + + + return 0; +} + +static struct block_device_operations kvm_bops = { + .owner = THIS_MODULE, + .open = kvm_disk_open, + .release = kvm_disk_release, +}; + +static irqreturn_t kvm_block_irq_handler(int irq, void *dev_id) +{ + struct kvm_block_dev *dev = dev_id; + + /* do all the work in a tasklet 
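(completion scanning takes dev->lock and calls the request-completion helpers, which presumably must not run from hard-IRQ context)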
*/ + tasklet_schedule(&dev->tasklet); + return IRQ_HANDLED; +} + + + +int kvm_register_disk(int disknr) +{ + struct kvm_block_dev *dev; + int ret; + struct kvm_disk_info *info; + + info = (void*)get_zeroed_page(GFP_KERNEL); + if (!info) + return -ENOMEM; + + ret = kvm_get_disk_info(disknr, info); + if (ret) + printk("kvm-block: failed to get disk info: %i \n", ret); + + dev = kmalloc(sizeof(struct kvm_block_dev), GFP_KERNEL); + if (!dev) { + free_page((unsigned long)info); + return -ENOMEM; + } + + dev->nr = disknr; + dev->completion_page = (void*)get_zeroed_page(GFP_KERNEL); + dev->submit_page = (void*)get_zeroed_page(GFP_KERNEL); + + if (!dev->completion_page) + goto error; + if (!dev->submit_page) + goto error; + + if (kvm_register_completion_page(dev)) + goto error; + + + spin_lock_init(&dev->lock); + dev->queue = blk_init_queue(kvm_block_request, &dev->lock); + blk_queue_max_phys_segments(dev->queue, 126); + blk_queue_max_sectors(dev->queue, 126*8); + blk_queue_init_tags(dev->queue, SLOT_COUNT, NULL); + dev->gd = alloc_disk(64); + dev->gd->queue = dev->queue; + dev->gd->major = kvm_block_major; + dev->gd->fops = &kvm_bops; + dev->gd->private_data = dev; + dev->gd->first_minor = 64*disknr; + dev->gd->minors = 64; + + tasklet_init(&dev->tasklet, kvm_tasklet, (unsigned long)dev); + + ret = request_irq(DISK_IRQ, kvm_block_irq_handler, IRQF_SHARED | IRQF_SAMPLE_RANDOM, "kvm-block", dev); + WARN_ON(ret); + set_irq_chip_and_handler_name(DISK_IRQ, &dummy_irq_chip, handle_simple_irq, "kvm"); + + sprintf(dev->gd->disk_name, "vda"); + + set_capacity(dev->gd, info->sectors); + + list_add(&dev->list, &kvm_devices); + add_disk(dev->gd); + free_page((unsigned long)info); + return 0; + +error: + free_page((unsigned long)info); + free_page((unsigned long)dev->completion_page); + free_page((unsigned long)dev->submit_page); + kfree(dev); + return -ENOMEM; +} + +void kvm_unregister_disk(struct kvm_block_dev *dev) +{ + list_del_init(&dev->list); + del_gendisk(dev->gd); + free_irq(DISK_IRQ, dev); + kfree(dev); +} + + +int init_kvm_driver(void) +{ + int i,max; + kvm_block_major = register_blkdev(0, "kvm-block"); + printk("KVM virtual block driver on major %i \n", kvm_block_major); + if (kvm_block_major < 0) + return -ENODEV; + + max = kvm_get_disk_count(); + for (i = 0; i < max; i++) + kvm_register_disk(i); + + return 0; +} + +module_init(init_kvm_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven"); +MODULE_DESCRIPTION("KVM coopvirtual block device driver"); Index: linux/drivers/block/kvm_block.h =================================================================== --- /dev/null +++ linux/drivers/block/kvm_block.h @@ -0,0 +1,24 @@ +#ifndef __INCLUDE_GUARD_KVMBLOCK_H +#define __INCLUDE_GUARD_KVMBLOCK_H + +struct kvm_vector { + uint64_t data_gpa; + uint64_t data_len; +}; + +struct kvm_iopage { + uint64_t sector; + uint64_t guest_data; + uint32_t sector_count; + uint32_t write; + uint32_t entries; + uint32_t ioresult; + struct kvm_vector vector[126]; +}; + +struct kvm_disk_info { + uint64_t sectors; + uint64_t reserved[15]; +}; + +#endif Index: linux/drivers/block/paride/pseudo.h =================================================================== --- linux.orig/drivers/block/paride/pseudo.h +++ linux/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int); Index: linux/drivers/char/Kconfig =================================================================== --- linux.orig/drivers/char/Kconfig +++ 
linux/drivers/char/Kconfig @@ -755,6 +755,46 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source systems need to + run a kernel with CONFIG_LPPTEST enabled. To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + Then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 @@ -1038,5 +1078,23 @@ config TELCLOCK /sys/devices/platform/telco_clock, with a number of files for controlling the behavior of this hardware. +config RMEM + tristate "Access to physical memory via /dev/rmem" + default m + help + The /dev/mem device only allows mmap() of I/O mapped memory; + it does not allow access to "real" + physical memory. The /dev/rmem device is a hack which does + allow access to physical memory. We use this instead of + patching /dev/mem because we don't expect this functionality + to ever be accepted into mainline. + +config ALLOC_RTSJ_MEM + tristate "RTSJ-specific hack to reserve memory" + default m + help + The RTSJ TCK conformance test requires reserving some physical + memory for testing /dev/rmem. + endmenu Index: linux/drivers/char/Makefile =================================================================== --- linux.orig/drivers/char/Makefile +++ linux/drivers/char/Makefile @@ -94,6 +94,10 @@ obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu.o
obj-$(CONFIG_TANBAC_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o +obj-$(CONFIG_RMEM) += rmem.o + obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ @@ -104,6 +108,8 @@ obj-$(CONFIG_IPMI_HANDLER) += ipmi/ obj-$(CONFIG_HANGCHECK_TIMER) += hangcheck-timer.o obj-$(CONFIG_TCG_TPM) += tpm/ +obj-$(CONFIG_ALLOC_RTSJ_MEM) += alloc_rtsj_mem.o + # Files generated that shall be removed upon make clean clean-files := consolemap_deftbl.c defkeymap.c Index: linux/drivers/char/alloc_rtsj_mem.c =================================================================== --- /dev/null +++ linux/drivers/char/alloc_rtsj_mem.c @@ -0,0 +1,88 @@ +/* + * alloc_rtsj_mem.c -- Hack to allocate some memory + * + * Copyright (C) 2005 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Theodore Tso"); +MODULE_DESCRIPTION("RTSJ alloc memory"); +MODULE_LICENSE("GPL"); + +static void *mem = 0; +int size = 0, addr = 0; + +module_param(size, int, 0444); +module_param(addr, int, 0444); + +static void __exit shutdown_module(void) +{ + kfree(mem); +} + +#ifndef MODULE +void __init alloc_rtsj_mem_early_setup(void) +{ + if (size > PAGE_SIZE*2) { + mem = alloc_bootmem(size); + if (mem) { + printk(KERN_INFO "alloc_rtsj_mem: got %d bytes " + "using alloc_bootmem\n", size); + } else { + printk(KERN_INFO "alloc_rtsj_mem: failed to " + "get %d bytes from alloc_bootmem\n", size); + } + } +} +#endif + +static int __init startup_module(void) +{ + static char test_string[] = "The BOFH: Servicing users the way the " + "military\n\tservices targets for 15 years.\n"; + + if (!size) + return 0; + + if (!mem) { + mem = kmalloc(size, GFP_KERNEL); + if (mem) { + printk(KERN_INFO "alloc_rtsj_mem: got %d bytes " + "using kmalloc\n", size); + } else { + printk(KERN_ERR "alloc_rtsj_mem: failed to get " + "%d bytes using kmalloc\n", size); + return -ENOMEM; + } + } + memcpy(mem, test_string, min(sizeof(test_string), (size_t) size)); + addr = virt_to_phys(mem); + return 0; +} + +module_init(startup_module); +module_exit(shutdown_module); + Index: linux/drivers/char/blocker.c =================================================================== --- /dev/null +++ linux/drivers/char/blocker.c @@ -0,0 +1,107 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if 
(rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux/drivers/char/drm/i915_irq.c =================================================================== --- linux.orig/drivers/char/drm/i915_irq.c +++ linux/drivers/char/drm/i915_irq.c @@ -582,7 +582,7 @@ void i915_driver_irq_postinstall(drm_dev { drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private; - dev_priv->swaps_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&dev_priv->swaps_lock); INIT_LIST_HEAD(&dev_priv->vbl_swaps.head); dev_priv->swaps_pending = 0; Index: linux/drivers/char/lcd.c =================================================================== --- linux.orig/drivers/char/lcd.c +++ linux/drivers/char/lcd.c @@ -11,9 +11,6 @@ * March 2001: Ported from 2.0.34 by Liam Davies * */ - -#define RTC_IO_EXTENT 0x10 /*Only really two ports, but... */ - #include #include #include @@ -32,8 +29,6 @@ #include "lcd.h" -static DEFINE_SPINLOCK(lcd_lock); - static int lcd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); Index: linux/drivers/char/lpptest.c =================================================================== --- /dev/null +++ linux/drivers/char/lpptest.c @@ -0,0 +1,178 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. 
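+ *
+ * Measurement flow, as implemented below: the measuring system writes a
+ * byte to its data port (0x378), raising IRQ 7 on the target; the target's
+ * handler flips its output byte and writes it back; the measuring side
+ * busy-waits on its status port (0x379) and returns the elapsed cycles.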
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. + */ +static int lpptest_irq (int irq, void *dev_id) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. 
Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux/drivers/char/random.c =================================================================== --- linux.orig/drivers/char/random.c +++ linux/drivers/char/random.c @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux/drivers/char/rmem.c =================================================================== --- /dev/null +++ linux/drivers/char/rmem.c @@ -0,0 +1,134 @@ +/* + * Rmem - REALLY simple memory mapping demonstration. + * + * Copyright (C) 2005 by Theodore Ts'o + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rmem_major = 0; +module_param(rmem_major, int, 0444); + +static struct class *rmem_class; + +MODULE_AUTHOR("Theodore Ts'o"); +MODULE_LICENSE("GPL"); + +struct page *rmem_vma_nopage(struct vm_area_struct *vma, + unsigned long address, int *type) +{ + struct page *pageptr; + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + unsigned long physaddr = address - vma->vm_start + offset; + unsigned long pageframe = physaddr >> PAGE_SHIFT; + + if (!pfn_valid(pageframe)) + return NOPAGE_SIGBUS; + pageptr = pfn_to_page(pageframe); + get_page(pageptr); + if (type) + *type = VM_FAULT_MINOR; + return pageptr; +} + +static struct vm_operations_struct rmem_nopage_vm_ops = { + .nopage = rmem_vma_nopage, +}; + +static int rmem_nopage_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long offset = vma->vm_pgoff << PAGE_SHIFT; + + if (offset >= __pa(high_memory) || (filp->f_flags & O_SYNC)) + vma->vm_flags |= VM_IO; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &rmem_nopage_vm_ops; +#ifdef TAINT_USER + add_taint(TAINT_USER); +#endif + return 0; +} + +static struct file_operations rmem_nopage_ops = { + .owner = THIS_MODULE, + .mmap = rmem_nopage_mmap, +}; + +static struct cdev rmem_cdev = { + .kobj = {.name = "rmem", }, + .owner = THIS_MODULE, +}; + +static int __init rmem_init(void) +{ + int result; + dev_t dev = MKDEV(rmem_major, 0); + + /* Figure out our device number. */ + if (rmem_major) + result = register_chrdev_region(dev, 1, "rmem"); + else { + result = alloc_chrdev_region(&dev, 0, 1, "rmem"); + rmem_major = MAJOR(dev); + } + if (result < 0) { + printk(KERN_WARNING "rmem: unable to get major %d\n", rmem_major); + return result; + } + if (rmem_major == 0) + rmem_major = result; + + cdev_init(&rmem_cdev, &rmem_nopage_ops); + result = cdev_add(&rmem_cdev, dev, 1); + if (result) { + printk (KERN_NOTICE "Error %d adding /dev/rmem", result); + kobject_put(&rmem_cdev.kobj); + unregister_chrdev_region(dev, 1); + return 1; + } + + rmem_class = class_create(THIS_MODULE, "rmem"); + class_device_create(rmem_class, NULL, dev, NULL, "rmem"); + + return 0; +} + + +static void __exit rmem_cleanup(void) +{ + cdev_del(&rmem_cdev); + unregister_chrdev_region(MKDEV(rmem_major, 0), 1); + class_destroy(rmem_class); +} + + +module_init(rmem_init); +module_exit(rmem_cleanup); Index: linux/drivers/char/rtc.c =================================================================== --- linux.orig/drivers/char/rtc.c +++ linux/drivers/char/rtc.c @@ -82,10 +82,36 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif + #if defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. 
waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -191,7 +217,7 @@ static int rtc_proc_open(struct inode *i static unsigned long rtc_status = 0; /* bitmapped status byte. */ static unsigned long rtc_freq = 0; /* Current periodic IRQ rate */ static unsigned long rtc_irq_data = 0; /* our output to the world */ -static unsigned long rtc_max_user_freq = 64; /* > this, need CAP_SYS_RESOURCE */ +static unsigned long rtc_max_user_freq = 1024; /* > this, need CAP_SYS_RESOURCE */ #ifdef RTC_IRQ /* @@ -225,7 +251,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -269,9 +434,9 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); - kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + rtc_wake_event(); + wake_up_interruptible(&rtc_wait); return IRQ_HANDLED; } @@ -381,6 +546,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -613,6 +780,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -622,6 +794,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -720,6 +893,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -775,6 +949,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1159,7 +1334,7 @@ static void rtc_dropped_irq(unsigned lon spin_unlock_irq(&rtc_lock); - printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); +// printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); /* Now we have new data */ wake_up_interruptible(&rtc_wait); Index: linux/drivers/char/sysrq.c =================================================================== --- linux.orig/drivers/char/sysrq.c +++ linux/drivers/char/sysrq.c @@ -209,6 +209,22 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) || defined(__x86_64__) + +static void sysrq_handle_showallregs(int key, struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct tty_struct *tty) { show_state(); @@ -341,7 +357,7 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n */ /* o: This will often be registered as 'Off' at init time */ Index: linux/drivers/char/tty_io.c =================================================================== --- linux.orig/drivers/char/tty_io.c +++ linux/drivers/char/tty_io.c @@ -253,6 +253,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif @@ -1376,6 +1377,8 @@ static void do_tty_hangup(struct work_st read_unlock(&tasklist_lock); tty->flags = 0; + put_pid(tty->session); + put_pid(tty->pgrp); tty->session = NULL; tty->pgrp = NULL; tty->ctrl_status = 0; @@ -3841,6 +3844,9 @@ static struct pid 
*__proc_set_tty(struct { struct pid *old_pgrp; if (tty) { + /* We should not have a session or pgrp to here but.... */ + put_pid(tty->session); + put_pid(tty->pgrp); tty->session = get_pid(task_session(tsk)); tty->pgrp = get_pid(task_pgrp(tsk)); } Index: linux/drivers/char/vt.c =================================================================== --- linux.orig/drivers/char/vt.c +++ linux/drivers/char/vt.c @@ -724,6 +724,7 @@ int vc_allocate(unsigned int currcons) / return -ENOMEM; memset(vc, 0, sizeof(*vc)); vc_cons[currcons].d = vc; + INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); visual_init(vc, currcons, 1); if (!*vc->vc_uni_pagedir_loc) con_set_default_unimap(vc); @@ -2185,10 +2186,28 @@ static void console_callback(struct work release_console_sem(); } -void set_console(int nr) +int set_console(int nr) { + struct vc_data *vc = vc_cons[fg_console].d; + + if (!vc_cons_allocated(nr) || vt_dont_switch || + (vc->vt_mode.mode == VT_AUTO && vc->vc_mode == KD_GRAPHICS)) { + + /* + * Console switch will fail in console_callback() or + * change_console() so there is no point scheduling + * the callback + * + * Existing set_console() users don't check the return + * value so this shouldn't break anything + */ + return -EINVAL; + } + want_console = nr; schedule_console_callback(); + + return 0; } struct tty_driver *console_driver; Index: linux/drivers/char/vt_ioctl.c =================================================================== --- linux.orig/drivers/char/vt_ioctl.c +++ linux/drivers/char/vt_ioctl.c @@ -34,7 +34,7 @@ #include #include -static char vt_dont_switch; +char vt_dont_switch; extern struct tty_driver *console_driver; #define VT_IS_IN_USE(i) (console_driver->ttys[i] && console_driver->ttys[i]->count) Index: linux/drivers/char/watchdog/machzwd.c =================================================================== --- linux.orig/drivers/char/watchdog/machzwd.c +++ linux/drivers/char/watchdog/machzwd.c @@ -314,21 +314,21 @@ static int zf_ioctl(struct inode *inode, { void __user *argp = (void __user *)arg; int __user *p = argp; - switch(cmd){ - case WDIOC_GETSUPPORT: - if (copy_to_user(argp, &zf_info, sizeof(zf_info))) - return -EFAULT; - break; - - case WDIOC_GETSTATUS: - return put_user(0, p); - - case WDIOC_KEEPALIVE: - zf_ping(NULL); - break; + switch (cmd) { + case WDIOC_GETSUPPORT: + if (copy_to_user(argp, &zf_info, sizeof(zf_info))) + return -EFAULT; + break; + + case WDIOC_GETSTATUS: + return put_user(0, p); + + case WDIOC_KEEPALIVE: + zf_ping(0); + break; - default: - return -ENOTTY; + default: + return -ENOTTY; } return 0; Index: linux/drivers/clocksource/acpi_pm.c =================================================================== --- linux.orig/drivers/clocksource/acpi_pm.c +++ linux/drivers/clocksource/acpi_pm.c @@ -30,13 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -static inline u32 read_pmtmr(void) +static notrace inline u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -u32 acpi_pm_read_verified(void) +u32 notrace acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -56,12 +56,12 @@ u32 acpi_pm_read_verified(void) return v2; } -static cycle_t acpi_pm_read_slow(void) +static notrace cycle_t acpi_pm_read_slow(void) { return (cycle_t)acpi_pm_read_verified(); } -static cycle_t acpi_pm_read(void) +static notrace cycle_t acpi_pm_read(void) { return (cycle_t)read_pmtmr(); } Index: linux/drivers/dma/dmaengine.c =================================================================== --- linux.orig/drivers/dma/dmaengine.c +++ 
linux/drivers/dma/dmaengine.c @@ -176,6 +176,7 @@ void dma_chan_cleanup(struct kref *kref) chan->client = NULL; kref_put(&chan->device->refcount, dma_async_device_cleanup); } +EXPORT_SYMBOL(dma_chan_cleanup); static void dma_chan_free_rcu(struct rcu_head *rcu) { @@ -261,6 +262,7 @@ struct dma_client *dma_async_client_regi return client; } +EXPORT_SYMBOL(dma_async_client_register); /** * dma_async_client_unregister - unregister a client and free the &dma_client @@ -287,6 +289,7 @@ void dma_async_client_unregister(struct kfree(client); dma_chans_rebalance(); } +EXPORT_SYMBOL(dma_async_client_unregister); /** * dma_async_client_chan_request - request DMA channels @@ -304,6 +307,7 @@ void dma_async_client_chan_request(struc client->chans_desired = number; dma_chans_rebalance(); } +EXPORT_SYMBOL(dma_async_client_chan_request); /** * dma_async_device_register - registers DMA devices found @@ -346,6 +350,7 @@ int dma_async_device_register(struct dma return 0; } +EXPORT_SYMBOL(dma_async_device_register); /** * dma_async_device_cleanup - function called when all references are released @@ -390,23 +395,12 @@ void dma_async_device_unregister(struct kref_put(&device->refcount, dma_async_device_cleanup); wait_for_completion(&device->done); } +EXPORT_SYMBOL(dma_async_device_unregister); static int __init dma_bus_init(void) { mutex_init(&dma_list_mutex); return class_register(&dma_devclass); } - subsys_initcall(dma_bus_init); -EXPORT_SYMBOL(dma_async_client_register); -EXPORT_SYMBOL(dma_async_client_unregister); -EXPORT_SYMBOL(dma_async_client_chan_request); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_complete); -EXPORT_SYMBOL(dma_async_memcpy_issue_pending); -EXPORT_SYMBOL(dma_async_device_register); -EXPORT_SYMBOL(dma_async_device_unregister); -EXPORT_SYMBOL(dma_chan_cleanup); Index: linux/drivers/hwmon/Kconfig =================================================================== --- linux.orig/drivers/hwmon/Kconfig +++ linux/drivers/hwmon/Kconfig @@ -527,6 +527,7 @@ config SENSORS_W83792D config SENSORS_W83793 tristate "Winbond W83793" depends on HWMON && I2C && EXPERIMENTAL + select HWMON_VID help If you say yes here you get support for the Winbond W83793 hardware monitoring chip. Index: linux/drivers/ide/Kconfig =================================================================== --- linux.orig/drivers/ide/Kconfig +++ linux/drivers/ide/Kconfig @@ -434,24 +434,8 @@ config BLK_DEV_IDEDMA_FORCED Generally say N here. -config IDEDMA_PCI_AUTO - bool "Use PCI DMA by default when available" - ---help--- - Prior to kernel version 2.1.112, Linux used to automatically use - DMA for IDE drives and chipsets which support it. Due to concerns - about a couple of cases where buggy hardware may have caused damage, - the default is now to NOT use DMA automatically. To revert to the - previous behaviour, say Y to this question. - - If you suspect your hardware is at all flakey, say N here. - Do NOT email the IDE kernel people regarding this issue! - - It is normally safe to answer Y to this question unless your - motherboard uses a VIA VP2 chipset, in which case you should say N. - config IDEDMA_ONLYDISK bool "Enable DMA only for disks " - depends on IDEDMA_PCI_AUTO help This is used if you know your ATAPI Devices are going to fail DMA Transfers. @@ -769,6 +753,14 @@ config BLK_DEV_TC86C001 help This driver adds support for Toshiba TC86C001 GOKU-S chip. 
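The hunk below converts the Celleb IDE driver from a built-in bool to a tristate option, which means the relocated scc_pata.c (added further down) must carry full module boilerplate: a PCI driver registered at init time. A minimal sketch of that shape, with hypothetical names (the real driver registers through the IDE layer's ide_pci_register_driver() helper rather than bare pci_register_driver()):

/* Minimal sketch, hypothetical names: the module shape a tristate
 * IDE PCI driver needs so it can be built and loaded separately. */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pci.h>

static int __devinit example_init_one(struct pci_dev *dev,
				      const struct pci_device_id *id)
{
	/* probe the controller and register the interface here */
	return 0;
}

static struct pci_device_id example_pci_tbl[] = {
	{ PCI_VENDOR_ID_TOSHIBA_2, 0x01b4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 },
	{ 0, },
};
MODULE_DEVICE_TABLE(pci, example_pci_tbl);

static struct pci_driver example_pci_driver = {
	.name     = "example IDE",
	.id_table = example_pci_tbl,
	.probe    = example_init_one,
};

static int __init example_ide_init(void)
{
	return pci_register_driver(&example_pci_driver);
}
module_init(example_ide_init);

MODULE_LICENSE("GPL");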
+config BLK_DEV_CELLEB + tristate "Toshiba's Cell Reference Set IDE support" + depends on PPC_CELLEB + help + This driver provides support for the built-in IDE controller on + Toshiba Cell Reference Board. + If unsure, say Y. + endif config BLK_DEV_IDE_PMAC @@ -800,14 +792,6 @@ config BLK_DEV_IDEDMA_PMAC to transfer data to and from memory. Saying Y is safe and improves performance. -config BLK_DEV_IDE_CELLEB - bool "Toshiba's Cell Reference Set IDE support" - depends on PPC_CELLEB && IDE=y - help - This driver provides support for the built-in IDE controller on - Toshiba Cell Reference Board. - If unsure, say Y. - config BLK_DEV_IDE_SWARM tristate "IDE for Sibyte evaluation boards" depends on SIBYTE_SB1xxx_SOC @@ -851,19 +835,6 @@ config BLK_DEV_IDEDMA_ICS Say Y here if you want to add DMA (Direct Memory Access) support to the ICS IDE driver. -config IDEDMA_ICS_AUTO - bool "Use ICS DMA by default" - depends on BLK_DEV_IDEDMA_ICS - help - Prior to kernel version 2.1.112, Linux used to automatically use - DMA for IDE drives and chipsets which support it. Due to concerns - about a couple of cases where buggy hardware may have caused damage, - the default is now to NOT use DMA automatically. To revert to the - previous behaviour, say Y to this question. - - If you suspect your hardware is at all flakey, say N here. - Do NOT email the IDE kernel people regarding this issue! - config BLK_DEV_IDE_RAPIDE tristate "RapIDE interface support" depends on ARM && ARCH_ACORN @@ -1086,9 +1057,6 @@ config IDEDMA_IVB It is normally safe to answer Y; however, the default is N. -config IDEDMA_AUTO - def_bool IDEDMA_PCI_AUTO || IDEDMA_ICS_AUTO - endif config BLK_DEV_HD_ONLY Index: linux/drivers/ide/Makefile =================================================================== --- linux.orig/drivers/ide/Makefile +++ linux/drivers/ide/Makefile @@ -37,7 +37,6 @@ ide-core-$(CONFIG_BLK_DEV_Q40IDE) += leg # built-in only drivers from ppc/ ide-core-$(CONFIG_BLK_DEV_MPC8xx_IDE) += ppc/mpc8xx.o ide-core-$(CONFIG_BLK_DEV_IDE_PMAC) += ppc/pmac.o -ide-core-$(CONFIG_BLK_DEV_IDE_CELLEB) += ppc/scc_pata.o # built-in only drivers from h8300/ ide-core-$(CONFIG_H8300) += h8300/ide-h8300.o Index: linux/drivers/ide/arm/icside.c =================================================================== --- linux.orig/drivers/ide/arm/icside.c +++ linux/drivers/ide/arm/icside.c @@ -196,11 +196,6 @@ static void icside_maskproc(ide_drive_t } #ifdef CONFIG_BLK_DEV_IDEDMA_ICS - -#ifndef CONFIG_IDEDMA_ICS_AUTO -#warning CONFIG_IDEDMA_ICS_AUTO=n support is obsolete, and will be removed soon. -#endif - /* * SG-DMA support. 
* @@ -474,12 +469,6 @@ static int icside_dma_lostirq(ide_drive_ static void icside_dma_init(ide_hwif_t *hwif) { - int autodma = 0; - -#ifdef CONFIG_IDEDMA_ICS_AUTO - autodma = 1; -#endif - printk(" %s: SG-DMA", hwif->name); hwif->atapi_dma = 1; @@ -489,7 +478,7 @@ static void icside_dma_init(ide_hwif_t * hwif->dmatable_cpu = NULL; hwif->dmatable_dma = 0; hwif->speedproc = icside_set_speed; - hwif->autodma = autodma; + hwif->autodma = 1; hwif->ide_dma_check = icside_dma_check; hwif->dma_host_off = icside_dma_host_off; Index: linux/drivers/ide/ide-dma.c =================================================================== --- linux.orig/drivers/ide/ide-dma.c +++ linux/drivers/ide/ide-dma.c @@ -767,7 +767,7 @@ int ide_set_dma(ide_drive_t *drive) switch(rc) { case -1: /* DMA needs to be disabled */ hwif->dma_off_quietly(drive); - return 0; + return -1; case 0: /* DMA needs to be enabled */ return hwif->ide_dma_on(drive); case 1: /* DMA setting cannot be changed */ Index: linux/drivers/ide/ide-floppy.c =================================================================== --- linux.orig/drivers/ide/ide-floppy.c +++ linux/drivers/ide/ide-floppy.c @@ -1667,9 +1667,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 0 : 0x10000; } Index: linux/drivers/ide/ide-io.c =================================================================== --- linux.orig/drivers/ide/ide-io.c +++ linux/drivers/ide/ide-io.c @@ -1185,7 +1185,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1450,7 +1450,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { Index: linux/drivers/ide/ide-iops.c =================================================================== --- linux.orig/drivers/ide/ide-iops.c +++ linux/drivers/ide/ide-iops.c @@ -220,10 +220,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -242,10 +242,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -540,12 +540,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
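The local_irq_*_nort() variants used throughout these IDE hunks let the same code build for both mainline and PREEMPT_RT: on a non-RT kernel they behave exactly like the plain local_irq_*() operations, while on PREEMPT_RT they degrade to (nearly) no-ops, because the code then runs in preemptible process context where hard interrupt disabling is neither needed nor wanted. Roughly, as a sketch of how the -rt tree defines them (assumed here, not part of this patch):

#ifdef CONFIG_PREEMPT_RT
/* RT: don't disable hard interrupts; just keep the flags variable valid. */
# define local_irq_disable_nort()	do { } while (0)
# define local_irq_enable_nort()	do { } while (0)
# define local_irq_save_nort(flags)	local_save_flags(flags)
# define local_irq_restore_nort(flags)	((void)(flags))
#else
/* !RT: map straight to the real primitives. */
# define local_irq_disable_nort()	local_irq_disable()
# define local_irq_enable_nort()	local_irq_enable()
# define local_irq_save_nort(flags)	local_irq_save(flags)
# define local_irq_restore_nort(flags)	local_irq_restore(flags)
#endif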
@@ -709,17 +709,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -799,7 +797,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Index: linux/drivers/ide/ide-lib.c =================================================================== --- linux.orig/drivers/ide/ide-lib.c +++ linux/drivers/ide/ide-lib.c @@ -470,15 +470,16 @@ int ide_set_xfer_rate(ide_drive_t *drive static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->cmd_type == REQ_TYPE_ATA_CMD || @@ -507,10 +508,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -570,7 +569,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -585,14 +584,11 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -618,7 +614,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux/drivers/ide/ide-probe.c =================================================================== --- linux.orig/drivers/ide/ide-probe.c +++ linux/drivers/ide/ide-probe.c @@ -141,7 +141,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -323,14 +323,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -807,7 +807,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) 
changed by probe * code above Index: linux/drivers/ide/ide-taskfile.c =================================================================== --- linux.orig/drivers/ide/ide-taskfile.c +++ linux/drivers/ide/ide-taskfile.c @@ -274,7 +274,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -294,7 +294,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -460,7 +460,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux/drivers/ide/ide.c =================================================================== --- linux.orig/drivers/ide/ide.c +++ linux/drivers/ide/ide.c @@ -177,11 +177,7 @@ DECLARE_MUTEX(ide_cfg_sem); static int ide_scan_direction; /* THIS was formerly 2.2.x pci=reverse */ #endif -#ifdef CONFIG_IDEDMA_AUTO int noautodma = 0; -#else -int noautodma = 1; -#endif EXPORT_SYMBOL(noautodma); Index: linux/drivers/ide/mips/au1xxx-ide.c =================================================================== --- linux.orig/drivers/ide/mips/au1xxx-ide.c +++ linux/drivers/ide/mips/au1xxx-ide.c @@ -639,6 +639,7 @@ static int au_ide_probe(struct device *d _auide_hwif *ahwif = &auide_hwif; ide_hwif_t *hwif; struct resource *res; + hw_regs_t *hw; int ret = 0; #if defined(CONFIG_BLK_DEV_IDE_AU1XXX_MDMA2_DBDMA) @@ -681,7 +682,7 @@ static int au_ide_probe(struct device *d /* FIXME: This might possibly break PCMCIA IDE devices */ hwif = &ide_hwifs[pdev->id]; - hw_regs_t *hw = &hwif->hw; + hw = &hwif->hw; hwif->irq = hw->irq = ahwif->irq; hwif->chipset = ide_au1xxx; Index: linux/drivers/ide/pci/Makefile =================================================================== --- linux.orig/drivers/ide/pci/Makefile +++ linux/drivers/ide/pci/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_BLK_DEV_AEC62XX) += aec62x obj-$(CONFIG_BLK_DEV_ALI15X3) += alim15x3.o obj-$(CONFIG_BLK_DEV_AMD74XX) += amd74xx.o obj-$(CONFIG_BLK_DEV_ATIIXP) += atiixp.o +obj-$(CONFIG_BLK_DEV_CELLEB) += scc_pata.o obj-$(CONFIG_BLK_DEV_CMD64X) += cmd64x.o obj-$(CONFIG_BLK_DEV_CS5520) += cs5520.o obj-$(CONFIG_BLK_DEV_CS5530) += cs5530.o Index: linux/drivers/ide/pci/alim15x3.c =================================================================== --- linux.orig/drivers/ide/pci/alim15x3.c +++ linux/drivers/ide/pci/alim15x3.c @@ -325,7 +325,7 @@ static u8 ali15x3_tune_pio (ide_drive_t if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -347,7 +347,7 @@ static u8 ali15x3_tune_pio (ide_drive_t pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -618,7 +618,7 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -679,7 +679,7 @@ static unsigned int __devinit init_chips out: pci_dev_put(north); pci_dev_put(isa_dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -703,7 +703,7 @@ static unsigned int __devinit ata66_ali1 
unsigned long flags; u8 tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -755,7 +755,7 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); + local_irq_restore_nort(flags); return(ata66); } Index: linux/drivers/ide/pci/cmd64x.c =================================================================== --- linux.orig/drivers/ide/pci/cmd64x.c +++ linux/drivers/ide/pci/cmd64x.c @@ -1,6 +1,6 @@ /* $Id: cmd64x.c,v 1.21 2000/01/30 23:23:16 * - * linux/drivers/ide/pci/cmd64x.c Version 1.41 Feb 3, 2007 + * linux/drivers/ide/pci/cmd64x.c Version 1.42 Feb 8, 2007 * * cmd64x.c: Enable interrupts at initialization time on Ultra/PCI machines. * Note, this driver is not used at all on other systems because @@ -189,6 +189,11 @@ static int cmd64x_get_info (char *buffer #endif /* defined(DISPLAY_CMD64X_TIMINGS) && defined(CONFIG_PROC_FS) */ +static u8 quantize_timing(int timing, int quant) +{ + return (timing + quant - 1) / quant; +} + /* * This routine writes the prepared setup/active/recovery counts * for a drive into the cmd646 chipset registers to active them. @@ -268,47 +273,37 @@ static void program_drive_counts (ide_dr */ static u8 cmd64x_tune_pio (ide_drive_t *drive, u8 mode_wanted) { - int setup_time, active_time, recovery_time; - int clock_time, pio_mode, cycle_time; - u8 recovery_count2, cycle_count; - int setup_count, active_count, recovery_count; - int bus_speed = system_bus_clock(); - ide_pio_data_t d; + int setup_time, active_time, cycle_time; + u8 cycle_count, setup_count, active_count, recovery_count; + u8 pio_mode; + int clock_time = 1000 / system_bus_clock(); + ide_pio_data_t pio; - pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, &d); - cycle_time = d.cycle_time; + pio_mode = ide_get_best_pio_mode(drive, mode_wanted, 5, &pio); + cycle_time = pio.cycle_time; - /* - * I copied all this complicated stuff from cmd640.c and made a few - * minor changes. For now I am just going to pray that it is correct. - */ setup_time = ide_pio_timings[pio_mode].setup_time; active_time = ide_pio_timings[pio_mode].active_time; - recovery_time = cycle_time - (setup_time + active_time); - clock_time = 1000 / bus_speed; - cycle_count = (cycle_time + clock_time - 1) / clock_time; - - setup_count = (setup_time + clock_time - 1) / clock_time; - - active_count = (active_time + clock_time - 1) / clock_time; - - recovery_count = (recovery_time + clock_time - 1) / clock_time; - recovery_count2 = cycle_count - (setup_count + active_count); - if (recovery_count2 > recovery_count) - recovery_count = recovery_count2; + + setup_count = quantize_timing( setup_time, clock_time); + cycle_count = quantize_timing( cycle_time, clock_time); + active_count = quantize_timing(active_time, clock_time); + + recovery_count = cycle_count - active_count; + /* program_drive_counts() takes care of zero recovery cycles */ if (recovery_count > 16) { active_count += recovery_count - 16; recovery_count = 16; } if (active_count > 16) - active_count = 16; /* maximum allowed by cmd646 */ + active_count = 16; /* maximum allowed by cmd64x */ program_drive_counts (drive, setup_count, active_count, recovery_count); cmdprintk("%s: PIO mode wanted %d, selected %d (%dns)%s, " "clocks=%d/%d/%d\n", drive->name, mode_wanted, pio_mode, cycle_time, - d.overridden ? " (overriding vendor mode)" : "", + pio.overridden ? 
" (overriding vendor mode)" : "", setup_count, active_count, recovery_count); return pio_mode; Index: linux/drivers/ide/pci/cs5530.c =================================================================== --- linux.orig/drivers/ide/pci/cs5530.c +++ linux/drivers/ide/pci/cs5530.c @@ -232,8 +232,8 @@ static unsigned int __devinit init_chips goto out; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. */ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -285,7 +285,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); out: pci_dev_put(master_0); Index: linux/drivers/ide/pci/hpt366.c =================================================================== --- linux.orig/drivers/ide/pci/hpt366.c +++ linux/drivers/ide/pci/hpt366.c @@ -1384,7 +1384,7 @@ static void __devinit init_dma_hpt366(id dma_old = hwif->INB(dmabase + 2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(dev, hwif->channel ? 0x4b : 0x43, &masterdma); @@ -1395,7 +1395,7 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase + 2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } Index: linux/drivers/ide/pci/jmicron.c =================================================================== --- linux.orig/drivers/ide/pci/jmicron.c +++ linux/drivers/ide/pci/jmicron.c @@ -240,12 +240,31 @@ static int __devinit jmicron_init_one(st return 0; } +/* If libata is configured, jmicron PCI quirk will configure it such + * that the SATA ports are in AHCI function while the PATA ports are + * in a separate IDE function. In such cases, match device class and + * attach only to IDE. If libata isn't configured, keep the old + * behavior for backward compatibility. 
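+ * (The JMB_CLASS/JMB_CLASS_MASK definitions below implement this:
+ * they restrict matching to the IDE class code only when CONFIG_ATA
+ * is enabled, and match any class otherwise.)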
+ */ +#if defined(CONFIG_ATA) || defined(CONFIG_ATA_MODULE) +#define JMB_CLASS PCI_CLASS_STORAGE_IDE << 8 +#define JMB_CLASS_MASK 0xffff00 +#else +#define JMB_CLASS 0 +#define JMB_CLASS_MASK 0 +#endif + static struct pci_device_id jmicron_pci_tbl[] = { - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 1}, - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 2}, - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 3}, - { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 4}, + { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB361, + PCI_ANY_ID, PCI_ANY_ID, JMB_CLASS, JMB_CLASS_MASK, 0}, + { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB363, + PCI_ANY_ID, PCI_ANY_ID, JMB_CLASS, JMB_CLASS_MASK, 1}, + { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB365, + PCI_ANY_ID, PCI_ANY_ID, JMB_CLASS, JMB_CLASS_MASK, 2}, + { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB366, + PCI_ANY_ID, PCI_ANY_ID, JMB_CLASS, JMB_CLASS_MASK, 3}, + { PCI_VENDOR_ID_JMICRON, PCI_DEVICE_ID_JMICRON_JMB368, + PCI_ANY_ID, PCI_ANY_ID, JMB_CLASS, JMB_CLASS_MASK, 4}, { 0, }, }; Index: linux/drivers/ide/pci/scc_pata.c =================================================================== --- /dev/null +++ linux/drivers/ide/pci/scc_pata.c @@ -0,0 +1,858 @@ +/* + * Support for IDE interfaces on Celleb platform + * + * (C) Copyright 2006 TOSHIBA CORPORATION + * + * This code is based on drivers/ide/pci/siimage.c: + * Copyright (C) 2001-2002 Andre Hedrick + * Copyright (C) 2003 Red Hat + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define PCI_DEVICE_ID_TOSHIBA_SCC_ATA 0x01b4 + +#define SCC_PATA_NAME "scc IDE" + +#define TDVHSEL_MASTER 0x00000001 +#define TDVHSEL_SLAVE 0x00000004 + +#define MODE_JCUSFEN 0x00000080 + +#define CCKCTRL_ATARESET 0x00040000 +#define CCKCTRL_BUFCNT 0x00020000 +#define CCKCTRL_CRST 0x00010000 +#define CCKCTRL_OCLKEN 0x00000100 +#define CCKCTRL_ATACLKOEN 0x00000002 +#define CCKCTRL_LCLKEN 0x00000001 + +#define QCHCD_IOS_SS 0x00000001 + +#define QCHSD_STPDIAG 0x00020000 + +#define INTMASK_MSK 0xD1000012 +#define INTSTS_SERROR 0x80000000 +#define INTSTS_PRERR 0x40000000 +#define INTSTS_RERR 0x10000000 +#define INTSTS_ICERR 0x01000000 +#define INTSTS_BMSINT 0x00000010 +#define INTSTS_BMHE 0x00000008 +#define INTSTS_IOIRQS 0x00000004 +#define INTSTS_INTRQ 0x00000002 +#define INTSTS_ACTEINT 0x00000001 + +#define ECMODE_VALUE 0x01 + +static struct scc_ports { + unsigned long ctl, dma; + unsigned char hwif_id; /* for removing hwif from system */ +} scc_ports[MAX_HWIFS]; + +/* PIO transfer mode table */ +/* JCHST */ +static unsigned long JCHSTtbl[2][7] = { + {0x0E, 0x05, 0x02, 0x03, 0x02, 0x00, 0x00}, /* 100MHz */ + {0x13, 0x07, 0x04, 0x04, 0x03, 0x00, 0x00} /* 133MHz */ +}; + +/* JCHHT */ +static unsigned long JCHHTtbl[2][7] = { + {0x0E, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00}, /* 100MHz */ + {0x13, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00} /* 133MHz */ +}; + +/* JCHCT */ +static unsigned long JCHCTtbl[2][7] = { + {0x1D, 0x1D, 0x1C, 0x0B, 0x06, 0x00, 0x00}, /* 100MHz */ + {0x27, 0x26, 0x26, 0x0E, 0x09, 0x00, 0x00} /* 133MHz */ +}; + + +/* DMA transfer mode table */ +/* JCHDCTM/JCHDCTS */ +static unsigned long JCHDCTxtbl[2][7] = { + {0x0A, 0x06, 0x04, 0x03, 0x01, 0x00, 0x00}, /* 100MHz */ + {0x0E, 0x09, 0x06, 0x04, 0x02, 0x01, 0x00} /* 133MHz */ +}; + +/* JCSTWTM/JCSTWTS */ +static unsigned long JCSTWTxtbl[2][7] = { + {0x06, 0x04, 0x03, 0x02, 0x02, 0x02, 0x00}, /* 100MHz */ + {0x09, 0x06, 0x04, 0x02, 0x02, 0x02, 0x02} /* 133MHz */ +}; + +/* JCTSS */ +static unsigned long JCTSStbl[2][7] = { + {0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x00}, /* 100MHz */ + {0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05} /* 133MHz */ +}; + +/* JCENVT */ +static unsigned long JCENVTtbl[2][7] = { + {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00}, /* 100MHz */ + {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02} /* 133MHz */ +}; + +/* JCACTSELS/JCACTSELM */ +static unsigned long JCACTSELtbl[2][7] = { + {0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00}, /* 100MHz */ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} /* 133MHz */ +}; + + +static u8 scc_ide_inb(unsigned long port) +{ + u32 data = in_be32((void*)port); + return (u8)data; +} + +static u16 scc_ide_inw(unsigned long port) +{ + u32 data = in_be32((void*)port); + return (u16)data; +} + +static void scc_ide_insw(unsigned long port, void *addr, u32 count) +{ + u16 *ptr = (u16 *)addr; + while (count--) { + *ptr++ = le16_to_cpu(in_be32((void*)port)); + } +} + +static void scc_ide_insl(unsigned long port, void *addr, u32 count) +{ + u16 *ptr = (u16 *)addr; + while (count--) { + *ptr++ = le16_to_cpu(in_be32((void*)port)); + *ptr++ = le16_to_cpu(in_be32((void*)port)); + } +} + +static void scc_ide_outb(u8 addr, unsigned long port) +{ + out_be32((void*)port, addr); +} + +static void scc_ide_outw(u16 addr, unsigned long port) +{ + out_be32((void*)port, addr); +} + +static void +scc_ide_outbsync(ide_drive_t * drive, u8 addr, unsigned long port) +{ + ide_hwif_t *hwif = HWIF(drive); + + out_be32((void*)port, addr); + __asm__ 
__volatile__("eieio":::"memory"); + in_be32((void*)(hwif->dma_base + 0x01c)); + __asm__ __volatile__("eieio":::"memory"); +} + +static void +scc_ide_outsw(unsigned long port, void *addr, u32 count) +{ + u16 *ptr = (u16 *)addr; + while (count--) { + out_be32((void*)port, cpu_to_le16(*ptr++)); + } +} + +static void +scc_ide_outsl(unsigned long port, void *addr, u32 count) +{ + u16 *ptr = (u16 *)addr; + while (count--) { + out_be32((void*)port, cpu_to_le16(*ptr++)); + out_be32((void*)port, cpu_to_le16(*ptr++)); + } +} + +/** + * scc_ratemask - Compute available modes + * @drive: IDE drive + * + * Compute the available speeds for the devices on the interface. + * Enforce UDMA33 as a limit if there is no 80pin cable present. + */ + +static u8 scc_ratemask(ide_drive_t *drive) +{ + u8 mode = 4; + + if (!eighty_ninty_three(drive)) + mode = min(mode, (u8)1); + return mode; +} + +/** + * scc_tuneproc - tune a drive PIO mode + * @drive: drive to tune + * @mode_wanted: the target operating mode + * + * Load the timing settings for this device mode into the + * controller. + */ + +static void scc_tuneproc(ide_drive_t *drive, byte mode_wanted) +{ + ide_hwif_t *hwif = HWIF(drive); + struct scc_ports *ports = ide_get_hwifdata(hwif); + unsigned long ctl_base = ports->ctl; + unsigned long cckctrl_port = ctl_base + 0xff0; + unsigned long piosht_port = ctl_base + 0x000; + unsigned long pioct_port = ctl_base + 0x004; + unsigned long reg; + unsigned char speed = XFER_PIO_0; + int offset; + + mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 4, NULL); + switch (mode_wanted) { + case 4: + speed = XFER_PIO_4; + break; + case 3: + speed = XFER_PIO_3; + break; + case 2: + speed = XFER_PIO_2; + break; + case 1: + speed = XFER_PIO_1; + break; + case 0: + default: + speed = XFER_PIO_0; + break; + } + + reg = in_be32((void __iomem *)cckctrl_port); + if (reg & CCKCTRL_ATACLKOEN) { + offset = 1; /* 133MHz */ + } else { + offset = 0; /* 100MHz */ + } + reg = JCHSTtbl[offset][mode_wanted] << 16 | JCHHTtbl[offset][mode_wanted]; + out_be32((void __iomem *)piosht_port, reg); + reg = JCHCTtbl[offset][mode_wanted]; + out_be32((void __iomem *)pioct_port, reg); + + ide_config_drive_speed(drive, speed); +} + +/** + * scc_tune_chipset - tune a drive DMA mode + * @drive: Drive to set up + * @xferspeed: speed we want to achieve + * + * Load the timing settings for this device mode into the + * controller. 
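+ * The timing values are looked up in the JC*tbl arrays above,
+ * indexed by the bus clock (100 vs. 133MHz) and the UDMA mode.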
+ */ + +static int scc_tune_chipset(ide_drive_t *drive, byte xferspeed) +{ + ide_hwif_t *hwif = HWIF(drive); + u8 speed = ide_rate_filter(scc_ratemask(drive), xferspeed); + struct scc_ports *ports = ide_get_hwifdata(hwif); + unsigned long ctl_base = ports->ctl; + unsigned long cckctrl_port = ctl_base + 0xff0; + unsigned long mdmact_port = ctl_base + 0x008; + unsigned long mcrcst_port = ctl_base + 0x00c; + unsigned long sdmact_port = ctl_base + 0x010; + unsigned long scrcst_port = ctl_base + 0x014; + unsigned long udenvt_port = ctl_base + 0x018; + unsigned long tdvhsel_port = ctl_base + 0x020; + int is_slave = (&hwif->drives[1] == drive); + int offset, idx; + unsigned long reg; + unsigned long jcactsel; + + reg = in_be32((void __iomem *)cckctrl_port); + if (reg & CCKCTRL_ATACLKOEN) { + offset = 1; /* 133MHz */ + } else { + offset = 0; /* 100MHz */ + } + + switch (speed) { + case XFER_UDMA_6: + idx = 6; + break; + case XFER_UDMA_5: + idx = 5; + break; + case XFER_UDMA_4: + idx = 4; + break; + case XFER_UDMA_3: + idx = 3; + break; + case XFER_UDMA_2: + idx = 2; + break; + case XFER_UDMA_1: + idx = 1; + break; + case XFER_UDMA_0: + idx = 0; + break; + default: + return 1; + } + + jcactsel = JCACTSELtbl[offset][idx]; + if (is_slave) { + out_be32((void __iomem *)sdmact_port, JCHDCTxtbl[offset][idx]); + out_be32((void __iomem *)scrcst_port, JCSTWTxtbl[offset][idx]); + jcactsel = jcactsel << 2; + out_be32((void __iomem *)tdvhsel_port, (in_be32((void __iomem *)tdvhsel_port) & ~TDVHSEL_SLAVE) | jcactsel); + } else { + out_be32((void __iomem *)mdmact_port, JCHDCTxtbl[offset][idx]); + out_be32((void __iomem *)mcrcst_port, JCSTWTxtbl[offset][idx]); + out_be32((void __iomem *)tdvhsel_port, (in_be32((void __iomem *)tdvhsel_port) & ~TDVHSEL_MASTER) | jcactsel); + } + reg = JCTSStbl[offset][idx] << 16 | JCENVTtbl[offset][idx]; + out_be32((void __iomem *)udenvt_port, reg); + + return ide_config_drive_speed(drive, speed); +} + +/** + * scc_config_chipset_for_dma - configure for DMA + * @drive: drive to configure + * + * Called by scc_config_drive_for_dma(). + */ + +static int scc_config_chipset_for_dma(ide_drive_t *drive) +{ + u8 speed = ide_dma_speed(drive, scc_ratemask(drive)); + + if (!speed) + return 0; + + if (scc_tune_chipset(drive, speed)) + return 0; + + return ide_dma_enable(drive); +} + +/** + * scc_configure_drive_for_dma - set up for DMA transfers + * @drive: drive we are going to set up + * + * Set up the drive for DMA, tune the controller and drive as + * required. + * If the drive isn't suitable for DMA or we hit other problems + * then we will drop down to PIO and set up PIO appropriately. + * (return 1) + */ + +static int scc_config_drive_for_dma(ide_drive_t *drive) +{ + if (ide_use_dma(drive) && scc_config_chipset_for_dma(drive)) + return 0; + + if (ide_use_fast_pio(drive)) + scc_tuneproc(drive, 4); + + return -1; +} + +/** + * scc_ide_dma_setup - begin a DMA phase + * @drive: target device + * + * Build an IDE DMA PRD (IDE speak for scatter gather table) + * and then set up the DMA transfer registers. + * + * Returns 0 on success. If a PIO fallback is required then 1 + * is returned. + */ + +static int scc_dma_setup(ide_drive_t *drive) +{ + ide_hwif_t *hwif = drive->hwif; + struct request *rq = HWGROUP(drive)->rq; + unsigned int reading; + u8 dma_stat; + + if (rq_data_dir(rq)) + reading = 0; + else + reading = 1 << 3; + + /* fall back to pio! 
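+ * (ide_build_dmatable() returning 0 means no PRD entries could be
+ * built, so the request must be issued with PIO instead)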
*/ + if (!ide_build_dmatable(drive, rq)) { + ide_map_sg(drive, rq); + return 1; + } + + /* PRD table */ + out_be32((void __iomem *)hwif->dma_prdtable, hwif->dmatable_dma); + + /* specify r/w */ + out_be32((void __iomem *)hwif->dma_command, reading); + + /* read dma_status for INTR & ERROR flags */ + dma_stat = in_be32((void __iomem *)hwif->dma_status); + + /* clear INTR & ERROR flags */ + out_be32((void __iomem *)hwif->dma_status, dma_stat|6); + drive->waiting_for_dma = 1; + return 0; +} + + +/** + * scc_ide_dma_end - Stop DMA + * @drive: IDE drive + * + * Check and clear INT Status register. + * Then call __ide_dma_end(). + */ + +static int scc_ide_dma_end(ide_drive_t * drive) +{ + ide_hwif_t *hwif = HWIF(drive); + unsigned long intsts_port = hwif->dma_base + 0x014; + u32 reg; + + while (1) { + reg = in_be32((void __iomem *)intsts_port); + + if (reg & INTSTS_SERROR) { + printk(KERN_WARNING "%s: SERROR\n", SCC_PATA_NAME); + out_be32((void __iomem *)intsts_port, INTSTS_SERROR|INTSTS_BMSINT); + + out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); + continue; + } + + if (reg & INTSTS_PRERR) { + u32 maea0, maec0; + unsigned long ctl_base = hwif->config_data; + + maea0 = in_be32((void __iomem *)(ctl_base + 0xF50)); + maec0 = in_be32((void __iomem *)(ctl_base + 0xF54)); + + printk(KERN_WARNING "%s: PRERR [addr:%x cmd:%x]\n", SCC_PATA_NAME, maea0, maec0); + + out_be32((void __iomem *)intsts_port, INTSTS_PRERR|INTSTS_BMSINT); + + out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); + continue; + } + + if (reg & INTSTS_RERR) { + printk(KERN_WARNING "%s: Response Error\n", SCC_PATA_NAME); + out_be32((void __iomem *)intsts_port, INTSTS_RERR|INTSTS_BMSINT); + + out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); + continue; + } + + if (reg & INTSTS_ICERR) { + out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); + + printk(KERN_WARNING "%s: Illegal Configuration\n", SCC_PATA_NAME); + out_be32((void __iomem *)intsts_port, INTSTS_ICERR|INTSTS_BMSINT); + continue; + } + + if (reg & INTSTS_BMSINT) { + printk(KERN_WARNING "%s: Internal Bus Error\n", SCC_PATA_NAME); + out_be32((void __iomem *)intsts_port, INTSTS_BMSINT); + + ide_do_reset(drive); + continue; + } + + if (reg & INTSTS_BMHE) { + out_be32((void __iomem *)intsts_port, INTSTS_BMHE); + continue; + } + + if (reg & INTSTS_ACTEINT) { + out_be32((void __iomem *)intsts_port, INTSTS_ACTEINT); + continue; + } + + if (reg & INTSTS_IOIRQS) { + out_be32((void __iomem *)intsts_port, INTSTS_IOIRQS); + continue; + } + break; + } + + return __ide_dma_end(drive); +} + +/* returns 1 if dma irq issued, 0 otherwise */ +static int scc_dma_test_irq(ide_drive_t *drive) +{ + ide_hwif_t *hwif = HWIF(drive); + u8 dma_stat = hwif->INB(hwif->dma_status); + + /* return 1 if INTR asserted */ + if ((dma_stat & 4) == 4) + return 1; + + /* Workaround for PTERADD: emulate DMA_INTR when + * - IDE_STATUS[ERR] = 1 + * - INT_STATUS[INTRQ] = 1 + * - DMA_STATUS[IORACTA] = 1 + */ + if (in_be32((void __iomem *)IDE_ALTSTATUS_REG) & ERR_STAT && + in_be32((void __iomem *)(hwif->dma_base + 0x014)) & INTSTS_INTRQ && + dma_stat & 1) + return 1; + + if (!drive->waiting_for_dma) + printk(KERN_WARNING "%s: (%s) called while not waiting\n", + drive->name, __FUNCTION__); + return 0; +} + +/** + * setup_mmio_scc - map CTRL/BMID region + * @dev: PCI device we are configuring + * @name: device 
name + * + */ + +static int setup_mmio_scc (struct pci_dev *dev, const char *name) +{ + unsigned long ctl_base = pci_resource_start(dev, 0); + unsigned long dma_base = pci_resource_start(dev, 1); + unsigned long ctl_size = pci_resource_len(dev, 0); + unsigned long dma_size = pci_resource_len(dev, 1); + void *ctl_addr; + void *dma_addr; + int i; + + for (i = 0; i < MAX_HWIFS; i++) { + if (scc_ports[i].ctl == 0) + break; + } + if (i >= MAX_HWIFS) + return -ENOMEM; + + if (!request_mem_region(ctl_base, ctl_size, name)) { + printk(KERN_WARNING "%s: IDE controller MMIO ports not available.\n", SCC_PATA_NAME); + goto fail_0; + } + + if (!request_mem_region(dma_base, dma_size, name)) { + printk(KERN_WARNING "%s: IDE controller MMIO ports not available.\n", SCC_PATA_NAME); + goto fail_1; + } + + if ((ctl_addr = ioremap(ctl_base, ctl_size)) == NULL) + goto fail_2; + + if ((dma_addr = ioremap(dma_base, dma_size)) == NULL) + goto fail_3; + + pci_set_master(dev); + scc_ports[i].ctl = (unsigned long)ctl_addr; + scc_ports[i].dma = (unsigned long)dma_addr; + pci_set_drvdata(dev, (void *) &scc_ports[i]); + + return 1; + + fail_3: + iounmap(ctl_addr); + fail_2: + release_mem_region(dma_base, dma_size); + fail_1: + release_mem_region(ctl_base, ctl_size); + fail_0: + return -ENOMEM; +} + +/** + * init_setup_scc - set up an SCC PATA Controller + * @dev: PCI device + * @d: IDE PCI device + * + * Perform the initial set up for this device. + */ + +static int __devinit init_setup_scc(struct pci_dev *dev, ide_pci_device_t *d) +{ + unsigned long ctl_base; + unsigned long dma_base; + unsigned long cckctrl_port; + unsigned long intmask_port; + unsigned long mode_port; + unsigned long ecmode_port; + unsigned long dma_status_port; + u32 reg = 0; + struct scc_ports *ports; + int rc; + + rc = setup_mmio_scc(dev, d->name); + if (rc < 0) { + return rc; + } + + ports = pci_get_drvdata(dev); + ctl_base = ports->ctl; + dma_base = ports->dma; + cckctrl_port = ctl_base + 0xff0; + intmask_port = dma_base + 0x010; + mode_port = ctl_base + 0x024; + ecmode_port = ctl_base + 0xf00; + dma_status_port = dma_base + 0x004; + + /* controller initialization */ + reg = 0; + out_be32((void*)cckctrl_port, reg); + reg |= CCKCTRL_ATACLKOEN; + out_be32((void*)cckctrl_port, reg); + reg |= CCKCTRL_LCLKEN | CCKCTRL_OCLKEN; + out_be32((void*)cckctrl_port, reg); + reg |= CCKCTRL_CRST; + out_be32((void*)cckctrl_port, reg); + + for (;;) { + reg = in_be32((void*)cckctrl_port); + if (reg & CCKCTRL_CRST) + break; + udelay(5000); + } + + reg |= CCKCTRL_ATARESET; + out_be32((void*)cckctrl_port, reg); + + out_be32((void*)ecmode_port, ECMODE_VALUE); + out_be32((void*)mode_port, MODE_JCUSFEN); + out_be32((void*)intmask_port, INTMASK_MSK); + + return ide_setup_pci_device(dev, d); +} + +/** + * init_mmio_iops_scc - set up the iops for MMIO + * @hwif: interface to set up + * + */ + +static void __devinit init_mmio_iops_scc(ide_hwif_t *hwif) +{ + struct pci_dev *dev = hwif->pci_dev; + struct scc_ports *ports = pci_get_drvdata(dev); + unsigned long dma_base = ports->dma; + + ide_set_hwifdata(hwif, ports); + + hwif->INB = scc_ide_inb; + hwif->INW = scc_ide_inw; + hwif->INSW = scc_ide_insw; + hwif->INSL = scc_ide_insl; + hwif->OUTB = scc_ide_outb; + hwif->OUTBSYNC = scc_ide_outbsync; + hwif->OUTW = scc_ide_outw; + hwif->OUTSW = scc_ide_outsw; + hwif->OUTSL = scc_ide_outsl; + + hwif->io_ports[IDE_DATA_OFFSET] = dma_base + 0x20; + hwif->io_ports[IDE_ERROR_OFFSET] = dma_base + 0x24; + hwif->io_ports[IDE_NSECTOR_OFFSET] = dma_base + 0x28; + 
hwif->io_ports[IDE_SECTOR_OFFSET] = dma_base + 0x2c; + hwif->io_ports[IDE_LCYL_OFFSET] = dma_base + 0x30; + hwif->io_ports[IDE_HCYL_OFFSET] = dma_base + 0x34; + hwif->io_ports[IDE_SELECT_OFFSET] = dma_base + 0x38; + hwif->io_ports[IDE_STATUS_OFFSET] = dma_base + 0x3c; + hwif->io_ports[IDE_CONTROL_OFFSET] = dma_base + 0x40; + + hwif->irq = hwif->pci_dev->irq; + hwif->dma_base = dma_base; + hwif->config_data = ports->ctl; + hwif->mmio = 1; +} + +/** + * init_iops_scc - set up iops + * @hwif: interface to set up + * + * Do the basic setup for the SCC hardware interface + * and then do the MMIO setup. + */ + +static void __devinit init_iops_scc(ide_hwif_t *hwif) +{ + struct pci_dev *dev = hwif->pci_dev; + hwif->hwif_data = NULL; + if (pci_get_drvdata(dev) == NULL) + return; + init_mmio_iops_scc(hwif); +} + +/** + * init_hwif_scc - set up hwif + * @hwif: interface to set up + * + * We do the basic set up of the interface structure. The SCC + * requires several custom handlers so we override the default + * ide DMA handlers appropriately. + */ + +static void __devinit init_hwif_scc(ide_hwif_t *hwif) +{ + struct scc_ports *ports = ide_get_hwifdata(hwif); + + ports->hwif_id = hwif->index; + + hwif->dma_command = hwif->dma_base; + hwif->dma_status = hwif->dma_base + 0x04; + hwif->dma_prdtable = hwif->dma_base + 0x08; + + /* PTERADD */ + out_be32((void __iomem *)(hwif->dma_base + 0x018), hwif->dmatable_dma); + + hwif->dma_setup = scc_dma_setup; + hwif->ide_dma_end = scc_ide_dma_end; + hwif->speedproc = scc_tune_chipset; + hwif->tuneproc = scc_tuneproc; + hwif->ide_dma_check = scc_config_drive_for_dma; + hwif->ide_dma_test_irq = scc_dma_test_irq; + + hwif->drives[0].autotune = IDE_TUNE_AUTO; + hwif->drives[1].autotune = IDE_TUNE_AUTO; + + if (in_be32((void __iomem *)(hwif->config_data + 0xff0)) & CCKCTRL_ATACLKOEN) { + hwif->ultra_mask = 0x7f; /* 133MHz */ + } else { + hwif->ultra_mask = 0x3f; /* 100MHz */ + } + hwif->mwdma_mask = 0x00; + hwif->swdma_mask = 0x00; + hwif->atapi_dma = 1; + + /* we support 80c cable only. */ + hwif->udma_four = 1; + + hwif->autodma = 0; + if (!noautodma) + hwif->autodma = 1; + hwif->drives[0].autodma = hwif->autodma; + hwif->drives[1].autodma = hwif->autodma; +} + +#define DECLARE_SCC_DEV(name_str) \ + { \ + .name = name_str, \ + .init_setup = init_setup_scc, \ + .init_iops = init_iops_scc, \ + .init_hwif = init_hwif_scc, \ + .channels = 1, \ + .autodma = AUTODMA, \ + .bootable = ON_BOARD, \ + } + +static ide_pci_device_t scc_chipsets[] __devinitdata = { + /* 0 */ DECLARE_SCC_DEV("sccIDE"), +}; + +/** + * scc_init_one - pci layer discovery entry + * @dev: PCI device + * @id: ident table entry + * + * Called by the PCI code when it finds an SCC PATA controller. + * We then use the IDE PCI generic helper to do most of the work. + */ + +static int __devinit scc_init_one(struct pci_dev *dev, const struct pci_device_id *id) +{ + ide_pci_device_t *d = &scc_chipsets[id->driver_data]; + return d->init_setup(dev, d); +} + +/** + * scc_remove - pci layer remove entry + * @dev: PCI device + * + * Called by the PCI code when it removes an SCC PATA controller. 
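+ * Tear-down happens in reverse order of setup: the DMA table is
+ * freed, the hwif unregistered, then the MMIO mappings and memory
+ * regions are released.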
+ */ + +static void __devexit scc_remove(struct pci_dev *dev) +{ + struct scc_ports *ports = pci_get_drvdata(dev); + ide_hwif_t *hwif = &ide_hwifs[ports->hwif_id]; + unsigned long ctl_base = pci_resource_start(dev, 0); + unsigned long dma_base = pci_resource_start(dev, 1); + unsigned long ctl_size = pci_resource_len(dev, 0); + unsigned long dma_size = pci_resource_len(dev, 1); + + if (hwif->dmatable_cpu) { + pci_free_consistent(hwif->pci_dev, + PRD_ENTRIES * PRD_BYTES, + hwif->dmatable_cpu, + hwif->dmatable_dma); + hwif->dmatable_cpu = NULL; + } + + ide_unregister(hwif->index); + + hwif->chipset = ide_unknown; + iounmap((void*)ports->dma); + iounmap((void*)ports->ctl); + release_mem_region(dma_base, dma_size); + release_mem_region(ctl_base, ctl_size); + memset(ports, 0, sizeof(*ports)); +} + +static struct pci_device_id scc_pci_tbl[] = { + { PCI_VENDOR_ID_TOSHIBA_2, PCI_DEVICE_ID_TOSHIBA_SCC_ATA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + { 0, }, +}; +MODULE_DEVICE_TABLE(pci, scc_pci_tbl); + +static struct pci_driver driver = { + .name = "SCC IDE", + .id_table = scc_pci_tbl, + .probe = scc_init_one, + .remove = scc_remove, +}; + +static int scc_ide_init(void) +{ + return ide_pci_register_driver(&driver); +} + +module_init(scc_ide_init); +/* -- No exit code? +static void scc_ide_exit(void) +{ + ide_pci_unregister_driver(&driver); +} +module_exit(scc_ide_exit); + */ + + +MODULE_DESCRIPTION("PCI driver module for Toshiba SCC IDE"); +MODULE_LICENSE("GPL"); Index: linux/drivers/ide/ppc/scc_pata.c =================================================================== --- linux.orig/drivers/ide/ppc/scc_pata.c +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Support for IDE interfaces on Celleb platform - * - * (C) Copyright 2006 TOSHIBA CORPORATION - * - * This code is based on drivers/ide/pci/siimage.c: - * Copyright (C) 2001-2002 Andre Hedrick - * Copyright (C) 2003 Red Hat - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
- */ - -#include -#include -#include -#include -#include -#include -#include - -#define PCI_DEVICE_ID_TOSHIBA_SCC_ATA 0x01b4 - -#define SCC_PATA_NAME "scc IDE" - -#define TDVHSEL_MASTER 0x00000001 -#define TDVHSEL_SLAVE 0x00000004 - -#define MODE_JCUSFEN 0x00000080 - -#define CCKCTRL_ATARESET 0x00040000 -#define CCKCTRL_BUFCNT 0x00020000 -#define CCKCTRL_CRST 0x00010000 -#define CCKCTRL_OCLKEN 0x00000100 -#define CCKCTRL_ATACLKOEN 0x00000002 -#define CCKCTRL_LCLKEN 0x00000001 - -#define QCHCD_IOS_SS 0x00000001 - -#define QCHSD_STPDIAG 0x00020000 - -#define INTMASK_MSK 0xD1000012 -#define INTSTS_SERROR 0x80000000 -#define INTSTS_PRERR 0x40000000 -#define INTSTS_RERR 0x10000000 -#define INTSTS_ICERR 0x01000000 -#define INTSTS_BMSINT 0x00000010 -#define INTSTS_BMHE 0x00000008 -#define INTSTS_IOIRQS 0x00000004 -#define INTSTS_INTRQ 0x00000002 -#define INTSTS_ACTEINT 0x00000001 - -#define ECMODE_VALUE 0x01 - -static struct scc_ports { - unsigned long ctl, dma; - unsigned char hwif_id; /* for removing hwif from system */ -} scc_ports[MAX_HWIFS]; - -/* PIO transfer mode table */ -/* JCHST */ -static unsigned long JCHSTtbl[2][7] = { - {0x0E, 0x05, 0x02, 0x03, 0x02, 0x00, 0x00}, /* 100MHz */ - {0x13, 0x07, 0x04, 0x04, 0x03, 0x00, 0x00} /* 133MHz */ -}; - -/* JCHHT */ -static unsigned long JCHHTtbl[2][7] = { - {0x0E, 0x02, 0x02, 0x02, 0x02, 0x00, 0x00}, /* 100MHz */ - {0x13, 0x03, 0x03, 0x03, 0x03, 0x00, 0x00} /* 133MHz */ -}; - -/* JCHCT */ -static unsigned long JCHCTtbl[2][7] = { - {0x1D, 0x1D, 0x1C, 0x0B, 0x06, 0x00, 0x00}, /* 100MHz */ - {0x27, 0x26, 0x26, 0x0E, 0x09, 0x00, 0x00} /* 133MHz */ -}; - - -/* DMA transfer mode table */ -/* JCHDCTM/JCHDCTS */ -static unsigned long JCHDCTxtbl[2][7] = { - {0x0A, 0x06, 0x04, 0x03, 0x01, 0x00, 0x00}, /* 100MHz */ - {0x0E, 0x09, 0x06, 0x04, 0x02, 0x01, 0x00} /* 133MHz */ -}; - -/* JCSTWTM/JCSTWTS */ -static unsigned long JCSTWTxtbl[2][7] = { - {0x06, 0x04, 0x03, 0x02, 0x02, 0x02, 0x00}, /* 100MHz */ - {0x09, 0x06, 0x04, 0x02, 0x02, 0x02, 0x02} /* 133MHz */ -}; - -/* JCTSS */ -static unsigned long JCTSStbl[2][7] = { - {0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x00}, /* 100MHz */ - {0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05} /* 133MHz */ -}; - -/* JCENVT */ -static unsigned long JCENVTtbl[2][7] = { - {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00}, /* 100MHz */ - {0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02} /* 133MHz */ -}; - -/* JCACTSELS/JCACTSELM */ -static unsigned long JCACTSELtbl[2][7] = { - {0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00}, /* 100MHz */ - {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} /* 133MHz */ -}; - - -static u8 scc_ide_inb(unsigned long port) -{ - u32 data = in_be32((void*)port); - return (u8)data; -} - -static u16 scc_ide_inw(unsigned long port) -{ - u32 data = in_be32((void*)port); - return (u16)data; -} - -static void scc_ide_insw(unsigned long port, void *addr, u32 count) -{ - u16 *ptr = (u16 *)addr; - while (count--) { - *ptr++ = le16_to_cpu(in_be32((void*)port)); - } -} - -static void scc_ide_insl(unsigned long port, void *addr, u32 count) -{ - u16 *ptr = (u16 *)addr; - while (count--) { - *ptr++ = le16_to_cpu(in_be32((void*)port)); - *ptr++ = le16_to_cpu(in_be32((void*)port)); - } -} - -static void scc_ide_outb(u8 addr, unsigned long port) -{ - out_be32((void*)port, addr); -} - -static void scc_ide_outw(u16 addr, unsigned long port) -{ - out_be32((void*)port, addr); -} - -static void -scc_ide_outbsync(ide_drive_t * drive, u8 addr, unsigned long port) -{ - ide_hwif_t *hwif = HWIF(drive); - - out_be32((void*)port, addr); - __asm__ 
__volatile__("eieio":::"memory"); - in_be32((void*)(hwif->dma_base + 0x01c)); - __asm__ __volatile__("eieio":::"memory"); -} - -static void -scc_ide_outsw(unsigned long port, void *addr, u32 count) -{ - u16 *ptr = (u16 *)addr; - while (count--) { - out_be32((void*)port, cpu_to_le16(*ptr++)); - } -} - -static void -scc_ide_outsl(unsigned long port, void *addr, u32 count) -{ - u16 *ptr = (u16 *)addr; - while (count--) { - out_be32((void*)port, cpu_to_le16(*ptr++)); - out_be32((void*)port, cpu_to_le16(*ptr++)); - } -} - -/** - * scc_ratemask - Compute available modes - * @drive: IDE drive - * - * Compute the available speeds for the devices on the interface. - * Enforce UDMA33 as a limit if there is no 80pin cable present. - */ - -static u8 scc_ratemask(ide_drive_t *drive) -{ - u8 mode = 4; - - if (!eighty_ninty_three(drive)) - mode = min(mode, (u8)1); - return mode; -} - -/** - * scc_tuneproc - tune a drive PIO mode - * @drive: drive to tune - * @mode_wanted: the target operating mode - * - * Load the timing settings for this device mode into the - * controller. - */ - -static void scc_tuneproc(ide_drive_t *drive, byte mode_wanted) -{ - ide_hwif_t *hwif = HWIF(drive); - struct scc_ports *ports = ide_get_hwifdata(hwif); - unsigned long ctl_base = ports->ctl; - unsigned long cckctrl_port = ctl_base + 0xff0; - unsigned long piosht_port = ctl_base + 0x000; - unsigned long pioct_port = ctl_base + 0x004; - unsigned long reg; - unsigned char speed = XFER_PIO_0; - int offset; - - mode_wanted = ide_get_best_pio_mode(drive, mode_wanted, 4, NULL); - switch (mode_wanted) { - case 4: - speed = XFER_PIO_4; - break; - case 3: - speed = XFER_PIO_3; - break; - case 2: - speed = XFER_PIO_2; - break; - case 1: - speed = XFER_PIO_1; - break; - case 0: - default: - speed = XFER_PIO_0; - break; - } - - reg = in_be32((void __iomem *)cckctrl_port); - if (reg & CCKCTRL_ATACLKOEN) { - offset = 1; /* 133MHz */ - } else { - offset = 0; /* 100MHz */ - } - reg = JCHSTtbl[offset][mode_wanted] << 16 | JCHHTtbl[offset][mode_wanted]; - out_be32((void __iomem *)piosht_port, reg); - reg = JCHCTtbl[offset][mode_wanted]; - out_be32((void __iomem *)pioct_port, reg); - - ide_config_drive_speed(drive, speed); -} - -/** - * scc_tune_chipset - tune a drive DMA mode - * @drive: Drive to set up - * @xferspeed: speed we want to achieve - * - * Load the timing settings for this device mode into the - * controller. 
- */ - -static int scc_tune_chipset(ide_drive_t *drive, byte xferspeed) -{ - ide_hwif_t *hwif = HWIF(drive); - u8 speed = ide_rate_filter(scc_ratemask(drive), xferspeed); - struct scc_ports *ports = ide_get_hwifdata(hwif); - unsigned long ctl_base = ports->ctl; - unsigned long cckctrl_port = ctl_base + 0xff0; - unsigned long mdmact_port = ctl_base + 0x008; - unsigned long mcrcst_port = ctl_base + 0x00c; - unsigned long sdmact_port = ctl_base + 0x010; - unsigned long scrcst_port = ctl_base + 0x014; - unsigned long udenvt_port = ctl_base + 0x018; - unsigned long tdvhsel_port = ctl_base + 0x020; - int is_slave = (&hwif->drives[1] == drive); - int offset, idx; - unsigned long reg; - unsigned long jcactsel; - - reg = in_be32((void __iomem *)cckctrl_port); - if (reg & CCKCTRL_ATACLKOEN) { - offset = 1; /* 133MHz */ - } else { - offset = 0; /* 100MHz */ - } - - switch (speed) { - case XFER_UDMA_6: - idx = 6; - break; - case XFER_UDMA_5: - idx = 5; - break; - case XFER_UDMA_4: - idx = 4; - break; - case XFER_UDMA_3: - idx = 3; - break; - case XFER_UDMA_2: - idx = 2; - break; - case XFER_UDMA_1: - idx = 1; - break; - case XFER_UDMA_0: - idx = 0; - break; - default: - return 1; - } - - jcactsel = JCACTSELtbl[offset][idx]; - if (is_slave) { - out_be32((void __iomem *)sdmact_port, JCHDCTxtbl[offset][idx]); - out_be32((void __iomem *)scrcst_port, JCSTWTxtbl[offset][idx]); - jcactsel = jcactsel << 2; - out_be32((void __iomem *)tdvhsel_port, (in_be32((void __iomem *)tdvhsel_port) & ~TDVHSEL_SLAVE) | jcactsel); - } else { - out_be32((void __iomem *)mdmact_port, JCHDCTxtbl[offset][idx]); - out_be32((void __iomem *)mcrcst_port, JCSTWTxtbl[offset][idx]); - out_be32((void __iomem *)tdvhsel_port, (in_be32((void __iomem *)tdvhsel_port) & ~TDVHSEL_MASTER) | jcactsel); - } - reg = JCTSStbl[offset][idx] << 16 | JCENVTtbl[offset][idx]; - out_be32((void __iomem *)udenvt_port, reg); - - return ide_config_drive_speed(drive, speed); -} - -/** - * scc_config_chipset_for_dma - configure for DMA - * @drive: drive to configure - * - * Called by scc_config_drive_for_dma(). - */ - -static int scc_config_chipset_for_dma(ide_drive_t *drive) -{ - u8 speed = ide_dma_speed(drive, scc_ratemask(drive)); - - if (!speed) - return 0; - - if (scc_tune_chipset(drive, speed)) - return 0; - - return ide_dma_enable(drive); -} - -/** - * scc_configure_drive_for_dma - set up for DMA transfers - * @drive: drive we are going to set up - * - * Set up the drive for DMA, tune the controller and drive as - * required. - * If the drive isn't suitable for DMA or we hit other problems - * then we will drop down to PIO and set up PIO appropriately. - * (return 1) - */ - -static int scc_config_drive_for_dma(ide_drive_t *drive) -{ - if (ide_use_dma(drive) && scc_config_chipset_for_dma(drive)) - return 0; - - if (ide_use_fast_pio(drive)) - scc_tuneproc(drive, 4); - - return -1; -} - -/** - * scc_ide_dma_setup - begin a DMA phase - * @drive: target device - * - * Build an IDE DMA PRD (IDE speak for scatter gather table) - * and then set up the DMA transfer registers. - * - * Returns 0 on success. If a PIO fallback is required then 1 - * is returned. - */ - -static int scc_dma_setup(ide_drive_t *drive) -{ - ide_hwif_t *hwif = drive->hwif; - struct request *rq = HWGROUP(drive)->rq; - unsigned int reading; - u8 dma_stat; - - if (rq_data_dir(rq)) - reading = 0; - else - reading = 1 << 3; - - /* fall back to pio! 
*/ - if (!ide_build_dmatable(drive, rq)) { - ide_map_sg(drive, rq); - return 1; - } - - /* PRD table */ - out_be32((void __iomem *)hwif->dma_prdtable, hwif->dmatable_dma); - - /* specify r/w */ - out_be32((void __iomem *)hwif->dma_command, reading); - - /* read dma_status for INTR & ERROR flags */ - dma_stat = in_be32((void __iomem *)hwif->dma_status); - - /* clear INTR & ERROR flags */ - out_be32((void __iomem *)hwif->dma_status, dma_stat|6); - drive->waiting_for_dma = 1; - return 0; -} - - -/** - * scc_ide_dma_end - Stop DMA - * @drive: IDE drive - * - * Check and clear INT Status register. - * Then call __ide_dma_end(). - */ - -static int scc_ide_dma_end(ide_drive_t * drive) -{ - ide_hwif_t *hwif = HWIF(drive); - unsigned long intsts_port = hwif->dma_base + 0x014; - u32 reg; - - while (1) { - reg = in_be32((void __iomem *)intsts_port); - - if (reg & INTSTS_SERROR) { - printk(KERN_WARNING "%s: SERROR\n", SCC_PATA_NAME); - out_be32((void __iomem *)intsts_port, INTSTS_SERROR|INTSTS_BMSINT); - - out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); - continue; - } - - if (reg & INTSTS_PRERR) { - u32 maea0, maec0; - unsigned long ctl_base = hwif->config_data; - - maea0 = in_be32((void __iomem *)(ctl_base + 0xF50)); - maec0 = in_be32((void __iomem *)(ctl_base + 0xF54)); - - printk(KERN_WARNING "%s: PRERR [addr:%x cmd:%x]\n", SCC_PATA_NAME, maea0, maec0); - - out_be32((void __iomem *)intsts_port, INTSTS_PRERR|INTSTS_BMSINT); - - out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); - continue; - } - - if (reg & INTSTS_RERR) { - printk(KERN_WARNING "%s: Response Error\n", SCC_PATA_NAME); - out_be32((void __iomem *)intsts_port, INTSTS_RERR|INTSTS_BMSINT); - - out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); - continue; - } - - if (reg & INTSTS_ICERR) { - out_be32((void __iomem *)hwif->dma_command, in_be32((void __iomem *)hwif->dma_command) & ~QCHCD_IOS_SS); - - printk(KERN_WARNING "%s: Illegal Configuration\n", SCC_PATA_NAME); - out_be32((void __iomem *)intsts_port, INTSTS_ICERR|INTSTS_BMSINT); - continue; - } - - if (reg & INTSTS_BMSINT) { - printk(KERN_WARNING "%s: Internal Bus Error\n", SCC_PATA_NAME); - out_be32((void __iomem *)intsts_port, INTSTS_BMSINT); - - ide_do_reset(drive); - continue; - } - - if (reg & INTSTS_BMHE) { - out_be32((void __iomem *)intsts_port, INTSTS_BMHE); - continue; - } - - if (reg & INTSTS_ACTEINT) { - out_be32((void __iomem *)intsts_port, INTSTS_ACTEINT); - continue; - } - - if (reg & INTSTS_IOIRQS) { - out_be32((void __iomem *)intsts_port, INTSTS_IOIRQS); - continue; - } - break; - } - - return __ide_dma_end(drive); -} - -/* returns 1 if dma irq issued, 0 otherwise */ -static int scc_dma_test_irq(ide_drive_t *drive) -{ - ide_hwif_t *hwif = HWIF(drive); - u8 dma_stat = hwif->INB(hwif->dma_status); - - /* return 1 if INTR asserted */ - if ((dma_stat & 4) == 4) - return 1; - - /* Workaround for PTERADD: emulate DMA_INTR when - * - IDE_STATUS[ERR] = 1 - * - INT_STATUS[INTRQ] = 1 - * - DMA_STATUS[IORACTA] = 1 - */ - if (in_be32((void __iomem *)IDE_ALTSTATUS_REG) & ERR_STAT && - in_be32((void __iomem *)(hwif->dma_base + 0x014)) & INTSTS_INTRQ && - dma_stat & 1) - return 1; - - if (!drive->waiting_for_dma) - printk(KERN_WARNING "%s: (%s) called while not waiting\n", - drive->name, __FUNCTION__); - return 0; -} - -/** - * setup_mmio_scc - map CTRL/BMID region - * @dev: PCI device we are configuring - * @name: device 
name - * - */ - -static int setup_mmio_scc (struct pci_dev *dev, const char *name) -{ - unsigned long ctl_base = pci_resource_start(dev, 0); - unsigned long dma_base = pci_resource_start(dev, 1); - unsigned long ctl_size = pci_resource_len(dev, 0); - unsigned long dma_size = pci_resource_len(dev, 1); - void *ctl_addr; - void *dma_addr; - int i; - - for (i = 0; i < MAX_HWIFS; i++) { - if (scc_ports[i].ctl == 0) - break; - } - if (i >= MAX_HWIFS) - return -ENOMEM; - - if (!request_mem_region(ctl_base, ctl_size, name)) { - printk(KERN_WARNING "%s: IDE controller MMIO ports not available.\n", SCC_PATA_NAME); - goto fail_0; - } - - if (!request_mem_region(dma_base, dma_size, name)) { - printk(KERN_WARNING "%s: IDE controller MMIO ports not available.\n", SCC_PATA_NAME); - goto fail_1; - } - - if ((ctl_addr = ioremap(ctl_base, ctl_size)) == NULL) - goto fail_2; - - if ((dma_addr = ioremap(dma_base, dma_size)) == NULL) - goto fail_3; - - pci_set_master(dev); - scc_ports[i].ctl = (unsigned long)ctl_addr; - scc_ports[i].dma = (unsigned long)dma_addr; - pci_set_drvdata(dev, (void *) &scc_ports[i]); - - return 1; - - fail_3: - iounmap(ctl_addr); - fail_2: - release_mem_region(dma_base, dma_size); - fail_1: - release_mem_region(ctl_base, ctl_size); - fail_0: - return -ENOMEM; -} - -/** - * init_setup_scc - set up an SCC PATA Controller - * @dev: PCI device - * @d: IDE PCI device - * - * Perform the initial set up for this device. - */ - -static int __devinit init_setup_scc(struct pci_dev *dev, ide_pci_device_t *d) -{ - unsigned long ctl_base; - unsigned long dma_base; - unsigned long cckctrl_port; - unsigned long intmask_port; - unsigned long mode_port; - unsigned long ecmode_port; - unsigned long dma_status_port; - u32 reg = 0; - struct scc_ports *ports; - int rc; - - rc = setup_mmio_scc(dev, d->name); - if (rc < 0) { - return rc; - } - - ports = pci_get_drvdata(dev); - ctl_base = ports->ctl; - dma_base = ports->dma; - cckctrl_port = ctl_base + 0xff0; - intmask_port = dma_base + 0x010; - mode_port = ctl_base + 0x024; - ecmode_port = ctl_base + 0xf00; - dma_status_port = dma_base + 0x004; - - /* controller initialization */ - reg = 0; - out_be32((void*)cckctrl_port, reg); - reg |= CCKCTRL_ATACLKOEN; - out_be32((void*)cckctrl_port, reg); - reg |= CCKCTRL_LCLKEN | CCKCTRL_OCLKEN; - out_be32((void*)cckctrl_port, reg); - reg |= CCKCTRL_CRST; - out_be32((void*)cckctrl_port, reg); - - for (;;) { - reg = in_be32((void*)cckctrl_port); - if (reg & CCKCTRL_CRST) - break; - udelay(5000); - } - - reg |= CCKCTRL_ATARESET; - out_be32((void*)cckctrl_port, reg); - - out_be32((void*)ecmode_port, ECMODE_VALUE); - out_be32((void*)mode_port, MODE_JCUSFEN); - out_be32((void*)intmask_port, INTMASK_MSK); - - return ide_setup_pci_device(dev, d); -} - -/** - * init_mmio_iops_scc - set up the iops for MMIO - * @hwif: interface to set up - * - */ - -static void __devinit init_mmio_iops_scc(ide_hwif_t *hwif) -{ - struct pci_dev *dev = hwif->pci_dev; - struct scc_ports *ports = pci_get_drvdata(dev); - unsigned long dma_base = ports->dma; - - ide_set_hwifdata(hwif, ports); - - hwif->INB = scc_ide_inb; - hwif->INW = scc_ide_inw; - hwif->INSW = scc_ide_insw; - hwif->INSL = scc_ide_insl; - hwif->OUTB = scc_ide_outb; - hwif->OUTBSYNC = scc_ide_outbsync; - hwif->OUTW = scc_ide_outw; - hwif->OUTSW = scc_ide_outsw; - hwif->OUTSL = scc_ide_outsl; - - hwif->io_ports[IDE_DATA_OFFSET] = dma_base + 0x20; - hwif->io_ports[IDE_ERROR_OFFSET] = dma_base + 0x24; - hwif->io_ports[IDE_NSECTOR_OFFSET] = dma_base + 0x28; - 
hwif->io_ports[IDE_SECTOR_OFFSET] = dma_base + 0x2c; - hwif->io_ports[IDE_LCYL_OFFSET] = dma_base + 0x30; - hwif->io_ports[IDE_HCYL_OFFSET] = dma_base + 0x34; - hwif->io_ports[IDE_SELECT_OFFSET] = dma_base + 0x38; - hwif->io_ports[IDE_STATUS_OFFSET] = dma_base + 0x3c; - hwif->io_ports[IDE_CONTROL_OFFSET] = dma_base + 0x40; - - hwif->irq = hwif->pci_dev->irq; - hwif->dma_base = dma_base; - hwif->config_data = ports->ctl; - hwif->mmio = 1; -} - -/** - * init_iops_scc - set up iops - * @hwif: interface to set up - * - * Do the basic setup for the SCC hardware interface - * and then do the MMIO setup. - */ - -static void __devinit init_iops_scc(ide_hwif_t *hwif) -{ - struct pci_dev *dev = hwif->pci_dev; - hwif->hwif_data = NULL; - if (pci_get_drvdata(dev) == NULL) - return; - init_mmio_iops_scc(hwif); -} - -/** - * init_hwif_scc - set up hwif - * @hwif: interface to set up - * - * We do the basic set up of the interface structure. The SCC - * requires several custom handlers so we override the default - * ide DMA handlers appropriately. - */ - -static void __devinit init_hwif_scc(ide_hwif_t *hwif) -{ - struct scc_ports *ports = ide_get_hwifdata(hwif); - - ports->hwif_id = hwif->index; - - hwif->dma_command = hwif->dma_base; - hwif->dma_status = hwif->dma_base + 0x04; - hwif->dma_prdtable = hwif->dma_base + 0x08; - - /* PTERADD */ - out_be32((void __iomem *)(hwif->dma_base + 0x018), hwif->dmatable_dma); - - hwif->dma_setup = scc_dma_setup; - hwif->ide_dma_end = scc_ide_dma_end; - hwif->speedproc = scc_tune_chipset; - hwif->tuneproc = scc_tuneproc; - hwif->ide_dma_check = scc_config_drive_for_dma; - hwif->ide_dma_test_irq = scc_dma_test_irq; - - hwif->drives[0].autotune = IDE_TUNE_AUTO; - hwif->drives[1].autotune = IDE_TUNE_AUTO; - - if (in_be32((void __iomem *)(hwif->config_data + 0xff0)) & CCKCTRL_ATACLKOEN) { - hwif->ultra_mask = 0x7f; /* 133MHz */ - } else { - hwif->ultra_mask = 0x3f; /* 100MHz */ - } - hwif->mwdma_mask = 0x00; - hwif->swdma_mask = 0x00; - hwif->atapi_dma = 1; - - /* we support 80c cable only. */ - hwif->udma_four = 1; - - hwif->autodma = 0; - if (!noautodma) - hwif->autodma = 1; - hwif->drives[0].autodma = hwif->autodma; - hwif->drives[1].autodma = hwif->autodma; -} - -#define DECLARE_SCC_DEV(name_str) \ - { \ - .name = name_str, \ - .init_setup = init_setup_scc, \ - .init_iops = init_iops_scc, \ - .init_hwif = init_hwif_scc, \ - .channels = 1, \ - .autodma = AUTODMA, \ - .bootable = ON_BOARD, \ - } - -static ide_pci_device_t scc_chipsets[] __devinitdata = { - /* 0 */ DECLARE_SCC_DEV("sccIDE"), -}; - -/** - * scc_init_one - pci layer discovery entry - * @dev: PCI device - * @id: ident table entry - * - * Called by the PCI code when it finds an SCC PATA controller. - * We then use the IDE PCI generic helper to do most of the work. - */ - -static int __devinit scc_init_one(struct pci_dev *dev, const struct pci_device_id *id) -{ - ide_pci_device_t *d = &scc_chipsets[id->driver_data]; - return d->init_setup(dev, d); -} - -/** - * scc_remove - pci layer remove entry - * @dev: PCI device - * - * Called by the PCI code when it removes an SCC PATA controller. 
- */ - -static void __devexit scc_remove(struct pci_dev *dev) -{ - struct scc_ports *ports = pci_get_drvdata(dev); - ide_hwif_t *hwif = &ide_hwifs[ports->hwif_id]; - unsigned long ctl_base = pci_resource_start(dev, 0); - unsigned long dma_base = pci_resource_start(dev, 1); - unsigned long ctl_size = pci_resource_len(dev, 0); - unsigned long dma_size = pci_resource_len(dev, 1); - - if (hwif->dmatable_cpu) { - pci_free_consistent(hwif->pci_dev, - PRD_ENTRIES * PRD_BYTES, - hwif->dmatable_cpu, - hwif->dmatable_dma); - hwif->dmatable_cpu = NULL; - } - - ide_unregister(hwif->index); - - hwif->chipset = ide_unknown; - iounmap((void*)ports->dma); - iounmap((void*)ports->ctl); - release_mem_region(dma_base, dma_size); - release_mem_region(ctl_base, ctl_size); - memset(ports, 0, sizeof(*ports)); -} - -static struct pci_device_id scc_pci_tbl[] = { - { PCI_VENDOR_ID_TOSHIBA_2, PCI_DEVICE_ID_TOSHIBA_SCC_ATA, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, - { 0, }, -}; -MODULE_DEVICE_TABLE(pci, scc_pci_tbl); - -static struct pci_driver driver = { - .name = "SCC IDE", - .id_table = scc_pci_tbl, - .probe = scc_init_one, - .remove = scc_remove, -}; - -static int scc_ide_init(void) -{ - return ide_pci_register_driver(&driver); -} - -module_init(scc_ide_init); -/* -- No exit code? -static void scc_ide_exit(void) -{ - ide_pci_unregister_driver(&driver); -} -module_exit(scc_ide_exit); - */ - - -MODULE_DESCRIPTION("PCI driver module for Toshiba SCC IDE"); -MODULE_LICENSE("GPL"); Index: linux/drivers/ide/setup-pci.c =================================================================== --- linux.orig/drivers/ide/setup-pci.c +++ linux/drivers/ide/setup-pci.c @@ -505,11 +505,6 @@ static void ide_hwif_setup_dma(struct pc } } } - -#ifndef CONFIG_IDEDMA_PCI_AUTO -#warning CONFIG_IDEDMA_PCI_AUTO=n support is obsolete, and will be removed soon. 
-#endif - #endif /* CONFIG_BLK_DEV_IDEDMA_PCI*/ /** Index: linux/drivers/ieee1394/ieee1394_transactions.c =================================================================== --- linux.orig/drivers/ieee1394/ieee1394_transactions.c +++ linux/drivers/ieee1394/ieee1394_transactions.c @@ -32,7 +32,7 @@ #ifndef HPSB_DEBUG_TLABELS static #endif -spinlock_t hpsb_tlabel_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(hpsb_tlabel_lock); static DECLARE_WAIT_QUEUE_HEAD(tlabel_wq); Index: linux/drivers/input/ff-memless.c =================================================================== --- linux.orig/drivers/input/ff-memless.c +++ linux/drivers/input/ff-memless.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include Index: linux/drivers/input/gameport/gameport.c =================================================================== --- linux.orig/drivers/input/gameport/gameport.c +++ linux/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include #include @@ -102,12 +103,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -126,11 +127,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } Index: linux/drivers/input/keyboard/atkbd.c =================================================================== --- linux.orig/drivers/input/keyboard/atkbd.c +++ linux/drivers/input/keyboard/atkbd.c @@ -1382,9 +1382,23 @@ static ssize_t atkbd_show_err_count(stru return sprintf(buf, "%lu\n", atkbd->err_count); } +static int __read_mostly noatkbd; + +static int __init noatkbd_setup(char *str) +{ + noatkbd = 1; + printk(KERN_INFO "debug: not setting up AT keyboard.\n"); + + return 1; +} + +__setup("noatkbd", noatkbd_setup); static int __init atkbd_init(void) { + if (noatkbd) + return 0; + return serio_register_driver(&atkbd_drv); } Index: linux/drivers/input/mouse/psmouse-base.c =================================================================== --- linux.orig/drivers/input/mouse/psmouse-base.c +++ linux/drivers/input/mouse/psmouse-base.c @@ -1545,10 +1545,25 @@ static int psmouse_get_maxproto(char *bu return sprintf(buffer, "%s\n", psmouse_protocol_by_type(type)->name); } +static int __read_mostly nopsmouse; + +static int __init nopsmouse_setup(char *str) +{ + nopsmouse = 1; + printk(KERN_INFO "debug: not setting up psmouse.\n"); + + return 1; +} + +__setup("nopsmouse", nopsmouse_setup); + static int __init psmouse_init(void) { int err; + if (nopsmouse) + return 0; + kpsmoused_wq = create_singlethread_workqueue("kpsmoused"); if (!kpsmoused_wq) { printk(KERN_ERR "psmouse: failed to create kpsmoused workqueue\n"); Index: linux/drivers/isdn/hysdn/boardergo.c =================================================================== --- linux.orig/drivers/isdn/hysdn/boardergo.c +++ linux/drivers/isdn/hysdn/boardergo.c @@ -443,7 +443,7 @@ ergo_inithardware(hysdn_card * card) card->waitpofready = ergo_waitpofready; card->set_errlog_state = ergo_set_errlog_state; 
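/*
 * Editor's note on this and the neighbouring ieee1394/ISDN hunks: each
 * one retires the deprecated SPIN_LOCK_UNLOCKED initializer, which
 * gives every lock the same lockdep/-rt class key. The two replacement
 * idioms, sketched with an illustrative lock that is not part of this
 * patch:
 *
 *	static DEFINE_SPINLOCK(demo_lock);	   file scope, compile time
 *
 *	struct demo { spinlock_t lock; };
 *	static void demo_init(struct demo *d)
 *	{
 *		spin_lock_init(&d->lock);	   per object, at runtime
 *	}
 */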
INIT_WORK(&card->irq_queue, ergo_irq_bh); - card->hysdn_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&card->hysdn_lock); return (0); } /* ergo_inithardware */ Index: linux/drivers/isdn/hysdn/hysdn_proclog.c =================================================================== --- linux.orig/drivers/isdn/hysdn/hysdn_proclog.c +++ linux/drivers/isdn/hysdn/hysdn_proclog.c @@ -299,7 +299,7 @@ hysdn_log_close(struct inode *ino, struc hysdn_card *card; int retval = 0; unsigned long flags; - spinlock_t hysdn_lock = SPIN_LOCK_UNLOCKED; + DEFINE_SPINLOCK(hysdn_lock); lock_kernel(); if ((filep->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_WRITE) { Index: linux/drivers/isdn/isdnloop/isdnloop.c =================================================================== --- linux.orig/drivers/isdn/isdnloop/isdnloop.c +++ linux/drivers/isdn/isdnloop/isdnloop.c @@ -1461,7 +1461,7 @@ isdnloop_initcard(char *id) skb_queue_head_init(&card->bqueue[i]); } skb_queue_head_init(&card->dqueue); - card->isdnloop_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&card->isdnloop_lock); card->next = cards; cards = card; if (!register_isdn(&card->interface)) { Index: linux/drivers/kvm/Kconfig =================================================================== --- linux.orig/drivers/kvm/Kconfig +++ linux/drivers/kvm/Kconfig @@ -34,4 +34,10 @@ config KVM_AMD Provides support for KVM on AMD processors equipped with the AMD-V (SVM) extensions. +config KVM_NET + tristate "Para virtual network device" + depends on KVM + ---help--- + Provides support for guest-host paravirtualization networking + endmenu Index: linux/drivers/kvm/Makefile =================================================================== --- linux.orig/drivers/kvm/Makefile +++ linux/drivers/kvm/Makefile @@ -1,10 +1,13 @@ # # Makefile for Kernel-based Virtual Machine module # +EXTRA_CFLAGS := -kvm-objs := kvm_main.o mmu.o x86_emulate.o +kvm-objs := kvm_main.o mmu.o x86_emulate.o kvm_net_host.o kvm_block.o obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o kvm-amd-objs = svm.o obj-$(CONFIG_KVM_AMD) += kvm-amd.o +kvm-net-objs = kvm_net.o +obj-$(CONFIG_KVM_NET) += kvm-net.o Index: linux/drivers/kvm/kvm.h =================================================================== --- linux.orig/drivers/kvm/kvm.h +++ linux/drivers/kvm/kvm.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "vmx.h" #include @@ -52,8 +53,8 @@ #define KVM_MAX_VCPUS 1 #define KVM_MEMORY_SLOTS 4 -#define KVM_NUM_MMU_PAGES 256 -#define KVM_MIN_FREE_MMU_PAGES 5 +#define KVM_NUM_MMU_PAGES 1024 +#define KVM_MIN_FREE_MMU_PAGES 10 #define KVM_REFILL_PAGES 25 #define FX_IMAGE_SIZE 512 @@ -166,7 +167,7 @@ struct kvm_mmu { int root_level; int shadow_root_level; - u64 *pae_root; + u64 *pae_root[KVM_CR3_CACHE_SIZE]; }; #define KVM_NR_MEM_OBJS 20 @@ -221,17 +222,19 @@ enum { struct kvm_vcpu { struct kvm *kvm; + struct task_struct* task; union { struct vmcs *vmcs; struct vcpu_svm *svm; }; - struct mutex mutex; + spinlock_t lock; int cpu; int launched; int interrupt_window_open; unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */ #define NR_IRQ_WORDS KVM_IRQ_BITMAP_SIZE(unsigned long) unsigned long irq_pending[NR_IRQ_WORDS]; + spinlock_t irq_lock; unsigned long regs[NR_VCPU_REGS]; /* for rsp: vcpu_load_rsp_rip() */ unsigned long rip; /* needs vcpu_load_rsp_rip() */ @@ -261,6 +264,12 @@ struct kvm_vcpu { gfn_t last_pt_write_gfn; int last_pt_write_count; + unsigned int cr3_cache_idx; + unsigned int cr3_cache_limit; + struct page *cr3_cache_page; + gpa_t 
cr3_cache_gpa; + gpa_t guest_cr3_gpa[KVM_CR3_CACHE_SIZE]; + struct kvm_guest_debug guest_debug; char fx_buf[FX_BUF_SIZE]; @@ -309,6 +318,7 @@ struct kvm { int busy; unsigned long rmap_overflow; struct list_head vm_list; + struct net_device *host_netdev; struct file *filp; }; @@ -400,6 +410,9 @@ extern struct kvm_arch_ops *kvm_arch_ops int kvm_init_arch(struct kvm_arch_ops *ops, struct module *module); void kvm_exit_arch(void); +void vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_put(struct kvm_vcpu *vcpu); + void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); @@ -407,7 +420,11 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu) int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot); +int __load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); +void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu); + hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa); +hpa_t __gpa_to_hpa(struct kvm *vcpu, gpa_t gpa); #define HPA_MSB ((sizeof(hpa_t) * 8) - 1) #define HPA_ERR_MASK ((hpa_t)1 << HPA_MSB) static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; } @@ -452,9 +469,9 @@ int emulator_set_dr(struct x86_emulate_c unsigned long value); void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr0); -void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr0); +void set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +void set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +void set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); void lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); @@ -471,11 +488,21 @@ int kvm_read_guest(struct kvm_vcpu *vcpu unsigned long size, void *dest); +int kvm_read_guest_gpa(struct kvm_vcpu *vcpu, + gpa_t addr, + unsigned long size, + void *dest); + int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, void *data); +int kvm_write_guest_gpa(struct kvm_vcpu *vcpu, + gpa_t addr, + unsigned long size, + void *data); + unsigned long segment_base(u16 selector); void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes); @@ -484,14 +511,32 @@ int kvm_mmu_unprotect_page_virt(struct k void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run); +int hypercall_trace(unsigned long caller, unsigned long v1, unsigned long v2, + unsigned long v3); +int hypercall_flush_cr3_cache(struct kvm_vcpu *vcpu, unsigned long old_cr3); +int hypercall_register_eth(struct kvm_vcpu *vcpu, gpa_t eth_state_gpa); +int hypercall_send_eth(struct kvm_vcpu *vcpu); +int hypercall_disk_read(struct kvm_vcpu *vcpu, int nr, unsigned long start_sector, + unsigned long nr_sectors, unsigned long buf_gpa); +int hypercall_disk_write(struct kvm_vcpu *vcpu, int nr, unsigned long start_sector, + unsigned long nr_sectors, unsigned long buf_gpa); + +int hypercall_disk_register_completion_area(struct kvm_vcpu *vcpu, int nr, + unsigned long completion_page_gpa, int irq); +int hypercall_disk_register_submit_area(struct kvm_vcpu *vcpu, int nr, + unsigned long submit_page_gpa, int irq); +int hypercall_disk_trigger_scan(int nr); + +int hypercall_disk_getdiskcount(void); +int hypercall_disk_getinfo(struct kvm_vcpu *vcpu, unsigned long number, unsigned long info_gpa); -static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, - u32 
error_code) -{ - if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) - kvm_mmu_free_some_pages(vcpu); - return vcpu->mmu.page_fault(vcpu, gva, error_code); -} + +int kvm_inject_irq(struct kvm_vcpu *kvm_vcpu, struct kvm_interrupt *irq); + +int kvmnet_host_init(struct kvm*); +void kvmnet_host_exit(struct kvm*); + +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn) { @@ -638,4 +683,6 @@ static inline u32 get_rdx_init_val(void) #define TSS_REDIRECTION_SIZE (256 / 8) #define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) +extern int kvm_handle_hypercall(struct kvm_vcpu *vcpu); + #endif Index: linux/drivers/kvm/kvm_block.c =================================================================== --- /dev/null +++ linux/drivers/kvm/kvm_block.c @@ -0,0 +1,382 @@ +/* + * KVM virtual block device driver for Linux + * + * (C) Copyright 2007 Intel Corporation + * + * This file is licensed to you under the terms of the GNU General Public License + * (GPL) version 2. See the COPYING file in the main directory of the kernel for + * the license text. + * + * Authors: + * Arjan van de Ven + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "kvm.h" +#include "kvm_block.h" + +struct kvm_blockhost { + int nr; + int irq; + uint64_t size; + struct file *filp; + uint64_t disk_completion_page_hpa; + uint64_t disk_submit_page_hpa; + struct kvm_vcpu *vcpu_slots; + struct delayed_work slot_work; +}; + + +#define MAX_DISK 16 +#define SLOT_COUNT (PAGE_SIZE/sizeof(uint64_t)) + +/* irq mitigation constant */ +#define WRITE_IRQ_DELAY 20 + + +static struct kvm_blockhost block[MAX_DISK]; + + +static void run_slots(struct work_struct *dummy); + + +static int valid_disk(int nr) +{ + if (nr<0) + return 0; + if (nr>=MAX_DISK) + return 0; + if (!block[nr].filp) + return 0; + return 1; +} + + +int hypercall_disk_register_completion_area(struct kvm_vcpu *vcpu, + int nr, unsigned long completion_page_gpa, int irq) +{ + printk("KVM Block device registered at irq %i\n", irq); + if (nr < 0) + return -EFAULT; + if (nr >= MAX_DISK) + return -EFAULT; + block[nr].disk_completion_page_hpa = gpa_to_hpa(vcpu, completion_page_gpa); + block[nr].irq = irq; + block[nr].vcpu_slots= vcpu; + INIT_DELAYED_WORK(&block[nr].slot_work, run_slots); + block[nr].nr = nr; + return 0; +} + +int hypercall_disk_register_submit_area(struct kvm_vcpu *vcpu, int nr, + unsigned long submit_page_gpa, int irq) +{ + if (nr < 0) + return -EFAULT; + if (nr >= MAX_DISK) + return -EFAULT; + + block[nr].disk_submit_page_hpa = gpa_to_hpa(vcpu, submit_page_gpa); + block[nr].irq = irq; + block[nr].vcpu_slots= vcpu; + + + return 0; +} + + + +int hypercall_disk_read(struct kvm_vcpu *vcpu, int nr, unsigned long start_sector, unsigned long nr_sectors, unsigned long buf_gpa) +{ + unsigned char *buff_hva; + hpa_t buff_hpa; + int ret = 0; + + if (!valid_disk(nr)) + return -EINVAL; + + /* the synchronous interface doesn't allow > 4Kb */ + if (nr_sectors>8) + return -EIO; + + vcpu_put(vcpu); + + buff_hpa = gpa_to_hpa(vcpu, buf_gpa); + ret = is_error_hpa(buff_hpa) ? 
-EFAULT : 0; + + if (ret<0) + goto out; + + buff_hva = kmap(pfn_to_page(buff_hpa >> PAGE_SHIFT)); + kernel_read(block[nr].filp, start_sector*512, buff_hva + (buff_hpa&4095), nr_sectors*512); + kunmap(pfn_to_page(buff_hpa >> PAGE_SHIFT)); + +out: + vcpu_load(vcpu); + return ret; +} + +/* FIXME: this probably should go to the generic kernel, just like kernel_read is */ +static int kernel_write(struct file *file, unsigned long offset, + char *addr, unsigned long count) +{ + mm_segment_t old_fs; + loff_t pos = offset; + int result; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + result = vfs_write(file, (void __user *)addr, count, &pos); + set_fs(old_fs); + return result; +} + +int hypercall_disk_write(struct kvm_vcpu *vcpu, int nr, unsigned long start_sector, + unsigned long nr_sectors, unsigned long buf_gpa) +{ + unsigned char *buff_hva; + hpa_t buff_hpa; + int ret = 0; + + if (!valid_disk(nr)) + return -EINVAL; + + /* synchronous interface does not allow more than 4Kb at a time */ + if (nr_sectors>8) + return -EIO; + + vcpu_put(vcpu); + + buff_hpa = gpa_to_hpa(vcpu, buf_gpa); + ret = is_error_hpa(buff_hpa) ? -EFAULT : 0; + + if (ret<0) + goto out; + + buff_hva = kmap(pfn_to_page(buff_hpa >> PAGE_SHIFT)); + ret = kernel_write(block[nr].filp, start_sector*512, buff_hva + (buff_hpa&4095), nr_sectors*512); + kunmap(pfn_to_page(buff_hpa >> PAGE_SHIFT)); + +out: + vcpu_load(vcpu); + return ret; +} + + +/* we can use one big vmap if all entries are PAGE_SIZE in size */ +static int can_vmap(struct kvm_iopage *iopage) +{ + int i; + for (i=0; i < iopage->entries; i++) + if (iopage->vector[i].data_len != PAGE_SIZE) + return 0; + return 1; +} + + + +static int do_disk_vector_async(struct kvm_vcpu *vcpu, int nr, unsigned long iopage_gpa, unsigned long slot) +{ + hpa_t iopage_hpa; + hpa_t data_hpa; + struct kvm_iopage *iopage_hva; + unsigned long start_data; + int i; + uint64_t *completion; + int read=0; + + /* first, get the vector mapped into our address space */ + iopage_hpa = gpa_to_hpa(vcpu, iopage_gpa); + if (is_error_hpa(iopage_hpa)) + return 1; + + iopage_hva = kmap(pfn_to_page(iopage_hpa >> PAGE_SHIFT)); + + start_data = iopage_hva->sector * 512; + + /* if we can use vmap that's fastest */ + if (can_vmap(iopage_hva)) { + struct page **page_array; + void *address; + int i; + page_array = kmalloc(sizeof(struct page*) * iopage_hva->entries, GFP_KERNEL); + + if (!page_array) + goto iterate; + + for (i=0; i < iopage_hva->entries; i++) { + hpa_t hpa; + hpa = gpa_to_hpa(vcpu, iopage_hva->vector[i].data_gpa); + page_array[i] = pfn_to_page(hpa >> PAGE_SHIFT); + } + address = vmap(page_array, iopage_hva->entries, VM_MAP, PAGE_KERNEL); + if (!address) { + kfree(page_array); + goto iterate; + } + if (iopage_hva->write) + kernel_write(block[nr].filp, start_data, address, iopage_hva->entries * PAGE_SIZE); + else { + kernel_read(block[nr].filp, start_data, address, iopage_hva->entries * PAGE_SIZE); + read = 1; + } + vunmap(address); + kfree(page_array); + } else { +iterate: + /* now, iterate over the buffer */ + for (i = 0; i < iopage_hva->entries; i++) { + char *data_hva; + + data_hpa = gpa_to_hpa(vcpu, iopage_hva->vector[i].data_gpa); + data_hva = kmap(pfn_to_page(data_hpa >> PAGE_SHIFT)); + if (iopage_hva->write) + kernel_write(block[nr].filp, start_data, data_hva + (data_hpa&4095), iopage_hva->vector[i].data_len); + else + kernel_read(block[nr].filp, start_data, data_hva + (data_hpa&4095), iopage_hva->vector[i].data_len); + start_data += 
iopage_hva->vector[i].data_len; + kunmap(pfn_to_page(data_hpa >> PAGE_SHIFT)); + } + } + kunmap(pfn_to_page(((unsigned long)iopage_hva) >> PAGE_SHIFT)); + + + completion = (uint64_t*)kmap_atomic(pfn_to_page(block[nr].disk_completion_page_hpa >> PAGE_SHIFT), KM_USER0); + completion[slot] = iopage_gpa; + kunmap_atomic(completion, KM_USER0); + + /* reads should not delay the irq handling */ + return 1 + read*WRITE_IRQ_DELAY; +} + + +static void raise_irq(struct kvm_vcpu *vcpu, int irq) +{ + irq += 32; + vcpu_load(vcpu); + set_bit(irq, vcpu->irq_pending); + set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + vcpu_put(vcpu); +} + + +static void run_slots(struct work_struct *works) +{ + int i; + struct kvm_blockhost *host; + uint64_t iop; + int work = 0; + int need_irq = 0; + struct kvm_vcpu *vcpu = NULL; + uint64_t *iopage_slots; + + host = container_of(works, struct kvm_blockhost, slot_work.work); + + if (!host->disk_submit_page_hpa) + return; + + vcpu = host->vcpu_slots; + + iopage_slots = kmap(pfn_to_page(host->disk_submit_page_hpa >> PAGE_SHIFT)); + + for (i=0; i< SLOT_COUNT; i++) { + if (iopage_slots[i]!=0) { + iop = xchg(&iopage_slots[i],0); + if (iop) { + need_irq += do_disk_vector_async(vcpu, host->nr, iop, i); + work++; + } + } + if (need_irq>WRITE_IRQ_DELAY) { + need_irq = 0; + raise_irq(vcpu, host->irq); + } + } + + if (need_irq) + raise_irq(vcpu, host->irq); + kunmap(pfn_to_page(host->disk_submit_page_hpa >> PAGE_SHIFT)); +} + + + +int hypercall_disk_trigger_scan(int nr) +{ + + if (!valid_disk(nr)) + return -EINVAL; + + schedule_delayed_work_on(0, &block[nr].slot_work, 0); + return 0; +} + +int hypercall_disk_getdiskcount(void) +{ + return 1; /* hardcoded to 1 disk for now */ +} + +int hypercall_disk_getinfo(struct kvm_vcpu *vcpu, unsigned long number, unsigned long info_gpa) +{ + unsigned char *info_hva; + struct inode *inode; + hpa_t info_hpa; + int ret = 0; + struct kvm_disk_info info; + + + + if (number<0) + return -EINVAL; + if (number >= MAX_DISK) + return -EINVAL; + + + + if (!block[number].filp) { + block[number].filp = filp_open("/tmp/hyperfile",O_RDWR | O_LARGEFILE, 0600); + if (IS_ERR(block[number].filp)) + block[number].filp = NULL; + } + + if (block[number].filp == NULL) { + printk(KERN_ERR "kvm_block: filp open failed: %p -- %li \n", block[number].filp, - (unsigned long)block[number].filp); + return -EPERM; + } + + inode = block[number].filp->f_path.dentry->d_inode; + block[number].size = inode->i_size; + + + + info_hpa = gpa_to_hpa(vcpu, info_gpa); + ret = is_error_hpa(info_hpa) ? 
-EFAULT : 0; + if (ret) + return ret; + + + + /* must be page aligned */ + if (info_hpa&4095) + return -EINVAL; + + + memset(&info, 0, sizeof(struct kvm_disk_info)); + info.sectors = block[number].size / 512; + + info_hva = kmap_atomic(pfn_to_page(info_hpa >> PAGE_SHIFT), KM_USER0); + memcpy(info_hva, &info, sizeof(struct kvm_disk_info)); + kunmap_atomic(pfn_to_page(info_hpa >> PAGE_SHIFT), KM_USER0); + + + return ret; + +} Index: linux/drivers/kvm/kvm_block.h =================================================================== --- /dev/null +++ linux/drivers/kvm/kvm_block.h @@ -0,0 +1,24 @@ +#ifndef __INCLUDE_GUARD_KVMBLOCK_H +#define __INCLUDE_GUARD_KVMBLOCK_H + +struct kvm_vector { + uint64_t data_gpa; + uint64_t data_len; +}; + +struct kvm_iopage { + uint64_t sector; + uint64_t guest_data; + uint32_t sector_count; + uint32_t write; + uint32_t entries; + uint32_t ioresult; + struct kvm_vector vector[126]; +}; + +struct kvm_disk_info { + uint64_t sectors; + uint64_t reserved[15]; +}; + +#endif Index: linux/drivers/kvm/kvm_main.c =================================================================== --- linux.orig/drivers/kvm/kvm_main.c +++ linux/drivers/kvm/kvm_main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -133,9 +134,9 @@ static struct file *kvmfs_file(struct in if (!file) return ERR_PTR(-ENFILE); - file->f_path.mnt = mntget(kvmfs_mnt); - file->f_path.dentry = d_alloc_anon(inode); - if (!file->f_path.dentry) + file->f_vfsmnt = mntget(kvmfs_mnt); + file->f_dentry = d_alloc_anon(inode); + if (!file->f_dentry) return ERR_PTR(-ENOMEM); file->f_mapping = inode->i_mapping; @@ -217,6 +218,41 @@ int kvm_read_guest(struct kvm_vcpu *vcpu } EXPORT_SYMBOL_GPL(kvm_read_guest); +int kvm_read_guest_gpa(struct kvm_vcpu *vcpu, + gpa_t addr, + unsigned long size, + void *dest) +{ + unsigned char *host_buf = dest; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gpa_to_hpa(vcpu, addr); + + if (is_error_hpa(paddr)) + break; + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), + KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy(host_buf, (void*)guest_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_read_guest_gpa); + int kvm_write_guest(struct kvm_vcpu *vcpu, gva_t addr, unsigned long size, void *data) { @@ -252,14 +288,50 @@ int kvm_write_guest(struct kvm_vcpu *vcp } EXPORT_SYMBOL_GPL(kvm_write_guest); +int kvm_write_guest_gpa(struct kvm_vcpu *vcpu, + gpa_t addr, + unsigned long size, + void *data) +{ + unsigned char *host_buf = data; + unsigned long req_size = size; + + while (size) { + hpa_t paddr; + unsigned now; + unsigned offset; + hva_t guest_buf; + + paddr = gpa_to_hpa(vcpu, addr); + if (is_error_hpa(paddr)) { + printk("hm, couldnt look up gpa: %08Lx\n", addr); + break; + } + + guest_buf = (hva_t)kmap_atomic( + pfn_to_page(paddr >> PAGE_SHIFT), KM_USER0); + offset = addr & ~PAGE_MASK; + guest_buf |= offset; + now = min(size, PAGE_SIZE - offset); + memcpy((void*)guest_buf, host_buf, now); + host_buf += now; + addr += now; + size -= now; + kunmap_atomic((void *)(guest_buf & PAGE_MASK), KM_USER0); + } + return req_size - size; +} +EXPORT_SYMBOL_GPL(kvm_write_guest_gpa); + /* * Switches to specified vcpu, until a matching vcpu_put() */ -static void vcpu_load(struct kvm_vcpu 
*vcpu) +void vcpu_load(struct kvm_vcpu *vcpu) { - mutex_lock(&vcpu->mutex); + spin_lock(&vcpu->lock); kvm_arch_ops->vcpu_load(vcpu); } +EXPORT_SYMBOL_GPL(vcpu_load); /* * Switches to specified vcpu, until a matching vcpu_put(). Will return NULL @@ -269,20 +341,21 @@ static struct kvm_vcpu *vcpu_load_slot(s { struct kvm_vcpu *vcpu = &kvm->vcpus[slot]; - mutex_lock(&vcpu->mutex); + spin_lock(&vcpu->lock); if (!vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); + spin_unlock(&vcpu->lock); return NULL; } kvm_arch_ops->vcpu_load(vcpu); return vcpu; } -static void vcpu_put(struct kvm_vcpu *vcpu) +void vcpu_put(struct kvm_vcpu *vcpu) { kvm_arch_ops->vcpu_put(vcpu); - mutex_unlock(&vcpu->mutex); + spin_unlock(&vcpu->lock); } +EXPORT_SYMBOL_GPL(vcpu_put); static struct kvm *kvm_create_vm(void) { @@ -297,15 +370,18 @@ static struct kvm *kvm_create_vm(void) for (i = 0; i < KVM_MAX_VCPUS; ++i) { struct kvm_vcpu *vcpu = &kvm->vcpus[i]; - mutex_init(&vcpu->mutex); vcpu->cpu = -1; + spin_lock_init(&vcpu->lock); vcpu->kvm = kvm; vcpu->mmu.root_hpa = INVALID_PAGE; INIT_LIST_HEAD(&vcpu->free_pages); spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); spin_unlock(&kvm_lock); + spin_lock_init(&kvm->vcpus[i].irq_lock); } + + kvmnet_host_init(kvm); return kvm; } @@ -373,6 +449,7 @@ static int kvm_dev_release(struct inode static void kvm_destroy_vm(struct kvm *kvm) { spin_lock(&kvm_lock); + kvmnet_host_exit(kvm); list_del(&kvm->vm_list); spin_unlock(&kvm_lock); kvm_free_vcpus(kvm); @@ -396,7 +473,7 @@ static void inject_gp(struct kvm_vcpu *v /* * Load the pae pdptrs. Return true is they are all valid. */ -static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +int __load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) { gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; @@ -406,9 +483,9 @@ static int load_pdptrs(struct kvm_vcpu * int ret; struct kvm_memory_slot *memslot; - spin_lock(&vcpu->kvm->lock); memslot = gfn_to_memslot(vcpu->kvm, pdpt_gfn); /* FIXME: !memslot - emulate? 0xff? */ + WARN_ON(!memslot); pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0); ret = 1; @@ -425,6 +502,19 @@ static int load_pdptrs(struct kvm_vcpu * out: kunmap_atomic(pdpt, KM_USER0); + + return ret; +} + +/* + * Load the pae pdptrs. Return true is they are all valid. 
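+ *
+ * Editorial note: the patch splits the old load_pdptrs() in two, in the
+ * usual __-prefix style: __load_pdptrs() above is the lockless worker
+ * for callers that already hold kvm->lock (kvm.h now exports it
+ * alongside the cr3-cache helpers), while this wrapper keeps the
+ * original locked behaviour. The intended call pattern, sketched:
+ *
+ *	spin_lock(&vcpu->kvm->lock);
+ *	ret = __load_pdptrs(vcpu, cr3);	   lock must already be held
+ *	spin_unlock(&vcpu->kvm->lock);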
+ */ +static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) +{ + int ret; + + spin_lock(&vcpu->kvm->lock); + ret = __load_pdptrs(vcpu, cr3); spin_unlock(&vcpu->kvm->lock); return ret; @@ -704,8 +794,13 @@ raced: memset(new.phys_mem, 0, npages * sizeof(struct page *)); for (i = 0; i < npages; ++i) { +#ifdef CONFIG_PREEMPT_RT + new.phys_mem[i] = alloc_page(GFP_USER + | __GFP_ZERO); +#else new.phys_mem[i] = alloc_page(GFP_HIGHUSER | __GFP_ZERO); +#endif if (!new.phys_mem[i]) goto out_free; set_page_private(new.phys_mem[i],0); @@ -1173,11 +1268,87 @@ int emulate_instruction(struct kvm_vcpu } EXPORT_SYMBOL_GPL(emulate_instruction); +int hypercall_test(struct kvm_vcpu *vcpu) +{ + printk(KERN_DEBUG "%s __NR_hypercall_test\n", __FUNCTION__); + return 0x5a5a; +} + +/* + * Load a guest-physical CR3 address: + */ +int hypercall_load_cr3(struct kvm_vcpu *vcpu, unsigned long new_cr3) +{ + set_cr3(vcpu, new_cr3); + + return 0; +} + +/* + * Get the current time to the guest: + */ +int hypercall_get_ktime(struct kvm_vcpu *vcpu, unsigned long time_ptr) +{ + struct timespec now; + int ret; + + ktime_get_ts(&now); + + ret = kvm_write_guest_gpa(vcpu, time_ptr, sizeof(now), &now); + + if (unlikely(ret != sizeof(now))) { + printk("hm: bad time_ptr: %08lx\n", time_ptr); + inject_gp(vcpu); + return 1; + } + return 0; +} + +int hypercall_trace(unsigned long caller, unsigned long v1, unsigned long v2, + unsigned long v3) +{ + __trace_special(caller, v1, v2, v3); + + return 0; +} + +static void * hypercall_table[] = +{ + [__NR_hypercall_load_cr3] = + (void *) hypercall_load_cr3, + [__NR_hypercall_flush_cr3_cache] = + (void *) hypercall_flush_cr3_cache, + [__NR_hypercall_get_ktime] = + (void *) hypercall_get_ktime, + [__NR_hypercall_register_eth] = + (void *) hypercall_register_eth, + [__NR_hypercall_send_eth] = + (void *) hypercall_send_eth, + [__NR_hypercall_test] = + (void *) hypercall_test, + [__NR_hypercall_disk_read] = + (void *) hypercall_disk_read, + [__NR_hypercall_disk_write] = + (void *) hypercall_disk_write, + [__NR_hypercall_disk_register_completion_area] = + (void *) hypercall_disk_register_completion_area, + [__NR_hypercall_disk_register_submit_area] = + (void *) hypercall_disk_register_submit_area, + [__NR_hypercall_disk_trigger_scan] = + (void *) hypercall_disk_trigger_scan, + [__NR_hypercall_disk_info] = + (void *) hypercall_disk_getinfo, + [__NR_hypercall_disk_count] = + (void *) hypercall_disk_getdiskcount, + [__NR_hypercall_trace] = + (void *) hypercall_trace, +}; + int kvm_hypercall(struct kvm_vcpu *vcpu, struct kvm_run *run) { unsigned long nr, a0, a1, a2, a3, a4, a5, ret; - kvm_arch_ops->decache_regs(vcpu); + kvm_arch_ops->cache_regs(vcpu); ret = -KVM_EINVAL; #ifdef CONFIG_X86_64 if (is_long_mode(vcpu)) { @@ -1191,20 +1362,66 @@ int kvm_hypercall(struct kvm_vcpu *vcpu, } else #endif { - nr = vcpu->regs[VCPU_REGS_RBX] & -1u; - a0 = vcpu->regs[VCPU_REGS_RAX] & -1u; + nr = vcpu->regs[VCPU_REGS_RAX] & -1u; + a0 = vcpu->regs[VCPU_REGS_RBX] & -1u; a1 = vcpu->regs[VCPU_REGS_RCX] & -1u; a2 = vcpu->regs[VCPU_REGS_RDX] & -1u; a3 = vcpu->regs[VCPU_REGS_RSI] & -1u; a4 = vcpu->regs[VCPU_REGS_RDI] & -1u; a5 = vcpu->regs[VCPU_REGS_RBP] & -1u; } + if (unlikely(prof_on == KVM_PROFILING && nr < __NR_hypercalls)) + profile_hit(KVM_PROFILING, hypercall_table[nr]); + switch (nr) { + case __NR_hypercall_load_cr3: + ret = hypercall_load_cr3(vcpu, a0); + break; + case __NR_hypercall_flush_cr3_cache: + ret = hypercall_flush_cr3_cache(vcpu, a0); + break; + case __NR_hypercall_get_ktime: + ret = 
hypercall_get_ktime(vcpu, a0); + break; + case __NR_hypercall_register_eth: + ret = hypercall_register_eth(vcpu, a0); + break; + case __NR_hypercall_send_eth: + ret = hypercall_send_eth(vcpu); + break; + case __NR_hypercall_test: + ret = hypercall_test(vcpu); + break; + case __NR_hypercall_disk_read: + ret = hypercall_disk_read(vcpu, a0, a1, a2, a3); + break; + case __NR_hypercall_disk_write: + ret = hypercall_disk_write(vcpu, a0, a1, a2, a3); + break; + case __NR_hypercall_disk_register_completion_area: + ret = hypercall_disk_register_completion_area(vcpu, a0, a1, a2); + break; + case __NR_hypercall_disk_register_submit_area: + ret = hypercall_disk_register_submit_area(vcpu, a0, a1, a2); + break; + case __NR_hypercall_disk_trigger_scan: + ret = hypercall_disk_trigger_scan(a0); + break; + case __NR_hypercall_disk_info: + ret = hypercall_disk_getinfo(vcpu, a0, a1); + break; + case __NR_hypercall_disk_count: + ret = hypercall_disk_getdiskcount(); + break; + case __NR_hypercall_trace: + ret = hypercall_trace(a0, a1, a2, a3); + break; default: - ; + WARN_ON_ONCE(1); } vcpu->regs[VCPU_REGS_RAX] = ret; - kvm_arch_ops->cache_regs(vcpu); + kvm_arch_ops->decache_regs(vcpu); + return 1; } EXPORT_SYMBOL_GPL(kvm_hypercall); @@ -1280,11 +1497,12 @@ void realmode_set_cr(struct kvm_vcpu *vc */ static int vcpu_register_para(struct kvm_vcpu *vcpu, gpa_t para_state_gpa) { + hpa_t para_state_hpa, hypercall_hpa, cr3_cache_hpa; + struct page *para_state_page, *cr3_cache_page; struct kvm_vcpu_para_state *para_state; - hpa_t para_state_hpa, hypercall_hpa; - struct page *para_state_page; + gpa_t hypercall_gpa, cr3_cache_gpa; + struct kvm_cr3_cache *cr3_cache; unsigned char *hypercall; - gpa_t hypercall_gpa; printk(KERN_DEBUG "kvm: guest trying to enter paravirtual mode\n"); printk(KERN_DEBUG ".... para_state_gpa: %08Lx\n", para_state_gpa); @@ -1325,10 +1543,19 @@ static int vcpu_register_para(struct kvm goto err_kunmap_skip; } - printk(KERN_DEBUG "kvm: para guest successfully registered.\n"); - vcpu->para_state_page = para_state_page; - vcpu->para_state_gpa = para_state_gpa; - vcpu->hypercall_gpa = hypercall_gpa; + if (para_state_gpa != PAGE_ALIGN(para_state_gpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + + cr3_cache_gpa = para_state->cr3_cache_gpa; + cr3_cache_hpa = gpa_to_hpa(vcpu, cr3_cache_gpa); + printk(KERN_DEBUG ".... 
cr3_cache_hpa: %08Lx\n", cr3_cache_hpa); + if (is_error_hpa(cr3_cache_hpa)) { + para_state->ret = -KVM_EINVAL; + goto err_kunmap_skip; + } + cr3_cache_page = pfn_to_page(cr3_cache_hpa >> PAGE_SHIFT); mark_page_dirty(vcpu->kvm, hypercall_gpa >> PAGE_SHIFT); hypercall = kmap_atomic(pfn_to_page(hypercall_hpa >> PAGE_SHIFT), @@ -1336,6 +1563,19 @@ static int vcpu_register_para(struct kvm kvm_arch_ops->patch_hypercall(vcpu, hypercall); kunmap_atomic(hypercall, KM_USER1); + cr3_cache = kmap_atomic(pfn_to_page(cr3_cache_hpa >> PAGE_SHIFT), + KM_USER1); + cr3_cache->entry_count = vcpu->cr3_cache_limit; + kunmap_atomic(cr3_cache, KM_USER1); + + vcpu->para_state_page = para_state_page; + vcpu->para_state_gpa = para_state_gpa; + vcpu->hypercall_gpa = hypercall_gpa; + vcpu->cr3_cache_gpa = cr3_cache_gpa; + vcpu->cr3_cache_page = cr3_cache_page; + + printk(KERN_DEBUG "KVM: para guest successfully registered.\n"); + para_state->ret = 0; err_kunmap_skip: kunmap_atomic(para_state, KM_USER0); @@ -1642,8 +1882,10 @@ static int kvm_vcpu_ioctl_get_sregs(stru sregs->efer = vcpu->shadow_efer; sregs->apic_base = vcpu->apic_base; + spin_lock_bh(&vcpu->irq_lock); memcpy(sregs->interrupt_bitmap, vcpu->irq_pending, sizeof sregs->interrupt_bitmap); + spin_unlock_bh(&vcpu->irq_lock); vcpu_put(vcpu); @@ -1707,12 +1949,14 @@ static int kvm_vcpu_ioctl_set_sregs(stru if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); + spin_lock_bh(&vcpu->irq_lock); memcpy(vcpu->irq_pending, sregs->interrupt_bitmap, sizeof vcpu->irq_pending); vcpu->irq_summary = 0; for (i = 0; i < NR_IRQ_WORDS; ++i) if (vcpu->irq_pending[i]) __set_bit(i, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); vcpu_put(vcpu); @@ -1865,14 +2109,30 @@ static int kvm_vcpu_ioctl_interrupt(stru return -EINVAL; vcpu_load(vcpu); + spin_lock_bh(&vcpu->irq_lock); set_bit(irq->irq, vcpu->irq_pending); set_bit(irq->irq / BITS_PER_LONG, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); vcpu_put(vcpu); return 0; } +int kvm_inject_irq(struct kvm_vcpu *kvm_vcpu, struct kvm_interrupt *irq) +{ + if (irq->irq < 0 || irq->irq >= 256) + return -EINVAL; + + spin_lock_bh(&kvm_vcpu->irq_lock); + set_bit(irq->irq, kvm_vcpu->irq_pending); + set_bit(irq->irq / BITS_PER_LONG, &kvm_vcpu->irq_summary); + spin_unlock_bh(&kvm_vcpu->irq_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_inject_irq); + static int kvm_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) { @@ -1928,6 +2188,7 @@ static int create_vcpu_fd(struct kvm_vcp goto out3; fd = r; fd_install(fd, file); + vcpu->task = current; return fd; @@ -1954,10 +2215,10 @@ static int kvm_vm_ioctl_create_vcpu(stru vcpu = &kvm->vcpus[n]; - mutex_lock(&vcpu->mutex); + spin_lock(&vcpu->lock); if (vcpu->vmcs) { - mutex_unlock(&vcpu->mutex); + spin_unlock(&vcpu->lock); return -EEXIST; } @@ -1990,7 +2251,7 @@ static int kvm_vm_ioctl_create_vcpu(stru out_free_vcpus: kvm_free_vcpu(vcpu); - mutex_unlock(&vcpu->mutex); + spin_unlock(&vcpu->lock); out: return r; } @@ -2345,12 +2606,12 @@ static void decache_vcpus_on_cpu(int cpu * If it's not locked, check the last cpu it executed * on. 
*/ - if (mutex_trylock(&vcpu->mutex)) { + if (spin_trylock(&vcpu->lock)) { if (vcpu->cpu == cpu) { kvm_arch_ops->vcpu_decache(vcpu); vcpu->cpu = -1; } - mutex_unlock(&vcpu->mutex); + spin_unlock(&vcpu->lock); } } spin_unlock(&kvm_lock); @@ -2464,7 +2725,7 @@ int kvm_init_arch(struct kvm_arch_ops *o r = kvm_arch_ops->hardware_setup(); if (r < 0) - return r; + goto out; on_each_cpu(kvm_arch_ops->hardware_enable, NULL, 0, 1); r = register_cpu_notifier(&kvm_cpu_notifier); @@ -2500,6 +2761,8 @@ out_free_2: out_free_1: on_each_cpu(kvm_arch_ops->hardware_disable, NULL, 0, 1); kvm_arch_ops->hardware_unsetup(); +out: + kvm_arch_ops = NULL; return r; } Index: linux/drivers/kvm/kvm_net.c =================================================================== --- /dev/null +++ linux/drivers/kvm/kvm_net.c @@ -0,0 +1,496 @@ +/* + * KVM/NET guest paravirtualized network driver + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007, Qumranet, Inc., Dor Laor + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +//#define DEBUG + +#include "kvm.h" +#include "kvm_net.h" +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_AUTHOR ("Ingo Molnar and Dor Laor"); +MODULE_DESCRIPTION ("Implements guest paravirtual network driver"); +MODULE_LICENSE("GPL"); +MODULE_VERSION("1"); + +#ifndef CONFIG_PARAVIRT + +/* + * This is the vm-syscall address - to be patched by the host to + * VMCALL (Intel) or VMMCALL (AMD), depending on the CPU model: + */ +asm ( + " .globl hypercall_addr \n" + " .align 4 \n" + " hypercall_addr: \n" + " movl $-38, %eax \n" + " ret \n" +); + +extern unsigned char hypercall_addr[6]; + +static void test_hypercall(void) +{ + int ret; + unsigned long dummy = 123456; + + ret = hypercall(1, __NR_hypercall_test, dummy); + pr_debug("hypercall test #1, ret: 0x%x\n", ret); +} + +#ifndef CONFIG_X86_64 +static DEFINE_PER_CPU(struct kvm_vcpu_para_state, para_state); +#endif + +static int kvm_guest_register_para(int cpu) +{ + struct page *hypercall_addr_page; + struct kvm_vcpu_para_state *para_state; + +#ifdef CONFIG_X86_64 + struct page *pstate_page; + if ((pstate_page = alloc_page(GFP_KERNEL)) == NULL) + return -ENOMEM; + para_state = (struct kvm_vcpu_para_state*)page_address(pstate_page); +#else + para_state = &per_cpu(para_state, cpu); +#endif + /* + * Try to write to a magic MSR (which is invalid on any real CPU), + * and thus signal to KVM that we wish to enter para-virtualized + * mode: + */ + para_state->guest_version = KVM_PARA_API_VERSION; + para_state->host_version = -1; + para_state->size = sizeof(*para_state); + para_state->ret = -1; + + hypercall_addr_page = vmalloc_to_page(hypercall_addr); + para_state->hypercall_gpa = page_to_pfn(hypercall_addr_page) << PAGE_SHIFT | + offset_in_page(hypercall_addr); + printk(KERN_DEBUG "kvm guest: hypercall gpa is 0x%lx\n", (long)para_state->hypercall_gpa); + + if (wrmsr_safe(MSR_KVM_API_MAGIC, __pa(para_state), 0)) { + printk(KERN_INFO "KVM guest: WRMSR probe failed.\n"); + return -1; + } + + printk(KERN_DEBUG "kvm guest: host returned %d\n", para_state->ret); + printk(KERN_DEBUG "kvm guest: host version: %d\n", para_state->host_version); + printk(KERN_DEBUG "kvm guest: syscall entry: %02x %02x %02x %02x\n", + hypercall_addr[0], hypercall_addr[1], + hypercall_addr[2], 
hypercall_addr[3]); + + if (para_state->ret) { + printk(KERN_ERR "kvm guest: host refused registration.\n"); + return -1; + } + test_hypercall(); + + return 0; +} + +#endif + +/* + * The higher levels take care of making this non-re-entrant (it's + * called with bh's disabled). + */ +static int kvmnet_guest_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct kvm_eth_state *eth_state = dev->priv; + struct kvm_tx_ring_desc *tx_desc; + unsigned int ret; + + trace_hyper((unsigned long)skb, eth_state->tx_head, eth_state->tx_tail); + WARN_ON(eth_state->magic != KVM_ETH_MAGIC); + + /* Note that caller will free the skb on error so there is no leak*/ + if (!netif_running(dev)) { + pr_debug("netdev %s is not running, very weird!\n", + dev->name); + WARN_ON(1); + return -EFAULT; + } + + tx_desc = get_head_tx_desc(eth_state); + tx_desc->size = skb->len; + memcpy(__va(tx_desc->page_gpa), skb->data, skb->len); + commit_head_tx_desc(eth_state, tx_desc); + dev_kfree_skb(skb); + + ret = hypercall(0, __NR_hypercall_send_eth); + WARN_ON(ret); + pr_debug("hypercall ret: %d\n", ret); + + return 0; +} + +static int kvmnet_guest_rx(struct net_device *netdev, int quota) +{ + struct kvm_eth_state *eth_state = netdev->priv; + struct kvm_rx_ring_desc *rx_desc; + unsigned char *buff_gva; + struct sk_buff *skb = NULL; + int work_done = 0; + + pr_debug("kvmnet: got quota=%d\n", quota); + WARN_ON(eth_state->magic != KVM_ETH_MAGIC); + + if (!netif_running(netdev)) { + pr_debug("netdev %s is not running, not sending!\n", + netdev->name); + WARN_ON(1); + return work_done; + } + + pr_debug("kvmnet: rx ring head: %d\n", eth_state->rx_head); + pr_debug("kvmnet: rx ring tail: %d\n", eth_state->rx_tail); + + while (eth_state->rx_tail != eth_state->rx_head && quota--) { + + work_done++; + pr_debug(".. iteration #%03d\n", work_done); + rx_desc = get_tail_rx_desc(eth_state); + pr_debug(".. rx_desc->size: %d\n", rx_desc->size); + pr_debug(".. rx_desc->page_gpa: %08Lx\n", rx_desc->page_gpa); + buff_gva = __va(rx_desc->page_gpa); + pr_debug(".. buff_gva: %p\n", buff_gva); + skb = netdev_alloc_skb(netdev, rx_desc->size); + if (!skb) { + WARN_ON(1); + break; + } + skb_put(skb, rx_desc->size); + memcpy(skb->data, buff_gva, rx_desc->size); + + skb->dev = netdev; + skb->protocol = eth_type_trans(skb, netdev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + trace_hyper((unsigned long)skb, eth_state->rx_head, eth_state->rx_tail); + print_skb(skb); + netif_rx_ni(skb); + + free_tail_rx_desc(eth_state, rx_desc); + pr_debug(".. eth_state->rx_head: %d\n", eth_state->rx_head); + pr_debug(".. eth_state->rx_tail: %d\n", eth_state->rx_tail); + } + WARN_ON(!quota); + + pr_debug(".. The end. work_done=%d\n", work_done); + + //stats->rx_packets += work_done; + + return work_done; +} + +static irqreturn_t kvmnet_guest_rx_irq_handler(int irq, void *dev_id) +{ + struct net_device *netdev = dev_id; + struct kvm_eth_state *eth_state = netdev->priv; + + trace_hyper(irq, 0, 0); + pr_debug("kvmnet: got irq, netdev: %p, eth_state: %p\n", + netdev, eth_state); + WARN_ON(eth_state->magic != KVM_ETH_MAGIC); + +#if 0 + /* + * Test whether the host issued the irq. 
Use gen numbers for communication + */ + rmb(); + if (eth_state->irq_gen_host <= eth_state->irq_gen_guest && + eth_state->irq_gen_guest - eth_state->irq_gen_host > 0xfff) /*wrap around */{ + pr_debug("kvmnet: not our irq\n"); + WARN_ON(1); + return IRQ_NONE; + } + eth_state->irq_gen_guest++; +#endif + + if (!netif_running(netdev)) { + pr_debug("netdev %s is not running, not receiving!\n", + netdev->name); + WARN_ON(1); + return IRQ_HANDLED; + } + + pr_debug("kvmnet: rx ring head: %d\n", eth_state->rx_head); + pr_debug("kvmnet: rx ring tail: %d\n", eth_state->rx_tail); + + atomic_set(&eth_state->irq_enable, 1); + kvmnet_guest_rx(netdev, KVM_ETH_RX_RING_SIZE); +#if 0 + if (likely(netif_rx_schedule_prep(netdev))) { + atomic_set(&eth_state->irq_enable, 0); + __netif_rx_schedule(netdev); + } else + atomic_set(&eth_state->irq_enable, 1); +#endif + + return IRQ_HANDLED; +} + +static int kvmnet_guest_poll(struct net_device *netdev, int *budget) +{ + struct kvm_eth_state *eth_state = netdev->priv; + int orig_budget = min(*budget, netdev->quota); + int done = 1; + int work_done; + + pr_debug("got budget=%d, orig_budget=%d\n", *budget, orig_budget); + + work_done = kvmnet_guest_rx(netdev, orig_budget); + pr_debug("work_done=%d\n", work_done); + if (likely(work_done > 0)) { + *budget -= work_done; + netdev->quota -= work_done; + done = (work_done < orig_budget); + } + + if (done) { + pr_debug("deschedule polling\n"); + // re-enable interrupts after poll + atomic_set(&eth_state->irq_enable, 1); + netif_rx_complete(netdev); + } + pr_debug("End of function\n"); + + return !done; +} + +static const struct ethtool_ops kvmnet_ethtool_ops = { + .get_link = always_on, + .get_tso = ethtool_op_get_tso, + .set_tso = ethtool_op_set_tso, + .get_tx_csum = always_on, + .get_sg = always_on, + .get_rx_csum = always_on, +}; + +/* + * The kvmnet device is special. There is only one instance and + * it is statically allocated. Don't do this for other devices. 
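+ * ('One instance' refers to the module-global kvmnet_dev and eth_state below; the eth_state page is handed to the host exactly once, via __NR_hypercall_register_eth in kvmnet_register_guest().)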
+ */ +struct net_device *kvmnet_dev; + +struct kvm_eth_state *eth_state; + +static int kvmnet_register_guest(struct net_device *netdev) +{ + int ret, i; + gpa_t eth_state_gpa; + struct page *eth_state_page; + + eth_state_page = vmalloc_to_page(eth_state); + WARN_ON(eth_state_page == NULL); + if (!eth_state_page) + return -1; + + netdev->priv = eth_state; + printk(KERN_INFO "registering netdev %p, eth_state: %p\n", + netdev, netdev->priv); + + netdev->dev_addr[0] = 0x00; + netdev->dev_addr[1] = 0x11; + netdev->dev_addr[2] = 0x22; + netdev->dev_addr[3] = 0x33; + netdev->dev_addr[4] = 0x44; + netdev->dev_addr[5] = 0x55; + + eth_state->tx_head = 0; + eth_state->tx_tail = 0; + + for (i = 0; i < KVM_ETH_TX_RING_SIZE; i++) { + unsigned long page_gva; + + page_gva = __get_free_page(GFP_KERNEL | __GFP_ZERO); + + WARN_ON(!page_gva); + WARN_ON((int)(page_gva != (unsigned long)__va(__pa(page_gva)))); + eth_state->tx_ring[i].page_gpa = __pa(page_gva); + } + + eth_state->rx_head = 0; + eth_state->rx_tail = 0; + + eth_state->magic = KVM_ETH_MAGIC; + + //enable irqs + atomic_set(&eth_state->irq_enable, 1); + + for (i = 0; i < KVM_ETH_RX_RING_SIZE; i++) { + unsigned long page_gva; + + page_gva = __get_free_page(GFP_KERNEL | __GFP_ZERO); + + WARN_ON(!page_gva); + WARN_ON((int)(page_gva != (unsigned long)__va(__pa(page_gva)))); + eth_state->rx_ring[i].page_gpa = __pa(page_gva); + } + + eth_state_gpa = page_to_pfn(eth_state_page) << PAGE_SHIFT | + offset_in_page(eth_state); + + ret = hypercall(1, __NR_hypercall_register_eth, eth_state_gpa); + WARN_ON(ret); + + netdev->hard_start_xmit = kvmnet_guest_xmit; +// netdev->poll = kvmnet_guest_poll; + strcpy(netdev->name, "kvmnet-guest"); + ret = request_irq(eth_state->irq, kvmnet_guest_rx_irq_handler, + IRQF_SHARED | IRQF_SAMPLE_RANDOM, "kvmnet-guest", + netdev); + WARN_ON(ret); + set_irq_chip_and_handler_name(eth_state->irq, &dummy_irq_chip, + handle_simple_irq, "kvm"); + + return ret; +} + + +#define KVMNET_DRIVER_NAME "paravirt_network_driver" +#define KVMNET_DRIVER_VERSION "1" +#define PCI_VENDOR_ID_KVMNET 0x5002 +#define PCI_DEVICE_ID_KVMNET 0x1234 + +static struct pci_device_id kvmnet_pci_tbl[] = { +// {PCI_VENDOR_ID_KVMNET, PCI_DEVICE_ID_KVMNET, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + {PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0 }, + {0,} +}; +MODULE_DEVICE_TABLE (pci, kvmnet_pci_tbl); + +static int __devinit kvmnet_init_one(struct pci_dev *pdev, + const struct pci_device_id *ent) +{ + u8 pci_rev; + struct net_device *netdev; + int rs; + + WARN_ON(sizeof(struct kvm_eth_state) > PAGE_SIZE); + + printk(KERN_INFO "%s size of kvm_eth_state is %d\n", __FUNCTION__, + (int)sizeof(struct kvm_eth_state)); + netdev = alloc_etherdev(sizeof(struct kvm_eth_state)); + kvmnet_dev = netdev; + + netdev->ethtool_ops = &kvmnet_ethtool_ops; + netdev->flags &= ~IFF_MULTICAST; + netdev->mtu = 1500; + netdev->weight = 16; + +//#warning fixme + + if (!paravirt_enabled()) { + printk(KERN_ERR "%s failed registering para\n", __FUNCTION__); + return -1; + } + + if (eth_state) { + printk(KERN_ERR "kvmnet-guest already registered\n"); + return -1; + } + + eth_state = vmalloc(sizeof(struct kvm_eth_state)); + WARN_ON(!eth_state); + if (!eth_state) + return -1; + + if (pdev->irq) + eth_state->irq = pdev->irq; + else + eth_state->irq = 10; + + printk(KERN_DEBUG "%s: irq is %d\n", __FUNCTION__, pdev->irq); + + if (kvmnet_register_guest(netdev) < 0) + return -1; + + netdev->tx_queue_len = KVM_ETH_TX_RING_SIZE-1; + + pci_read_config_byte(pdev, PCI_REVISION_ID, &pci_rev); + + if 
(pdev->vendor == PCI_VENDOR_ID_KVMNET && + pdev->device == PCI_DEVICE_ID_KVMNET) { + printk(KERN_INFO "pci dev %s (id %04x:%04x rev %02x) is a " + "guest paravirt network device\n", + pci_name(pdev), pdev->vendor, pdev->device, pci_rev); + } + + rs = register_netdev(netdev); + netif_wake_queue(netdev); + netif_poll_enable(netdev); + + return 0; +} + +static void __devexit kvmnet_remove_one(struct pci_dev *pdev) +{ + pci_disable_device(pdev); + synchronize_irq(eth_state->irq); + free_irq(eth_state->irq, kvmnet_dev); + vfree(eth_state); + eth_state = NULL; +} + + +static struct pci_driver kvmnet_pci_driver = { + .name = KVMNET_DRIVER_NAME, + .id_table = kvmnet_pci_tbl, + .probe = kvmnet_init_one, + .remove = __devexit_p(kvmnet_remove_one), +}; + + +int __init kvmnet_init(void) +{ + if (paravirt_enabled()) + pci_module_init(&kvmnet_pci_driver); + + pr_debug("Finished registering & wakeup + poll guest netdev\n"); + return 0; +} + +static void __exit kvmnet_exit(void) +{ + netif_poll_disable(kvmnet_dev); + netif_stop_queue(kvmnet_dev); + unregister_netdev(kvmnet_dev); + + pci_unregister_driver(&kvmnet_pci_driver); + free_netdev(kvmnet_dev); +} + +module_init(kvmnet_init); +module_exit(kvmnet_exit); Index: linux/drivers/kvm/kvm_net.h =================================================================== --- /dev/null +++ linux/drivers/kvm/kvm_net.h @@ -0,0 +1,174 @@ +/* + * KVM/NET network driver - shared guest/host definitions + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007, Qumranet, Inc., Dor Laor + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "kvm.h" + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#undef pr_debug +#ifdef DEBUG +# define pr_debug(fmt,arg...) \ + do { printk("%s:%d " fmt, __FUNCTION__, __LINE__, ##arg); } while (0) +#else +# define pr_debug(fmt,arg...) 
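+/* Note: in non-DEBUG builds pr_debug() expands to nothing, so its arguments are dropped at preprocessing time and must not rely on side effects. */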
+#endif + +static inline void print_skb(struct sk_buff *skb) +{ +#ifdef DEBUG + int i; + + printk("skb %p, data: %p, len: %d\n", skb, skb->data, skb->len); + for (i = 0; i < skb->len; i++) { + printk(" %02x", skb->data[i]); + if ((i & 15) == 15) + printk("\n"); + } + if ((i & 15) != 15) + printk("\n"); +#endif +} + +struct pcpu_kvmstats { + unsigned long packets; + unsigned long bytes; +}; + + +/* + * Ring of buffers: + */ +static inline unsigned int tx_idx_next(unsigned int idx) +{ + return (idx + 1) & (KVM_ETH_TX_RING_SIZE - 1); +} + +static inline struct kvm_tx_ring_desc * +get_head_tx_desc(struct kvm_eth_state *eth_state) +{ + WARN_ON(tx_idx_next(eth_state->tx_head) == eth_state->tx_tail); + + return eth_state->tx_ring + eth_state->tx_head; +} + +static inline struct kvm_tx_ring_desc * +get_tail_tx_desc(struct kvm_eth_state *eth_state) +{ + WARN_ON(eth_state->tx_head == eth_state->tx_tail); + + return eth_state->tx_ring + eth_state->tx_tail; +} + +/* + * Make the new TX descriptor visible to the host: + */ +static inline void +commit_head_tx_desc(struct kvm_eth_state *eth_state, + struct kvm_tx_ring_desc *tx_desc) +{ + WARN_ON(tx_desc - eth_state->tx_ring != eth_state->tx_head); + + eth_state->tx_head = tx_idx_next(eth_state->tx_head); +} + +static inline void +free_tail_tx_desc(struct kvm_eth_state *eth_state, + struct kvm_tx_ring_desc *tx_desc) +{ + WARN_ON(tx_desc - eth_state->tx_ring != eth_state->tx_tail); + + eth_state->tx_tail = tx_idx_next(eth_state->tx_tail); +} + +static inline unsigned int rx_idx_next(unsigned int idx) +{ + return (idx + 1) & (KVM_ETH_RX_RING_SIZE - 1); +} + +static inline struct kvm_rx_ring_desc * +get_head_rx_desc(struct kvm_eth_state *eth_state) +{ + WARN_ON(rx_idx_next(eth_state->rx_head) == eth_state->rx_tail); + + return eth_state->rx_ring + eth_state->rx_head; +} + +static inline struct kvm_rx_ring_desc * +get_tail_rx_desc(struct kvm_eth_state *eth_state) +{ + WARN_ON(eth_state->rx_head == eth_state->rx_tail); + + return eth_state->rx_ring + eth_state->rx_tail; +} + +/* + * Make the new RX descriptor visible to the host: + */ +static inline void +commit_head_rx_desc(struct kvm_eth_state *eth_state, + struct kvm_rx_ring_desc *rx_desc) +{ + WARN_ON(rx_desc - eth_state->rx_ring != eth_state->rx_head); + + eth_state->rx_head = rx_idx_next(eth_state->rx_head); +} + +static inline void +free_tail_rx_desc(struct kvm_eth_state *eth_state, + struct kvm_rx_ring_desc *rx_desc) +{ + WARN_ON(rx_desc - eth_state->rx_ring != eth_state->rx_tail); + + eth_state->rx_tail = rx_idx_next(eth_state->rx_tail); +} + +extern struct net_device_stats kvmnet_stats; +DECLARE_PER_CPU(struct pcpu_kvmstats, pcpu_kvmstats); + +static inline struct net_device_stats *get_stats(struct net_device *dev) +{ + struct net_device_stats *stats = dev->priv; + unsigned long packets = 0; + unsigned long bytes = 0; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_kvmstats *kvm_stats; + + kvm_stats = &per_cpu(pcpu_kvmstats, i); + bytes += kvm_stats->bytes; + packets += kvm_stats->packets; + } + stats->rx_packets = packets; + stats->tx_packets = packets; + stats->rx_bytes = bytes; + stats->tx_bytes = bytes; + + return stats; +} + +static u32 always_on(struct net_device *dev) +{ + return 1; +} + Index: linux/drivers/kvm/kvm_net_host.c =================================================================== --- /dev/null +++ linux/drivers/kvm/kvm_net_host.c @@ -0,0 +1,287 @@ +/* + * KVM/NET host network driver + * + * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007, 
Qumranet, Inc., Dor Laor + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +//#define DEBUG + +#include "kvm.h" +#include "kvm_net.h" + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kvm_eth_state *eth_state_hva; + +/* + * The host ring state holds the gpa_to_hpa translation that is done once + */ +static struct { + unsigned long *tx_desc_page_hva[KVM_ETH_TX_RING_SIZE]; + unsigned long *rx_desc_page_hva[KVM_ETH_RX_RING_SIZE]; +} host_ring_state; + +int hypercall_register_eth(struct kvm_vcpu *vcpu, gpa_t eth_state_gpa) +{ + hpa_t eth_state_hpa; + hpa_t page_hpa; + int i, ret; + + pr_debug("hypercall_register_eth(%p): %08Lx\n", vcpu, eth_state_gpa); + + /* + * We only want to work with nice round addresses: + */ + if (PAGE_ALIGN(eth_state_gpa) != eth_state_gpa) { + WARN_ON(1); + return -EINVAL; + } + eth_state_hpa = gpa_to_hpa(vcpu, eth_state_gpa); + if (is_error_hpa(eth_state_hpa)) { + WARN_ON(1); + return -EINVAL; + } + pr_debug("eth_state_hpa: %08Lx\n", eth_state_hpa); + + /* + * TODO: kunmap() this upon exit: + */ + eth_state_hva = kmap(pfn_to_page(eth_state_hpa >> PAGE_SHIFT)); + + pr_debug("eth_state_hva: %p\n", eth_state_hva); + pr_debug("eth_state_hva magic: %08x\n", eth_state_hva->magic); + WARN_ON(eth_state_hva->magic != KVM_ETH_MAGIC); + + spin_lock(&vcpu->kvm->lock); + for (i = 0; i < KVM_ETH_TX_RING_SIZE; i++) { + page_hpa = gpa_to_hpa(vcpu, + eth_state_hva->tx_ring[i].page_gpa); + ret = is_error_hpa(page_hpa) ? -EFAULT : 0; + if (ret) { + WARN_ON(1); + spin_unlock(&vcpu->kvm->lock); + return -EINVAL; + } + /* + * TODO: unmap these upon exit: + */ + host_ring_state.tx_desc_page_hva[i] = + kmap(pfn_to_page(page_hpa >> PAGE_SHIFT)); + } + + for (i = 0; i < KVM_ETH_RX_RING_SIZE; i++) { + page_hpa = gpa_to_hpa(vcpu, + eth_state_hva->rx_ring[i].page_gpa); + ret = is_error_hpa(page_hpa) ? -EFAULT : 0; + if (ret) { + WARN_ON(1); + spin_unlock(&vcpu->kvm->lock); + return -EINVAL; + } + /* + * TODO: unmap these upon exit: + */ + host_ring_state.rx_desc_page_hva[i] = + kmap(pfn_to_page(page_hpa >> PAGE_SHIFT)); + } + spin_unlock(&vcpu->kvm->lock); + + return 0; +} + +int hypercall_send_eth(struct kvm_vcpu *vcpu) +{ + struct kvm_eth_state *eth_state = eth_state_hva; + struct net_device *netdev = vcpu->kvm->host_netdev; + struct kvm_tx_ring_desc *tx_desc; + unsigned long *buff_hva; + struct sk_buff *skb; + unsigned int ret = 0; + int idx; + + pr_debug("eth_state->tx_head: %d\n", eth_state->tx_head); + pr_debug("eth_state->tx_tail: %d\n", eth_state->tx_tail); + + while (eth_state->tx_tail != eth_state->tx_head) { + + tx_desc = get_tail_tx_desc(eth_state); + idx = tx_desc - eth_state->tx_ring; + pr_debug(".. tx_desc->size: %d\n", tx_desc->size); + buff_hva = host_ring_state.tx_desc_page_hva[idx]; + pr_debug(".. 
buff_hva: %p\n", buff_hva); + + if (!netif_running(netdev)) { + pr_debug("netdev %s is not running, not sending!\n", + netdev->name); + ret = -EFAULT; + break; + } + + skb = netdev_alloc_skb(netdev, tx_desc->size); + if (!skb) { + ret = -ENOMEM; + break; + } + skb_put(skb, tx_desc->size); + memcpy(skb->data, buff_hva, tx_desc->size); + + skb->dev = netdev; + skb->protocol = eth_type_trans(skb, netdev); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + trace_special((unsigned long)skb, eth_state->tx_head, eth_state->tx_tail); + //print_skb(skb); + netif_rx_ni(skb); + + free_tail_tx_desc(eth_state, tx_desc); + pr_debug(".. eth_state->tx_head: %d\n", eth_state->tx_head); + pr_debug(".. eth_state->tx_tail: %d\n", eth_state->tx_tail); + } + + return ret; +} + +/* + * Host side xmit. This prepares the packet into the shared buffer + * and injects an interrupt. + */ +static int kvmnet_host_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct kvm_eth_state *eth_state = eth_state_hva; + struct kvm_rx_ring_desc *rx_desc; + struct kvm *kvm = dev->priv; + struct kvm_vcpu *vcpu = &kvm->vcpus[0]; + struct task_struct *task; + unsigned long *buff_hva; + unsigned int irq; + int ret = 0; + u32 idx; + + trace_special((unsigned long)skb, eth_state->rx_head, eth_state->rx_tail); + + /* Note that caller will free the skb on error so there is no leak*/ + if (!netif_running(dev)) { + pr_debug("netdev %s is not running, very weird!\n", + dev->name); + WARN_ON(1); + return -EFAULT; + } + + idx = eth_state->rx_head; + rx_desc = eth_state->rx_ring + idx; + buff_hva = host_ring_state.rx_desc_page_hva[idx]; + pr_debug(".. buff_hva: %p\n", buff_hva); + + rx_desc->size = skb->len; + memcpy(buff_hva, skb->data, skb->len); + commit_head_rx_desc(eth_state, rx_desc); + +// rmb(); +// if (!atomic_read(ð_state->irq_enable)) +// goto out; + + irq = eth_state->irq; + task = vcpu->task; + + WARN_ON(!irq || irq >= NR_IRQS); + if (irq >= NR_IRQS) + goto out; + /* + * Inject the irq and cause the vcpu to vmexit to shorten irq latency. + */ + eth_state->irq_gen_host++; + pr_debug("test: host-dev xmit, injecting irq %d\n", irq); + set_bit(irq + 32, vcpu->irq_pending); + set_bit((irq + 32) / BITS_PER_LONG, &vcpu->irq_summary); + + /* + * A bit racy - replace with proper halt-wakeup: + */ + signal_wake_up(task, 0); +#if 0 + if (task->state == TASK_INTERRUPTIBLE) + wake_up_process(task); + else if (task->state == TASK_RUNNING) + kick_process(task); +#endif +out: + dev_kfree_skb(skb); + + return ret; +} + +struct net_device_stats kvmnet_stats; +DEFINE_PER_CPU(struct pcpu_kvmstats, pcpu_kvmstats); + +static const struct ethtool_ops kvmnet_ethtool_ops = { + .get_link = always_on, + .get_tso = ethtool_op_get_tso, + .set_tso = ethtool_op_set_tso, + .get_tx_csum = always_on, + .get_sg = always_on, + .get_rx_csum = always_on, +}; + +/* + * The kvmnet device is special. There is only one instance and + * it is statically allocated. Don't do this for other devices. 
+ */ + +int kvmnet_host_init(struct kvm* kvm) +{ + struct net_device *netdev; + + WARN_ON(sizeof(struct kvm_eth_state) > PAGE_SIZE); + + printk(KERN_INFO "%s size of kvm_eth_state is %d\n", __FUNCTION__, + (int)sizeof(struct kvm_eth_state)); + netdev = alloc_etherdev(sizeof(struct kvm_eth_state)); + netdev->priv = kvm; + kvm->host_netdev = netdev; + + netdev->ethtool_ops = &kvmnet_ethtool_ops; + + netdev->flags &= ~IFF_MULTICAST; + netdev->mtu = 1500; + netdev->weight = 16; + netdev->hard_start_xmit = kvmnet_host_xmit; + strcpy(netdev->name, "kvmnet-host"); + + netdev->dev_addr[0] = 0x00; + netdev->dev_addr[1] = 0x11; + netdev->dev_addr[2] = 0x22; + netdev->dev_addr[3] = 0x33; + netdev->dev_addr[4] = 0x44; + netdev->dev_addr[5] = 0x66; + netdev->tx_queue_len = KVM_ETH_RX_RING_SIZE-1; + + return register_netdev(netdev); +}; + +void kvmnet_host_exit(struct kvm* kvm) +{ + unregister_netdev(kvm->host_netdev); + free_netdev(kvm->host_netdev); +} + +EXPORT_SYMBOL_GPL(kvmnet_host_init); +EXPORT_SYMBOL_GPL(kvmnet_host_exit); Index: linux/drivers/kvm/mmu.c =================================================================== --- linux.orig/drivers/kvm/mmu.c +++ linux/drivers/kvm/mmu.c @@ -131,7 +131,7 @@ static int dbg = 1; (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) -#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & PAGE_MASK) +#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) #define PT64_DIR_BASE_ADDR_MASK \ (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) @@ -406,8 +406,8 @@ static void rmap_write_protect(struct kv spte = desc->shadow_ptes[0]; } BUG_ON(!spte); - BUG_ON((*spte & PT64_BASE_ADDR_MASK) != - page_to_pfn(page) << PAGE_SHIFT); + BUG_ON((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT + != page_to_pfn(page)); BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!(*spte & PT_WRITABLE_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); @@ -514,7 +514,7 @@ static void mmu_page_remove_parent_pte(s int i; if (!page->multimapped) { - BUG_ON(page->parent_pte != parent_pte); + WARN_ON(page->parent_pte != parent_pte); page->parent_pte = NULL; return; } @@ -541,7 +541,7 @@ static void mmu_page_remove_parent_pte(s } return; } - BUG(); + WARN_ON(1); } static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu, @@ -705,26 +705,33 @@ static void page_header_update_slot(stru __set_bit(slot, &page_head->slot_bitmap); } -hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - hpa_t hpa = gpa_to_hpa(vcpu, gpa); - - return is_error_hpa(hpa) ? bad_page_address | (gpa & ~PAGE_MASK): hpa; -} - -hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +hpa_t __gpa_to_hpa(struct kvm *kvm, gpa_t gpa) { struct kvm_memory_slot *slot; struct page *page; ASSERT((gpa & HPA_ERR_MASK) == 0); - slot = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT); + slot = gfn_to_memslot(kvm, gpa >> PAGE_SHIFT); if (!slot) return gpa | HPA_ERR_MASK; page = gfn_to_page(slot, gpa >> PAGE_SHIFT); return ((hpa_t)page_to_pfn(page) << PAGE_SHIFT) | (gpa & (PAGE_SIZE-1)); } +EXPORT_SYMBOL_GPL(__gpa_to_hpa); + +inline hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return __gpa_to_hpa(vcpu->kvm, gpa); +} +EXPORT_SYMBOL_GPL(gpa_to_hpa); + +hpa_t safe_gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + hpa_t hpa = gpa_to_hpa(vcpu, gpa); + + return is_error_hpa(hpa) ? 
bad_page_address | (gpa & ~PAGE_MASK): hpa; +} hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, gva_t gva) { @@ -734,6 +741,7 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu, return UNMAPPED_GVA; return gpa_to_hpa(vcpu, gpa); } +EXPORT_SYMBOL_GPL(gva_to_hpa); static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { @@ -787,7 +795,7 @@ static int nonpaging_map(struct kvm_vcpu static void mmu_free_roots(struct kvm_vcpu *vcpu) { - int i; + int i, j; struct kvm_mmu_page *page; #ifdef CONFIG_X86_64 @@ -801,21 +809,44 @@ static void mmu_free_roots(struct kvm_vc return; } #endif - for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; + /* + * Skip to the next cr3 filter entry and free it (if it's occupied): + */ + vcpu->cr3_cache_idx++; + if (unlikely(vcpu->cr3_cache_idx >= vcpu->cr3_cache_limit)) + vcpu->cr3_cache_idx = 0; - ASSERT(VALID_PAGE(root)); - root &= PT64_BASE_ADDR_MASK; - page = page_header(root); - --page->root_count; - vcpu->mmu.pae_root[i] = INVALID_PAGE; + j = vcpu->cr3_cache_idx; + /* + * Clear the guest-visible entry: + */ + if (vcpu->cr3_cache_page) { + struct kvm_cr3_cache *cache; + + cache = kmap_atomic(vcpu->cr3_cache_page, KM_USER0); + cache->entry[j].guest_cr3 = 0; + cache->entry[j].host_cr3 = 0; + kunmap_atomic(cache, KM_USER0); + } + ASSERT(vcpu->mmu.pae_root[j]); + if (VALID_PAGE(vcpu->mmu.pae_root[j][0])) { + vcpu->guest_cr3_gpa[j] = INVALID_PAGE; + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->mmu.pae_root[j][i]; + + ASSERT(VALID_PAGE(root)); + root &= PT64_BASE_ADDR_MASK; + page = page_header(root); + --page->root_count; + vcpu->mmu.pae_root[j][i] = INVALID_PAGE; + } } vcpu->mmu.root_hpa = INVALID_PAGE; } static void mmu_alloc_roots(struct kvm_vcpu *vcpu) { - int i; + int i, j; gfn_t root_gfn; struct kvm_mmu_page *page; @@ -834,8 +865,10 @@ static void mmu_alloc_roots(struct kvm_v return; } #endif + + j = vcpu->cr3_cache_idx; for (i = 0; i < 4; ++i) { - hpa_t root = vcpu->mmu.pae_root[i]; + hpa_t root = vcpu->mmu.pae_root[j][i]; ASSERT(!VALID_PAGE(root)); if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL) @@ -847,9 +880,14 @@ static void mmu_alloc_roots(struct kvm_v NULL); root = page->page_hpa; ++page->root_count; - vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK; + vcpu->mmu.pae_root[j][i] = root | PT_PRESENT_MASK; } - vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root); + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root[j]); + /* + * Store the guest-side address too, we need it if a guest + * exits the VM, to rediscover what cr3 it changed to: + */ + vcpu->guest_cr3_gpa[j] = vcpu->cr3; } static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) @@ -882,7 +920,13 @@ static int nonpaging_page_fault(struct k static void nonpaging_free(struct kvm_vcpu *vcpu) { - mmu_free_roots(vcpu); + int j; + + /* + * This will cycle through all existing roots and free them: + */ + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) + mmu_free_roots(vcpu); } static int nonpaging_init_context(struct kvm_vcpu *vcpu) @@ -901,23 +945,52 @@ static int nonpaging_init_context(struct return 0; } -static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code) { - ++kvm_stat.tlb_flush; - kvm_arch_ops->tlb_flush(vcpu); + if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) { + int j; + + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) + mmu_free_roots(vcpu); + + kvm_mmu_free_some_pages(vcpu); + + mmu_alloc_roots(vcpu); + } + return vcpu->mmu.page_fault(vcpu, gva, error_code); } static void paging_new_cr3(struct kvm_vcpu *vcpu) { pgprintk("%s: 
cr3 %lx\n", __FUNCTION__, vcpu->cr3); + mmu_free_roots(vcpu); if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); - kvm_mmu_flush_tlb(vcpu); + /* + * Setting the cr3 will flush the TLB: + */ kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa); } +/* + * Pagetable is freed by the guest, flush the cr3 cache: + */ +int hypercall_flush_cr3_cache(struct kvm_vcpu *vcpu, unsigned long old_cr3) +{ + int j; + + kvm_cr3_cache_clear(vcpu); + + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) + mmu_free_roots(vcpu); + + mmu_alloc_roots(vcpu); + + return 0; +} + static void mark_pagetable_nonglobal(void *shadow_pte) { page_header(__pa(shadow_pte))->global = 0; @@ -1093,22 +1166,40 @@ out: return r; } +static void mmu_pre_write_zap_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *page, + u64 *spte) +{ + u64 pte; + struct kvm_mmu_page *child; + + pte = *spte; + if (is_present_pte(pte)) { + if (page->role.level == PT_PAGE_TABLE_LEVEL) + rmap_remove(vcpu, spte); + else { + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(vcpu, child, spte); + } + } + *spte = 0; +} + void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *page; - struct kvm_mmu_page *child; struct hlist_node *node, *n; struct hlist_head *bucket; unsigned index; u64 *spte; - u64 pte; unsigned offset = offset_in_page(gpa); unsigned pte_size; unsigned page_offset; unsigned misaligned; int level; int flooded = 0; + int npte; pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes); if (gfn == vcpu->last_pt_write_gfn) { @@ -1144,22 +1235,26 @@ void kvm_mmu_pre_write(struct kvm_vcpu * } page_offset = offset; level = page->role.level; + npte = 1; if (page->role.glevels == PT32_ROOT_LEVEL) { - page_offset <<= 1; /* 32->64 */ + page_offset <<= 1; /* 32->64 */ + /* + * A 32-bit pde maps 4MB while the shadow pdes map + * only 2MB. So we need to double the offset again + * and zap two pdes instead of one. + */ + if (level == PT32_ROOT_LEVEL) { + page_offset <<= 1; + npte = 2; + } page_offset &= ~PAGE_MASK; } spte = __va(page->page_hpa); spte += page_offset / sizeof(*spte); - pte = *spte; - if (is_present_pte(pte)) { - if (level == PT_PAGE_TABLE_LEVEL) - rmap_remove(vcpu, spte); - else { - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(vcpu, child, spte); - } + while (npte--) { + mmu_pre_write_zap_pte(vcpu, page, spte); + ++spte; } - *spte = 0; } } @@ -1189,6 +1284,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_some_page static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *page; + int j; while (!list_empty(&vcpu->kvm->active_mmu_pages)) { page = container_of(vcpu->kvm->active_mmu_pages.next, @@ -1202,13 +1298,17 @@ static void free_mmu_pages(struct kvm_vc __free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT)); page->page_hpa = INVALID_PAGE; } - free_page((unsigned long)vcpu->mmu.pae_root); + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) { + ASSERT(vcpu->mmu.pae_root[j]); + free_page((unsigned long)vcpu->mmu.pae_root[j]); + vcpu->mmu.pae_root[j] = NULL; + } } static int alloc_mmu_pages(struct kvm_vcpu *vcpu) { struct page *page; - int i; + int i, j; ASSERT(vcpu); @@ -1225,17 +1325,22 @@ static int alloc_mmu_pages(struct kvm_vc ++vcpu->kvm->n_free_mmu_pages; } - /* - * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. - * Therefore we need to allocate shadow page tables in the first - * 4GB of memory, which happens to fit the DMA32 zone. 
- */ - page = alloc_page(GFP_KERNEL | __GFP_DMA32); - if (!page) - goto error_1; - vcpu->mmu.pae_root = page_address(page); - for (i = 0; i < 4; ++i) - vcpu->mmu.pae_root[i] = INVALID_PAGE; + for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) { + /* + * When emulating 32-bit mode, cr3 is only 32 bits even on + * x86_64. Therefore we need to allocate shadow page tables + * in the first 4GB of memory, which happens to fit the DMA32 + * zone: + */ + page = alloc_page(GFP_KERNEL | __GFP_DMA32 | __GFP_ZERO); + if (!page) + goto error_1; + + ASSERT(!vcpu->mmu.pae_root[j]); + vcpu->mmu.pae_root[j] = page_address(page); + for (i = 0; i < 4; ++i) + vcpu->mmu.pae_root[j][i] = INVALID_PAGE; + } return 0; @@ -1337,17 +1442,23 @@ static void audit_mappings_page(struct k static void audit_mappings(struct kvm_vcpu *vcpu) { - int i; + unsigned i, j; - if (vcpu->mmu.root_level == 4) + if (vcpu->mmu.root_level == 4) { audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4); - else - for (i = 0; i < 4; ++i) - if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK) + return; + } + + j = vcpu->cr3_cache_idx; +// for (j = 0; j < KVM_CR3_CACHE_SIZE; j++) { + ASSERT(vcpu->mmu.pae_root[j]); + for (i = 0; i < 4; ++i) { + if (vcpu->mmu.pae_root[j][i] & PT_PRESENT_MASK) { audit_mappings_page(vcpu, - vcpu->mmu.pae_root[i], - i << 30, - 2); + vcpu->mmu.pae_root[j][i], i << 30, 2); + } + } +// } } static int count_rmaps(struct kvm_vcpu *vcpu) Index: linux/drivers/kvm/paging_tmpl.h =================================================================== --- linux.orig/drivers/kvm/paging_tmpl.h +++ linux/drivers/kvm/paging_tmpl.h @@ -236,7 +236,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu shadow_addr = vcpu->mmu.root_hpa; level = vcpu->mmu.shadow_root_level; if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3]; + shadow_addr = vcpu->mmu.pae_root[vcpu->cr3_cache_idx][(addr >> 30) & 3]; shadow_addr &= PT64_BASE_ADDR_MASK; --level; } Index: linux/drivers/kvm/svm.c =================================================================== --- linux.orig/drivers/kvm/svm.c +++ linux/drivers/kvm/svm.c @@ -98,20 +98,28 @@ static unsigned get_addr_size(struct kvm static inline u8 pop_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - + int word_index; + int bit_index; + int irq; + + spin_lock_bh(&vcpu->irq_lock); + word_index = __ffs(vcpu->irq_summary); + bit_index = __ffs(vcpu->irq_pending[word_index]); + irq = word_index * BITS_PER_LONG + bit_index; clear_bit(bit_index, &vcpu->irq_pending[word_index]); if (!vcpu->irq_pending[word_index]) clear_bit(word_index, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); + return irq; } static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) { + spin_lock_bh(&vcpu->irq_lock); set_bit(irq, vcpu->irq_pending); set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); } static inline void clgi(void) @@ -602,12 +610,20 @@ static void svm_free_vcpu(struct kvm_vcp static void svm_vcpu_load(struct kvm_vcpu *vcpu) { - get_cpu(); + int cpu = raw_smp_processor_id(); + cpumask_t this_mask = cpumask_of_cpu(cpu); + + /* + * Keep the context preemptible, but do not migrate + * away to another CPU. TODO: make sure this persists. + * Save/restore original mask. 
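+ * (get_cpu() would keep preemption disabled for as long as the vcpu is loaded, which PREEMPT_RT must avoid; pinning the task with set_cpus_allowed() keeps it on this CPU while leaving it preemptible.)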
+ */ + if (unlikely(!cpus_equal(current->cpus_allowed, this_mask))) + set_cpus_allowed(current, cpumask_of_cpu(cpu)); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - put_cpu(); } static void svm_vcpu_decache(struct kvm_vcpu *vcpu) @@ -1078,6 +1094,10 @@ static int halt_interception(struct kvm_ static int vmmcall_interception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + if (vcpu->svm->vmcb->save.cpl != 0) { + inject_ud(vcpu); + return 1; + } vcpu->svm->vmcb->save.rip += 3; return kvm_hypercall(vcpu, kvm_run); } Index: linux/drivers/kvm/vmx.c =================================================================== --- linux.orig/drivers/kvm/vmx.c +++ linux/drivers/kvm/vmx.c @@ -134,7 +134,7 @@ static void vcpu_clear(struct kvm_vcpu * vcpu->launched = 0; } -static unsigned long vmcs_readl(unsigned long field) +static __always_inline unsigned long vmcs_readl(unsigned long field) { unsigned long value; @@ -143,12 +143,12 @@ static unsigned long vmcs_readl(unsigned return value; } -static u16 vmcs_read16(unsigned long field) +static __always_inline u16 vmcs_read16(unsigned long field) { return vmcs_readl(field); } -static u32 vmcs_read32(unsigned long field) +static __always_inline u32 vmcs_read32(unsigned long field) { return vmcs_readl(field); } @@ -207,9 +207,16 @@ static void vmcs_write64(unsigned long f static void vmx_vcpu_load(struct kvm_vcpu *vcpu) { u64 phys_addr = __pa(vcpu->vmcs); - int cpu; + int cpu = raw_smp_processor_id(); + cpumask_t this_mask = cpumask_of_cpu(cpu); - cpu = get_cpu(); + /* + * Keep the context preemptible, but do not migrate + * away to another CPU. TODO: make sure this persists. + * Save/restore original mask. + */ + if (unlikely(!cpus_equal(current->cpus_allowed, this_mask))) + set_cpus_allowed(current, cpumask_of_cpu(cpu)); if (vcpu->cpu != cpu) vcpu_clear(vcpu); @@ -246,7 +253,6 @@ static void vmx_vcpu_load(struct kvm_vcp static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { - put_cpu(); } static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) @@ -371,10 +377,10 @@ static int vmx_get_msr(struct kvm_vcpu * data = vmcs_read32(GUEST_SYSENTER_CS); break; case MSR_IA32_SYSENTER_EIP: - data = vmcs_read32(GUEST_SYSENTER_EIP); + data = vmcs_readl(GUEST_SYSENTER_EIP); break; case MSR_IA32_SYSENTER_ESP: - data = vmcs_read32(GUEST_SYSENTER_ESP); + data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: msr = find_msr_entry(vcpu, msr_index); @@ -412,10 +418,10 @@ static int vmx_set_msr(struct kvm_vcpu * vmcs_write32(GUEST_SYSENTER_CS, data); break; case MSR_IA32_SYSENTER_EIP: - vmcs_write32(GUEST_SYSENTER_EIP, data); + vmcs_writel(GUEST_SYSENTER_EIP, data); break; case MSR_IA32_SYSENTER_ESP: - vmcs_write32(GUEST_SYSENTER_ESP, data); + vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); @@ -802,9 +808,56 @@ static void vmx_set_cr0_no_modeswitch(st vcpu->cr0 = cr0; } -static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +static void print_area_in_hex(void *area, int size) +{ + unsigned char *data = area; + int i; + + for (i = 0; i < size; i++, data++) { + if (!(i & 15)) + printk("\n%p:", data); + printk(" %02x", *data); + } + printk("\n"); +} + +/* + * Clear the guest side of the cr3 cache: + */ +void kvm_cr3_cache_clear(struct kvm_vcpu *vcpu) +{ + struct kvm_cr3_cache *cache; + + if (!vcpu->cr3_cache_page) + return; + cache = kmap_atomic(vcpu->cr3_cache_page, KM_USER1); + memset(cache->entry, 0, sizeof(cache->entry)); + kunmap_atomic(cache, KM_USER1); +} + +static void vmx_set_cr3(struct kvm_vcpu *vcpu, 
unsigned long cr3_hpa) { - vmcs_writel(GUEST_CR3, cr3); + struct kvm_cr3_cache *cache; + int idx; + + vmcs_writel(GUEST_CR3, cr3_hpa); + if (!vcpu->cr3_cache_page) + return; + + WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa); + + idx = vcpu->cr3_cache_idx; + cache = kmap_atomic(vcpu->cr3_cache_page, KM_USER0); + + /* NOTE: remove this check, in case of hostile guests: */ + WARN_ON(cache->entry[idx].guest_cr3); + WARN_ON(cache->entry[idx].host_cr3); + + cache->entry[idx].guest_cr3 = vcpu->cr3; + cache->entry[idx].host_cr3 = cr3_hpa; + kunmap_atomic(cache, KM_USER0); + + vmcs_writel(CR3_TARGET_VALUE0 + idx*2, cr3_hpa); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -981,6 +1034,42 @@ static void seg_setup(int seg) } /* + * Set up the cr3 validity hardware cache: + */ +static int vmcs_setup_cr3_cache(struct kvm_vcpu *vcpu) +{ + unsigned int cr3_target_values, i; + u64 msr_val; + + rdmsrl(MSR_IA32_VMX_MISC, msr_val); + + printk(KERN_DEBUG "MSR_IA32_VMX_MISC: %016Lx\n", msr_val); + + /* + * 9 bits of "CR3 target values": + */ + cr3_target_values = (msr_val >> 16) & ((1 << 10) - 1); + printk(KERN_DEBUG " cr3 target values: %d\n", cr3_target_values); + if (cr3_target_values > KVM_CR3_CACHE_SIZE) { + printk(KERN_WARNING "KVM: limiting cr3 cache size from %d to %d\n", + cr3_target_values, KVM_CR3_CACHE_SIZE); + cr3_target_values = KVM_CR3_CACHE_SIZE; + } + + vcpu->cr3_cache_idx = 0; + vcpu->cr3_cache_limit = cr3_target_values; + /* + * Initialize. TODO: set this to guest physical memory. + */ + for (i = 0; i < cr3_target_values; i++) + vmcs_writel(CR3_TARGET_VALUE0 + i*2, -1UL); + + vmcs_write32(CR3_TARGET_COUNT, cr3_target_values); + + return 0; +} + +/* * Sets up the vmcs for emulated real mode. */ static int vmx_vcpu_setup(struct kvm_vcpu *vcpu) @@ -1084,7 +1173,10 @@ static int vmx_vcpu_setup(struct kvm_vcp vmcs_write32(EXCEPTION_BITMAP, 1 << PF_VECTOR); vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); - vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ + + ret = vmcs_setup_cr3_cache(vcpu); + if (ret < 0) + goto out; vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ @@ -1227,13 +1319,18 @@ static void inject_rmode_irq(struct kvm_ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { - int word_index = __ffs(vcpu->irq_summary); - int bit_index = __ffs(vcpu->irq_pending[word_index]); - int irq = word_index * BITS_PER_LONG + bit_index; - + int word_index; + int bit_index; + int irq; + + spin_lock_bh(&vcpu->irq_lock); + word_index = __ffs(vcpu->irq_summary); + bit_index = __ffs(vcpu->irq_pending[word_index]); + irq = word_index * BITS_PER_LONG + bit_index; clear_bit(bit_index, &vcpu->irq_pending[word_index]); if (!vcpu->irq_pending[word_index]) clear_bit(word_index, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); if (vcpu->rmode.active) { inject_rmode_irq(vcpu, irq); @@ -1322,8 +1419,11 @@ static int handle_exception(struct kvm_v if (is_external_interrupt(vect_info)) { int irq = vect_info & VECTORING_INFO_VECTOR_MASK; + + spin_lock_bh(&vcpu->irq_lock); set_bit(irq, vcpu->irq_pending); set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary); + spin_unlock_bh(&vcpu->irq_lock); } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ @@ -1338,6 +1438,7 @@ static int handle_exception(struct kvm_v cr2 = vmcs_readl(EXIT_QUALIFICATION); spin_lock(&vcpu->kvm->lock); + kvm_cr3_cache_clear(vcpu); r = kvm_mmu_page_fault(vcpu, cr2, error_code); if (r < 0) { 
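 /* A negative return means the fault could not be handled; propagate the error. */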
spin_unlock(&vcpu->kvm->lock); @@ -1498,6 +1599,15 @@ static int handle_cr(struct kvm_vcpu *vc skip_emulated_instruction(vcpu); return 1; case 3: + /* + * A para-guest must not attempt to set + * %cr3 due to possible aliasing of host-physical + * and guest-physical addresses: + */ + if (unlikely(vcpu->cr3_cache_page)) { + vmx_inject_gp(vcpu, 0); + return 0; + } vcpu_load_rsp_rip(vcpu); set_cr3(vcpu, vcpu->regs[reg]); skip_emulated_instruction(vcpu); @@ -1695,6 +1805,14 @@ static int kvm_handle_exit(struct kvm_ru u32 vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); u32 exit_reason = vmcs_read32(VM_EXIT_REASON); + /* + * Profile KVM exit RIPs - except vmcalls that get profiled + * in a more finegrained way, separately: + */ + if (unlikely(prof_on == KVM_PROFILING && + exit_reason != EXIT_REASON_VMCALL)) + profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); + if ( (vectoring_info & VECTORING_INFO_VALID_MASK) && exit_reason != EXIT_REASON_EXCEPTION_NMI ) printk(KERN_WARNING "%s: unexpected, valid vectoring info and " @@ -1725,6 +1843,44 @@ static int dm_request_for_irq_injection( (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); } +void noinline kvm_cr3_cache_sync(struct kvm_vcpu *vcpu) +{ + void *guest_cr3_hva; + hpa_t guest_cr3_hpa; + u64 *root; + int j; + + if (!vcpu->cr3_cache_page) + return; + + guest_cr3_hpa = vmcs_readl(GUEST_CR3); + + /* + * Are they in sync already? + */ + if (guest_cr3_hpa == vcpu->mmu.root_hpa) + return; + + guest_cr3_hva = __va(guest_cr3_hpa); + + for (j = 0; j < vcpu->cr3_cache_limit; j++) { + root = vcpu->mmu.pae_root[j]; + WARN_ON(!root); + if (root != guest_cr3_hva) + continue; + + vcpu->cr3 = vcpu->guest_cr3_gpa[j]; + vcpu->cr3_cache_idx = j; + vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root[j]); + /* + * Read in the pdptrs: + */ + __load_pdptrs(vcpu, vcpu->cr3); + break; + } + WARN_ON(j == KVM_CR3_CACHE_SIZE); +} + static int vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { u8 fail; @@ -1763,12 +1919,15 @@ again: if (vcpu->guest_debug.enabled) kvm_guest_debug_pre(vcpu); + WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa); + fx_save(vcpu->host_fx_image); fx_restore(vcpu->guest_fx_image); save_msrs(vcpu->host_msrs, vcpu->nmsrs); load_msrs(vcpu->guest_msrs, NR_BAD_MSRS); + preempt_disable(); asm ( /* Store host registers */ "pushf \n\t" @@ -1888,6 +2047,29 @@ again: [cr2]"i"(offsetof(struct kvm_vcpu, cr2)) : "cc", "memory" ); + /* + * Reload segment selectors ASAP. (it's needed for a functional + * kernel: x86 relies on having __KERNEL_PDA in %fs and x86_64 + * relies on having 0 in %gs for the CPU PDA to work.) + */ + if (fs_gs_ldt_reload_needed) { + load_ldt(ldt_sel); + load_fs(fs_sel); + /* + * If we have to reload gs, we must take care to + * preserve our gs base. 
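+ * An interrupt arriving between load_gs() and the MSR_GS_BASE write below would run on a stale gs base, so both are done within one irq-disabled section.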
+ */ + local_irq_disable(); + load_gs(gs_sel); +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); +#endif + local_irq_enable(); + + reload_tss(); + } + preempt_enable(); + ++kvm_stat.exits; save_msrs(vcpu->guest_msrs, NR_BAD_MSRS); @@ -1897,6 +2079,14 @@ again: fx_restore(vcpu->host_fx_image); vcpu->interrupt_window_open = (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + /* + * Figure out whether vcpu->cr3 needs updating because + * the guest made use of the cr3 cache: + */ + kvm_cr3_cache_sync(vcpu); + + WARN_ON(vmcs_readl(GUEST_CR3) != vcpu->mmu.root_hpa); + asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); kvm_run->exit_type = 0; @@ -1905,28 +2095,6 @@ again: kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR); r = 0; } else { - if (fs_gs_ldt_reload_needed) { - load_ldt(ldt_sel); - load_fs(fs_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_disable(); - load_gs(gs_sel); -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); -#endif - local_irq_enable(); - - reload_tss(); - } - /* - * Profile KVM exit RIPs: - */ - if (unlikely(prof_on == KVM_PROFILING)) - profile_hit(KVM_PROFILING, (void *)vmcs_readl(GUEST_RIP)); - vcpu->launched = 1; kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT; r = kvm_handle_exit(kvm_run, vcpu); @@ -1956,6 +2124,7 @@ again: static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vmcs_writel(GUEST_CR3, vmcs_readl(GUEST_CR3)); + kvm_cr3_cache_clear(vcpu); } static void vmx_inject_page_fault(struct kvm_vcpu *vcpu, @@ -1985,7 +2154,7 @@ static void vmx_inject_page_fault(struct INTR_TYPE_EXCEPTION | INTR_INFO_DELIEVER_CODE_MASK | INTR_INFO_VALID_MASK); - + kvm_cr3_cache_clear(vcpu); } static void vmx_free_vmcs(struct kvm_vcpu *vcpu) Index: linux/drivers/kvm/vmx.h =================================================================== --- linux.orig/drivers/kvm/vmx.h +++ linux/drivers/kvm/vmx.h @@ -293,5 +293,6 @@ enum vmcs_field { #define MSR_IA32_VMX_PROCBASED_CTLS 0x482 #define MSR_IA32_VMX_EXIT_CTLS 0x483 #define MSR_IA32_VMX_ENTRY_CTLS 0x484 +#define MSR_IA32_VMX_MISC 0x485 #endif Index: linux/drivers/macintosh/adb.c =================================================================== --- linux.orig/drivers/macintosh/adb.c +++ linux/drivers/macintosh/adb.c @@ -256,6 +256,8 @@ adb_probe_task(void *x) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + down(&adb_probe_mutex); + printk(KERN_INFO "adb: starting probe task...\n"); do_adb_reset_bus(); printk(KERN_INFO "adb: finished probe task...\n"); @@ -282,7 +284,9 @@ adb_reset_bus(void) return 0; } - down(&adb_probe_mutex); + if (adb_got_sleep) + return 0; + schedule_work(&adb_reset_work); return 0; } @@ -347,23 +351,21 @@ adb_notify_sleep(struct pmu_sleep_notifi switch (when) { case PBOOK_SLEEP_REQUEST: + /* Signal to discontinue probing */ adb_got_sleep = 1; - /* We need to get a lock on the probe thread */ - down(&adb_probe_mutex); /* Stop autopoll */ if (adb_controller->autopoll) adb_controller->autopoll(0); ret = blocking_notifier_call_chain(&adb_client_list, ADB_MSG_POWERDOWN, NULL); if (ret & NOTIFY_STOP_MASK) { - up(&adb_probe_mutex); + adb_got_sleep = 0; return PBOOK_SLEEP_REFUSE; } break; case PBOOK_SLEEP_REJECT: if (adb_got_sleep) { adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); } break; @@ -372,7 +374,6 @@ adb_notify_sleep(struct pmu_sleep_notifi break; case PBOOK_WAKE: adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); } break; } Index: linux/drivers/md/linear.c 
=================================================================== --- linux.orig/drivers/md/linear.c +++ linux/drivers/md/linear.c @@ -188,7 +188,7 @@ static linear_conf_t *linear_conf(mddev_ for (i=0; i < cnt-1 ; i++) { sector_t sz = 0; int j; - for (j=i; i<cnt-1; j++) + for (j=i; j<cnt-1; j++) sz += conf->disks[j].size; if (sz >= min_spacing && sz < conf->hash_spacing) conf->hash_spacing = sz; Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.c =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.c +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -98,7 +98,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; struct task_struct *thread; Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.h =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.h +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.h @@ -142,7 +142,7 @@ struct dvb_fe_events { int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { Index: linux/drivers/mtd/devices/block2mtd.c =================================================================== --- linux.orig/drivers/mtd/devices/block2mtd.c +++ linux/drivers/mtd/devices/block2mtd.c @@ -40,6 +40,7 @@ struct block2mtd_dev { static LIST_HEAD(blkmtd_device_list); +#if 0 #define PAGE_READAHEAD 64 static void cache_readahead(struct address_space *mapping, int index) { @@ -84,12 +85,12 @@ static void cache_readahead(struct addre if (ret) read_cache_pages(mapping, &page_pool, filler, NULL); } - +#endif static struct page* page_readahead(struct address_space *mapping, int index) { filler_t *filler = (filler_t*)mapping->a_ops->readpage; - cache_readahead(mapping, index); +// cache_readahead(mapping, index); return read_cache_page(mapping, index, filler, NULL); } Index: linux/drivers/net/3c527.c =================================================================== --- linux.orig/drivers/net/3c527.c +++ linux/drivers/net/3c527.c @@ -182,7 +182,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; Index: linux/drivers/net/3c59x.c =================================================================== --- linux.orig/drivers/net/3c59x.c +++ linux/drivers/net/3c59x.c @@ -792,9 +792,9 @@ static void poll_vortex(struct net_devic { struct vortex_private *vp = netdev_priv(dev); unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); (vp->full_bus_master_rx ? 
boomerang_interrupt:vortex_interrupt)(dev->irq,dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1728,6 +1728,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1735,7 +1736,7 @@ vortex_timer(unsigned long data) printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1758,9 +1759,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1816,7 +1815,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1849,13 +1848,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev); else vortex_interrupt(dev->irq, dev); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: linux/drivers/net/8139too.c =================================================================== --- linux.orig/drivers/net/8139too.c +++ linux/drivers/net/8139too.c @@ -2132,10 +2132,10 @@ static int rtl8139_poll(struct net_devic * Order is important since data can get interrupted * again when we think we are done. 
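 * (The _nort variants below hard-disable interrupts only on !PREEMPT_RT kernels; under PREEMPT_RT interrupt handlers run in threads, so this ordering needs no hard-irq disable there.)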
*/ - local_irq_save(flags); + local_irq_save_nort(flags); RTL_W16_F(IntrMask, rtl8139_intr_mask); __netif_rx_complete(dev); - local_irq_restore(flags); + local_irq_restore_nort(flags); } spin_unlock(&tp->rx_lock); @@ -2215,7 +2215,11 @@ static irqreturn_t rtl8139_interrupt (in */ static void rtl8139_poll_controller(struct net_device *dev) { - disable_irq(dev->irq); + /* + * use _nosync() variant - might be used by netconsole + * from atomic contexts: + */ + disable_irq_nosync(dev->irq); rtl8139_interrupt(dev->irq, dev); enable_irq(dev->irq); } Index: linux/drivers/net/e1000/e1000_main.c =================================================================== --- linux.orig/drivers/net/e1000/e1000_main.c +++ linux/drivers/net/e1000/e1000_main.c @@ -3363,10 +3363,8 @@ e1000_xmit_frame(struct sk_buff *skb, st (adapter->hw.mac_type == e1000_82573)) e1000_transfer_dhcp_info(adapter, skb); - local_irq_save(flags); - if (!spin_trylock(&tx_ring->tx_lock)) { + if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } Index: linux/drivers/net/hamradio/6pack.c =================================================================== --- linux.orig/drivers/net/hamradio/6pack.c +++ linux/drivers/net/hamradio/6pack.c @@ -123,7 +123,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; Index: linux/drivers/net/hamradio/mkiss.c =================================================================== --- linux.orig/drivers/net/hamradio/mkiss.c +++ linux/drivers/net/hamradio/mkiss.c @@ -84,7 +84,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ Index: linux/drivers/net/ibm_emac/ibm_emac_core.c =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.c +++ linux/drivers/net/ibm_emac/ibm_emac_core.c @@ -1060,6 +1060,8 @@ static inline int emac_xmit_finish(struc ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ -1073,6 +1075,7 @@ static int emac_start_xmit(struct sk_buf u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY | MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb); + spin_lock(&dev->tx_lock); slot = dev->tx_slot++; if (dev->tx_slot == NUM_TX_BUFF) { dev->tx_slot = 0; @@ -1135,6 +1138,8 @@ static int emac_start_xmit_sg(struct sk_ if (likely(!nr_frags && len <= MAL_MAX_TX_SIZE)) return emac_start_xmit(skb, ndev); + spin_lock(&dev->tx_lock); + len -= skb->data_len; /* Note, this is only an *estimation*, we can still run out of empty @@ -1203,6 +1208,7 @@ static int emac_start_xmit_sg(struct sk_ stop_queue: netif_stop_queue(ndev); DBG2("%d: stopped TX queue" NL, dev->def->index); + spin_unlock(&dev->tx_lock); return 1; } #else @@ -1242,6 +1248,7 @@ static void emac_poll_tx(void *param) DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt, dev->ack_slot); + spin_lock(&dev->tx_lock); if (dev->tx_cnt) { u16 ctrl; int slot = dev->ack_slot, n = 0; @@ -1251,6 +1258,7 @@ static void emac_poll_tx(void *param) struct sk_buff *skb = dev->tx_skb[slot]; ++n; + spin_unlock(&dev->tx_lock); if (skb) { dev_kfree_skb(skb); dev->tx_skb[slot] = NULL; @@ -1260,6 +1268,7 @@ static void emac_poll_tx(void *param) if 
(unlikely(EMAC_IS_BAD_TX(ctrl))) emac_parse_tx_error(dev, ctrl); + spin_lock(&dev->tx_lock); if (--dev->tx_cnt) goto again; } @@ -1272,6 +1281,7 @@ static void emac_poll_tx(void *param) DBG2("%d: tx %d pkts" NL, dev->def->index, n); } } + spin_unlock(&dev->tx_lock); } static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot, @@ -1965,6 +1975,7 @@ static int __init emac_probe(struct ocp_ dev->ldev = &ocpdev->dev; dev->def = ocpdev->def; SET_MODULE_OWNER(ndev); + spin_lock_init(&dev->tx_lock); /* Find MAL device we are connected to */ maldev = Index: linux/drivers/net/ibm_emac/ibm_emac_core.h =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.h +++ linux/drivers/net/ibm_emac/ibm_emac_core.h @@ -193,6 +193,8 @@ struct ocp_enet_private { struct ibm_emac_error_stats estats; struct net_device_stats nstats; + spinlock_t tx_lock; + struct device* ldev; }; Index: linux/drivers/net/loopback.c =================================================================== --- linux.orig/drivers/net/loopback.c +++ linux/drivers/net/loopback.c @@ -153,14 +153,14 @@ static int loopback_xmit(struct sk_buff #endif dev->last_rx = jiffies; - /* it's OK to use __get_cpu_var() because BHs are off */ - lb_stats = &__get_cpu_var(pcpu_lstats); + lb_stats = &per_cpu(pcpu_lstats, get_cpu()); lb_stats->bytes += skb->len; lb_stats->packets++; + put_cpu(); - netif_rx(skb); + netif_rx_ni(skb); return 0; } static struct net_device_stats loopback_stats; Index: linux/drivers/net/netconsole.c =================================================================== --- linux.orig/drivers/net/netconsole.c +++ linux/drivers/net/netconsole.c @@ -68,21 +68,16 @@ static int configured = 0; static void write_msg(struct console *con, const char *msg, unsigned int len) { int frag, left; - unsigned long flags; if (!np.dev) return; - local_irq_save(flags); - - for(left = len; left; ) { + for (left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); netpoll_send_udp(&np, msg, frag); msg += frag; left -= frag; } - - local_irq_restore(flags); } static struct console netconsole = { Index: linux/drivers/net/plip.c =================================================================== --- linux.orig/drivers/net/plip.c +++ linux/drivers/net/plip.c @@ -228,7 +228,10 @@ struct net_local { struct hh_cache *hh); spinlock_t lock; atomic_t kill_timer; - struct semaphore killed_timer_sem; + /* + * PREEMPT_RT: this isn't a mutex, it should be a struct completion.
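+ * compat_semaphore is assumed to be the -rt tree's "stay a real + * semaphore" type: unlike a plain struct semaphore, which PREEMPT_RT + * maps to a PI mutex, it keeps classic counting down()/up() semantics, + * so up() from timer or IRQ context remains valid.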
+ */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) Index: linux/drivers/net/ppp_async.c =================================================================== --- linux.orig/drivers/net/ppp_async.c +++ linux/drivers/net/ppp_async.c @@ -67,7 +67,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: linux/drivers/net/ppp_synctty.c =================================================================== --- linux.orig/drivers/net/ppp_synctty.c +++ linux/drivers/net/ppp_synctty.c @@ -70,7 +70,7 @@ struct syncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: linux/drivers/net/sungem.c =================================================================== --- linux.orig/drivers/net/sungem.c +++ linux/drivers/net/sungem.c @@ -1039,10 +1039,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ Index: linux/drivers/net/tulip/tulip_core.c =================================================================== --- linux.orig/drivers/net/tulip/tulip_core.c +++ linux/drivers/net/tulip/tulip_core.c @@ -1807,6 +1807,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: linux/drivers/oprofile/oprofilefs.c =================================================================== --- linux.orig/drivers/oprofile/oprofilefs.c +++ linux/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux/drivers/pci/access.c =================================================================== --- linux.orig/drivers/pci/access.c +++ linux/drivers/pci/access.c @@ -11,7 +11,7 @@ * configuration space. */ -static DEFINE_SPINLOCK(pci_lock); +static DEFINE_RAW_SPINLOCK(pci_lock); /* * Wrappers for all PCI configuration access functions. 
They just check Index: linux/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux/drivers/pci/hotplug/cpci_hotplug_core.c @@ -59,8 +59,8 @@ static int slots; static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); Index: linux/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux/drivers/pci/hotplug/cpqphp_ctrl.c @@ -45,8 +45,8 @@ static int configure_new_function(struct u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: linux/drivers/pci/hotplug/ibmphp_hpc.c =================================================================== --- linux.orig/drivers/pci/hotplug/ibmphp_hpc.c +++ linux/drivers/pci/hotplug/ibmphp_hpc.c @@ -106,7 +106,7 @@ static int tid_poll; static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- Index: linux/drivers/pci/hotplug/pciehp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/pciehp_ctrl.c +++ linux/drivers/pci/hotplug/pciehp_ctrl.c @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ static unsigned long surprise_rm_pending; /* = 0 */ Index: linux/drivers/pci/pcie/aer/aerdrv.c =================================================================== --- linux.orig/drivers/pci/pcie/aer/aerdrv.c +++ linux/drivers/pci/pcie/aer/aerdrv.c @@ -157,7 +157,7 @@ static struct aer_rpc* 
aer_alloc_rpc(str * Initialize Root lock access, e_lock, to Root Error Status Reg, * Root Error ID Reg, and Root error producer/consumer index. */ - rpc->e_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&rpc->e_lock); rpc->rpd = dev; INIT_WORK(&rpc->dpc_handler, aer_isr); Index: linux/drivers/pnp/manager.c =================================================================== --- linux.orig/drivers/pnp/manager.c +++ linux/drivers/pnp/manager.c @@ -451,7 +451,7 @@ int pnp_auto_config_dev(struct pnp_dev * return -EINVAL; if(!pnp_can_configure(dev)) { - pnp_info("Device %s does not support resource configuration.", dev->dev.bus_id); + pnp_dbg("Device %s does not support resource configuration.", dev->dev.bus_id); return -ENODEV; } @@ -482,7 +482,7 @@ int pnp_auto_config_dev(struct pnp_dev * int pnp_start_dev(struct pnp_dev *dev) { if (!pnp_can_write(dev)) { - pnp_info("Device %s does not support activation.", dev->dev.bus_id); + pnp_dbg("Device %s does not support activation.", dev->dev.bus_id); return -EINVAL; } @@ -506,7 +506,7 @@ int pnp_start_dev(struct pnp_dev *dev) int pnp_stop_dev(struct pnp_dev *dev) { if (!pnp_can_disable(dev)) { - pnp_info("Device %s does not support disabling.", dev->dev.bus_id); + pnp_dbg("Device %s does not support disabling.", dev->dev.bus_id); return -EINVAL; } if (dev->protocol->disable(dev)<0) { Index: linux/drivers/pnp/pnpbios/core.c =================================================================== --- linux.orig/drivers/pnp/pnpbios/core.c +++ linux/drivers/pnp/pnpbios/core.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include Index: linux/drivers/s390/cio/qdio.c =================================================================== --- linux.orig/drivers/s390/cio/qdio.c +++ linux/drivers/s390/cio/qdio.c @@ -210,9 +210,11 @@ again: goto again; } if (rc < 0) { - QDIO_DBF_TEXT3(1,trace,"sqberr"); - sprintf(dbf_text,"%2x,%2x,%d,%d",tmp_cnt,*cnt,ccq,q_no); - QDIO_DBF_TEXT3(1,trace,dbf_text); + QDIO_DBF_TEXT3(1,trace,"sqberr"); + sprintf(dbf_text,"%2x,%2x",tmp_cnt,*cnt); + QDIO_DBF_TEXT3(1,trace,dbf_text); + sprintf(dbf_text,"%d,%d",ccq,q_no); + QDIO_DBF_TEXT3(1,trace,dbf_text); q->handler(q->cdev,QDIO_STATUS_ACTIVATE_CHECK_CONDITION| QDIO_STATUS_LOOK_FOR_ERROR, 0, 0, 0, -1, -1, q->int_parm); @@ -1250,7 +1252,6 @@ qdio_is_inbound_q_done(struct qdio_q *q) if (!no_used) { QDIO_DBF_TEXT4(0,trace,"inqisdnA"); QDIO_DBF_HEX4(0,trace,&q,sizeof(void*)); - QDIO_DBF_TEXT4(0,trace,dbf_text); return 1; } if (irq->is_qebsm) { @@ -3371,10 +3372,15 @@ qdio_do_qdio_fill_input(struct qdio_q *q unsigned int count, struct qdio_buffer *buffers) { struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr; + int tmp = 0; + qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1); if (irq->is_qebsm) { - while (count) - set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count); + while (count) { + tmp = set_slsb(q, &qidx, SLSB_CU_INPUT_EMPTY, &count); + if (!tmp) + return; + } return; } for (;;) { @@ -3390,11 +3396,15 @@ qdio_do_qdio_fill_output(struct qdio_q * unsigned int count, struct qdio_buffer *buffers) { struct qdio_irq *irq = (struct qdio_irq *) q->irq_ptr; + int tmp = 0; qidx &= (QDIO_MAX_BUFFERS_PER_Q - 1); if (irq->is_qebsm) { - while (count) - set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count); + while (count) { + tmp = set_slsb(q, &qidx, SLSB_CU_OUTPUT_PRIMED, &count); + if (!tmp) + return; + } return; } Index: linux/drivers/s390/crypto/ap_bus.c =================================================================== --- linux.orig/drivers/s390/crypto/ap_bus.c +++ linux/drivers/s390/crypto/ap_bus.c 
@@ -65,6 +65,8 @@ module_param_named(poll_thread, ap_threa MODULE_PARM_DESC(poll_thread, "Turn on/off poll thread, default is 1 (on)."); static struct device *ap_root_device = NULL; +static DEFINE_SPINLOCK(ap_device_lock); +static LIST_HEAD(ap_device_list); /** * Workqueue & timer for bus rescan. @@ -457,6 +459,9 @@ static int ap_device_probe(struct device int rc; ap_dev->drv = ap_drv; + spin_lock_bh(&ap_device_lock); + list_add(&ap_dev->list, &ap_device_list); + spin_unlock_bh(&ap_device_lock); rc = ap_drv->probe ? ap_drv->probe(ap_dev) : -ENODEV; return rc; } @@ -497,6 +502,9 @@ static int ap_device_remove(struct devic ap_flush_queue(ap_dev); if (ap_drv->remove) ap_drv->remove(ap_dev); + spin_lock_bh(&ap_device_lock); + list_del_init(&ap_dev->list); + spin_unlock_bh(&ap_device_lock); return 0; } @@ -772,6 +780,7 @@ static void ap_scan_bus(struct work_stru spin_lock_init(&ap_dev->lock); INIT_LIST_HEAD(&ap_dev->pendingq); INIT_LIST_HEAD(&ap_dev->requestq); + INIT_LIST_HEAD(&ap_dev->list); if (device_type == 0) ap_probe_device_type(ap_dev); else @@ -1033,14 +1042,13 @@ static void ap_poll_timeout(unsigned lon * polling until bit 2^0 of the control flags is not set. If bit 2^1 * of the control flags has been set arm the poll timer. */ -static int __ap_poll_all(struct device *dev, void *data) +static int __ap_poll_all(struct ap_device *ap_dev, unsigned long *flags) { - struct ap_device *ap_dev = to_ap_dev(dev); int rc; spin_lock(&ap_dev->lock); if (!ap_dev->unregistered) { - rc = ap_poll_queue(to_ap_dev(dev), (unsigned long *) data); + rc = ap_poll_queue(ap_dev, flags); if (rc) ap_dev->unregistered = 1; } else @@ -1054,10 +1062,15 @@ static int __ap_poll_all(struct device * static void ap_poll_all(unsigned long dummy) { unsigned long flags; + struct ap_device *ap_dev; do { flags = 0; - bus_for_each_dev(&ap_bus_type, NULL, &flags, __ap_poll_all); + spin_lock(&ap_device_lock); + list_for_each_entry(ap_dev, &ap_device_list, list) { + __ap_poll_all(ap_dev, &flags); + } + spin_unlock(&ap_device_lock); } while (flags & 1); if (flags & 2) ap_schedule_poll_timer(); @@ -1075,6 +1088,7 @@ static int ap_poll_thread(void *data) DECLARE_WAITQUEUE(wait, current); unsigned long flags; int requests; + struct ap_device *ap_dev; set_user_nice(current, 19); while (1) { @@ -1092,10 +1106,12 @@ static int ap_poll_thread(void *data) set_current_state(TASK_RUNNING); remove_wait_queue(&ap_poll_wait, &wait); - local_bh_disable(); flags = 0; - bus_for_each_dev(&ap_bus_type, NULL, &flags, __ap_poll_all); - local_bh_enable(); + spin_lock_bh(&ap_device_lock); + list_for_each_entry(ap_dev, &ap_device_list, list) { + __ap_poll_all(ap_dev, &flags); + } + spin_unlock_bh(&ap_device_lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&ap_poll_wait, &wait); Index: linux/drivers/s390/crypto/ap_bus.h =================================================================== --- linux.orig/drivers/s390/crypto/ap_bus.h +++ linux/drivers/s390/crypto/ap_bus.h @@ -106,6 +106,7 @@ struct ap_device { struct device device; struct ap_driver *drv; /* Pointer to AP device driver. */ spinlock_t lock; /* Per device lock. */ + struct list_head list; /* private list of all AP devices. */ ap_qid_t qid; /* AP queue id. 
*/ int queue_depth; /* AP queue depth.*/ Index: linux/drivers/s390/crypto/zcrypt_api.c =================================================================== --- linux.orig/drivers/s390/crypto/zcrypt_api.c +++ linux/drivers/s390/crypto/zcrypt_api.c @@ -298,14 +298,14 @@ static long zcrypt_rsa_modexpo(struct ic get_device(&zdev->ap_dev->device); zdev->request_count++; __zcrypt_decrease_preference(zdev); - spin_unlock_bh(&zcrypt_device_lock); if (try_module_get(zdev->ap_dev->drv->driver.owner)) { + spin_unlock_bh(&zcrypt_device_lock); rc = zdev->ops->rsa_modexpo(zdev, mex); + spin_lock_bh(&zcrypt_device_lock); module_put(zdev->ap_dev->drv->driver.owner); } else rc = -EAGAIN; - spin_lock_bh(&zcrypt_device_lock); zdev->request_count--; __zcrypt_increase_preference(zdev); put_device(&zdev->ap_dev->device); @@ -373,14 +373,14 @@ static long zcrypt_rsa_crt(struct ica_rs get_device(&zdev->ap_dev->device); zdev->request_count++; __zcrypt_decrease_preference(zdev); - spin_unlock_bh(&zcrypt_device_lock); if (try_module_get(zdev->ap_dev->drv->driver.owner)) { + spin_unlock_bh(&zcrypt_device_lock); rc = zdev->ops->rsa_modexpo_crt(zdev, crt); + spin_lock_bh(&zcrypt_device_lock); module_put(zdev->ap_dev->drv->driver.owner); } else rc = -EAGAIN; - spin_lock_bh(&zcrypt_device_lock); zdev->request_count--; __zcrypt_increase_preference(zdev); put_device(&zdev->ap_dev->device); @@ -408,14 +408,14 @@ static long zcrypt_send_cprb(struct ica_ get_device(&zdev->ap_dev->device); zdev->request_count++; __zcrypt_decrease_preference(zdev); - spin_unlock_bh(&zcrypt_device_lock); if (try_module_get(zdev->ap_dev->drv->driver.owner)) { + spin_unlock_bh(&zcrypt_device_lock); rc = zdev->ops->send_cprb(zdev, xcRB); + spin_lock_bh(&zcrypt_device_lock); module_put(zdev->ap_dev->drv->driver.owner); } else rc = -EAGAIN; - spin_lock_bh(&zcrypt_device_lock); zdev->request_count--; __zcrypt_increase_preference(zdev); put_device(&zdev->ap_dev->device); Index: linux/drivers/scsi/aacraid/aacraid.h =================================================================== --- linux.orig/drivers/scsi/aacraid/aacraid.h +++ linux/drivers/scsi/aacraid/aacraid.h @@ -750,7 +750,7 @@ struct aac_fib_context { u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link context's into a linked list - struct semaphore wait_sem; // this is used to wait for the next fib to arrive. + struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attachd hw_fibs @@ -820,7 +820,7 @@ struct fib { * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. 
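+ * (It is signalled from the adapter interrupt path, which is assumed + * to be why the -rt patch keeps it a compat_semaphore instead of + * letting it become a PI mutex.)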
*/ - struct semaphore event_wait; + struct compat_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ Index: linux/drivers/scsi/qla2xxx/qla_def.h =================================================================== --- linux.orig/drivers/scsi/qla2xxx/qla_def.h +++ linux/drivers/scsi/qla2xxx/qla_def.h @@ -2341,7 +2341,7 @@ typedef struct scsi_qla_host { #define MBX_UPDATE_FLASH_ACTIVE 3 struct semaphore mbx_cmd_sem; /* Serialize mbx access */ - struct semaphore mbx_intr_sem; /* Used for completion notification */ + struct compat_semaphore mbx_intr_sem; /* Used for completion notification */ uint32_t mbx_flags; #define MBX_IN_PROGRESS BIT_0 Index: linux/drivers/serial/8250.c =================================================================== --- linux.orig/drivers/serial/8250.c +++ linux/drivers/serial/8250.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1392,7 +1393,7 @@ static irqreturn_t serial8250_interrupt( l = l->next; - if (l == i->head && pass_counter++ > PASS_LIMIT) { + if (!paravirt_enabled() && l == i->head && pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. */ printk(KERN_ERR "serial8250: too much work for " "irq%d\n", irq); @@ -2357,14 +2358,10 @@ serial8250_console_write(struct console touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2386,8 +2383,7 @@ serial8250_console_write(struct console serial_out(up, UART_IER, ier); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init serial8250_console_setup(struct console *co, char *options) Index: linux/drivers/serial/sh-sci.c =================================================================== --- linux.orig/drivers/serial/sh-sci.c +++ linux/drivers/serial/sh-sci.c @@ -17,6 +17,9 @@ * License. See the file "COPYING" in the main directory of this archive * for more details.
*/ +#if defined(CONFIG_SERIAL_SH_SCI_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) +#define SUPPORT_SYSRQ +#endif #undef DEBUG @@ -49,11 +52,6 @@ #endif #include - -#if defined(CONFIG_SERIAL_SH_SCI_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) -#define SUPPORT_SYSRQ -#endif - #include "sh-sci.h" struct sci_port { @@ -645,6 +643,9 @@ static inline int sci_handle_breaks(stru struct tty_struct *tty = port->info->tty; struct sci_port *s = &sci_ports[port->line]; + if (uart_handle_break(port)) + return 0; + if (!s->break_flag && status & SCxSR_BRK(port)) { #if defined(CONFIG_CPU_SH3) /* Debounce break */ Index: linux/drivers/spi/at25.c =================================================================== --- linux.orig/drivers/spi/at25.c +++ linux/drivers/spi/at25.c @@ -291,7 +291,7 @@ static int at25_probe(struct spi_device */ sr = spi_w8r8(spi, AT25_RDSR); if (sr < 0 || sr & AT25_SR_nRDY) { - dev_dbg(&at25->spi->dev, "rdsr --> %d (%02x)\n", sr, sr); + dev_dbg(&spi->dev, "rdsr --> %d (%02x)\n", sr, sr); err = -ENXIO; goto fail; } Index: linux/drivers/spi/atmel_spi.c =================================================================== --- linux.orig/drivers/spi/atmel_spi.c +++ linux/drivers/spi/atmel_spi.c @@ -425,7 +425,7 @@ static int atmel_spi_setup(struct spi_de if (ret) return ret; spi->controller_state = (void *)npcs_pin; - gpio_direction_output(npcs_pin); + gpio_direction_output(npcs_pin, !(spi->mode & SPI_CS_HIGH)); } dev_dbg(&spi->dev, Index: linux/drivers/spi/spi_bitbang.c =================================================================== --- linux.orig/drivers/spi/spi_bitbang.c +++ linux/drivers/spi/spi_bitbang.c @@ -302,10 +302,6 @@ static void bitbang_work(struct work_str setup_transfer = NULL; list_for_each_entry (t, &m->transfers, transfer_list) { - if (bitbang->shutdown) { - status = -ESHUTDOWN; - break; - } /* override or restore speed and wordsize */ if (t->speed_hz || t->bits_per_word) { @@ -410,8 +406,6 @@ int spi_bitbang_transfer(struct spi_devi m->status = -EINPROGRESS; bitbang = spi_master_get_devdata(spi->master); - if (bitbang->shutdown) - return -ESHUTDOWN; spin_lock_irqsave(&bitbang->lock, flags); if (!spi->max_speed_hz) @@ -507,28 +501,12 @@ EXPORT_SYMBOL_GPL(spi_bitbang_start); */ int spi_bitbang_stop(struct spi_bitbang *bitbang) { - unsigned limit = 500; - - spin_lock_irq(&bitbang->lock); - bitbang->shutdown = 0; - while (!list_empty(&bitbang->queue) && limit--) { - spin_unlock_irq(&bitbang->lock); + spi_unregister_master(bitbang->master); - dev_dbg(bitbang->master->cdev.dev, "wait for queue\n"); - msleep(10); - - spin_lock_irq(&bitbang->lock); - } - spin_unlock_irq(&bitbang->lock); - if (!list_empty(&bitbang->queue)) { - dev_err(bitbang->master->cdev.dev, "queue didn't empty\n"); - return -EBUSY; - } + WARN_ON(!list_empty(&bitbang->queue)); destroy_workqueue(bitbang->workqueue); - spi_unregister_master(bitbang->master); - return 0; } EXPORT_SYMBOL_GPL(spi_bitbang_stop); Index: linux/drivers/spi/spi_s3c24xx.c =================================================================== --- linux.orig/drivers/spi/spi_s3c24xx.c +++ linux/drivers/spi/spi_s3c24xx.c @@ -41,7 +41,7 @@ struct s3c24xx_spi { int len; int count; - int (*set_cs)(struct s3c2410_spi_info *spi, + void (*set_cs)(struct s3c2410_spi_info *spi, int cs, int pol); /* data buffers */ Index: linux/drivers/usb/class/usblp.c =================================================================== --- linux.orig/drivers/usb/class/usblp.c +++ linux/drivers/usb/class/usblp.c @@ -202,6 +202,7 @@ struct quirk_printer_struct { #define 
USBLP_QUIRK_BIDIR 0x1 /* reports bidir but requires unidirectional mode (no INs/reads) */ #define USBLP_QUIRK_USB_INIT 0x2 /* needs vendor USB init string */ +#define USBLP_QUIRK_BAD_CLASS 0x4 /* descriptor uses vendor-specific Class or SubClass */ static const struct quirk_printer_struct quirk_printers[] = { { 0x03f0, 0x0004, USBLP_QUIRK_BIDIR }, /* HP DeskJet 895C */ @@ -218,6 +219,7 @@ static const struct quirk_printer_struct { 0x0409, 0xf0be, USBLP_QUIRK_BIDIR }, /* NEC Picty920 (HP OEM) */ { 0x0409, 0xf1be, USBLP_QUIRK_BIDIR }, /* NEC Picty800 (HP OEM) */ { 0x0482, 0x0010, USBLP_QUIRK_BIDIR }, /* Kyocera Mita FS 820, by zut */ + { 0x04b8, 0x0202, USBLP_QUIRK_BAD_CLASS }, /* Seiko Epson Receipt Printer M129C */ { 0, 0 } }; @@ -1048,7 +1050,8 @@ static int usblp_select_alts(struct usbl ifd = &if_alt->altsetting[i]; if (ifd->desc.bInterfaceClass != 7 || ifd->desc.bInterfaceSubClass != 1) - continue; + if (!(usblp->quirks & USBLP_QUIRK_BAD_CLASS)) + continue; if (ifd->desc.bInterfaceProtocol < USBLP_FIRST_PROTOCOL || ifd->desc.bInterfaceProtocol > USBLP_LAST_PROTOCOL) @@ -1232,6 +1235,7 @@ static struct usb_device_id usblp_ids [] { USB_INTERFACE_INFO(7, 1, 1) }, { USB_INTERFACE_INFO(7, 1, 2) }, { USB_INTERFACE_INFO(7, 1, 3) }, + { USB_DEVICE(0x04b8, 0x0202) }, /* Seiko Epson Receipt Printer M129C */ { } /* Terminating entry */ }; Index: linux/drivers/usb/core/devio.c =================================================================== --- linux.orig/drivers/usb/core/devio.c +++ linux/drivers/usb/core/devio.c @@ -309,10 +309,11 @@ static void async_completed(struct urb * struct async *as = urb->context; struct dev_state *ps = as->ps; struct siginfo sinfo; + unsigned long flags; - spin_lock(&ps->lock); - list_move_tail(&as->asynclist, &ps->async_completed); - spin_unlock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); + list_move_tail(&as->asynclist, &ps->async_completed); + spin_unlock_irqrestore(&ps->lock, flags); if (as->signr) { sinfo.si_signo = as->signr; sinfo.si_errno = as->urb->status; Index: linux/drivers/usb/core/hcd.c =================================================================== --- linux.orig/drivers/usb/core/hcd.c +++ linux/drivers/usb/core/hcd.c @@ -517,13 +517,11 @@ error: } /* any errors get returned through the urb completion */ - local_irq_save (flags); - spin_lock (&urb->lock); + spin_lock_irqsave(&urb->lock, flags); if (urb->status == -EINPROGRESS) urb->status = status; - spin_unlock (&urb->lock); + spin_unlock_irqrestore(&urb->lock, flags); usb_hcd_giveback_urb (hcd, urb); - local_irq_restore (flags); return 0; } @@ -551,8 +549,7 @@ void usb_hcd_poll_rh_status(struct usb_h if (length > 0) { /* try to complete the status urb */ - local_irq_save (flags); - spin_lock(&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); urb = hcd->status_urb; if (urb) { spin_lock(&urb->lock); @@ -568,14 +565,13 @@ void usb_hcd_poll_rh_status(struct usb_h spin_unlock(&urb->lock); } else length = 0; - spin_unlock(&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); /* local irqs are always blocked in completions */ if (length > 0) usb_hcd_giveback_urb (hcd, urb); else hcd->poll_pending = 1; - local_irq_restore (flags); } /* The USB 2.0 spec says 256 ms. 
This is close enough and won't @@ -647,17 +643,15 @@ static int usb_rh_urb_dequeue (struct us } else { /* Status URB */ if (!hcd->uses_new_polling) del_timer (&hcd->rh_timer); - local_irq_save (flags); - spin_lock (&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); if (urb == hcd->status_urb) { hcd->status_urb = NULL; urb->hcpriv = NULL; } else urb = NULL; /* wasn't fully queued */ - spin_unlock (&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); if (urb) usb_hcd_giveback_urb (hcd, urb); - local_irq_restore (flags); } return 0; @@ -1176,11 +1170,9 @@ void usb_hcd_endpoint_disable (struct us WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT && udev->state != USB_STATE_NOTATTACHED); - local_irq_disable (); - /* ep is already gone from udev->ep_{in,out}[]; no more submits */ rescan: - spin_lock (&hcd_data_lock); + spin_lock_irq(&hcd_data_lock); list_for_each_entry (urb, &ep->urb_list, urb_list) { int tmp; @@ -1188,13 +1180,13 @@ rescan: if (urb->status != -EINPROGRESS) continue; usb_get_urb (urb); - spin_unlock (&hcd_data_lock); + spin_unlock_irq(&hcd_data_lock); - spin_lock (&urb->lock); + spin_lock_irq(&urb->lock); tmp = urb->status; if (tmp == -EINPROGRESS) urb->status = -ESHUTDOWN; - spin_unlock (&urb->lock); + spin_unlock_irq(&urb->lock); /* kick hcd unless it's already returning this */ if (tmp == -EINPROGRESS) { @@ -1217,8 +1209,7 @@ rescan: /* list contents may have changed */ goto rescan; } - spin_unlock (&hcd_data_lock); - local_irq_enable (); + spin_unlock_irq(&hcd_data_lock); /* synchronize with the hardware, so old configuration state * clears out immediately (and will be freed). Index: linux/drivers/usb/core/message.c =================================================================== --- linux.orig/drivers/usb/core/message.c +++ linux/drivers/usb/core/message.c @@ -255,8 +255,9 @@ static void sg_clean (struct usb_sg_requ static void sg_complete (struct urb *urb) { struct usb_sg_request *io = urb->context; + unsigned long flags; - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -290,7 +291,7 @@ static void sg_complete (struct urb *urb * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
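+ * (The switch to the irqsave variants in this callback reflects that + * on PREEMPT_RT URB completions run in thread context, so local + * interrupts can no longer be assumed to be disabled here.)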
*/ - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs [i] || !io->urbs [i]->dev) continue; @@ -305,7 +306,7 @@ static void sg_complete (struct urb *urb } else if (urb == io->urbs [i]) found = 1; } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } urb->dev = NULL; @@ -315,7 +316,7 @@ static void sg_complete (struct urb *urb if (!io->count) complete (&io->complete); - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); } @@ -577,7 +578,7 @@ void usb_sg_cancel (struct usb_sg_reques dev_warn (&io->dev->dev, "%s, unlink --> %d\n", __FUNCTION__, retval); } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } spin_unlock_irqrestore (&io->lock, flags); } Index: linux/drivers/usb/misc/berry_charge.c =================================================================== --- linux.orig/drivers/usb/misc/berry_charge.c +++ linux/drivers/usb/misc/berry_charge.c @@ -69,7 +69,7 @@ static int magic_charge(struct usb_devic return retval; } - dbg(&udev->dev, "Sending first magic command\n"); + dbg(&udev->dev, "Sending second magic command\n"); retval = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0xa2, 0x40, 0, 1, dummy_buffer, 0, 100); if (retval != 0) { Index: linux/drivers/usb/net/dm9601.c =================================================================== --- linux.orig/drivers/usb/net/dm9601.c +++ linux/drivers/usb/net/dm9601.c @@ -578,6 +578,14 @@ static const struct usb_device_id produc USB_DEVICE(0x0a46, 0x9601), /* Davicom USB-100 */ .driver_info = (unsigned long)&dm9601_info, }, + { + USB_DEVICE(0x0a46, 0x6688), /* ZT6688 USB NIC */ + .driver_info = (unsigned long)&dm9601_info, + }, + { + USB_DEVICE(0x0a46, 0x0268), /* ShanTou ST268 USB NIC */ + .driver_info = (unsigned long)&dm9601_info, + }, {}, // END }; Index: linux/drivers/usb/net/usbnet.c =================================================================== --- linux.orig/drivers/usb/net/usbnet.c +++ linux/drivers/usb/net/usbnet.c @@ -898,6 +898,8 @@ static void tx_complete (struct urb *urb urb->dev = NULL; entry->state = tx_done; + spin_lock_rt(&dev->txq.lock); + spin_unlock_rt(&dev->txq.lock); defer_bh(dev, skb, &dev->txq); } Index: linux/drivers/usb/serial/airprime.c =================================================================== --- linux.orig/drivers/usb/serial/airprime.c +++ linux/drivers/usb/serial/airprime.c @@ -18,10 +18,6 @@ static struct usb_device_id id_table [] = { { USB_DEVICE(0x0c88, 0x17da) }, /* Kyocera Wireless KPC650/Passport */ - { USB_DEVICE(0x1410, 0x1110) }, /* Novatel Wireless Merlin CDMA */ - { USB_DEVICE(0x1410, 0x1130) }, /* Novatel Wireless S720 CDMA/EV-DO */ - { USB_DEVICE(0x1410, 0x2110) }, /* Novatel Wireless U720 CDMA/EV-DO */ - { USB_DEVICE(0x1410, 0x1430) }, /* Novatel Merlin XU870 HSDPA/3G */ { USB_DEVICE(0x1410, 0x1100) }, /* ExpressCard34 Qualcomm 3G CDMA */ { USB_DEVICE(0x413c, 0x8115) }, /* Dell Wireless HSDPA 5500 */ { }, Index: linux/drivers/usb/serial/mos7720.c =================================================================== --- linux.orig/drivers/usb/serial/mos7720.c +++ linux/drivers/usb/serial/mos7720.c @@ -1628,6 +1628,7 @@ static struct usb_serial_driver moschip7 .chars_in_buffer = mos7720_chars_in_buffer, .break_ctl = mos7720_break, .read_bulk_callback = mos7720_bulk_in_callback, + .read_int_callback = mos7720_interrupt_callback, }; static int __init moschip7720_init(void) Index: linux/drivers/usb/serial/option.c 
=================================================================== --- linux.orig/drivers/usb/serial/option.c +++ linux/drivers/usb/serial/option.c @@ -109,7 +109,6 @@ static int option_send_setup(struct usb #define HUAWEI_PRODUCT_E220 0x1003 #define NOVATELWIRELESS_VENDOR_ID 0x1410 -#define NOVATELWIRELESS_PRODUCT_U740 0x1400 #define ANYDATA_VENDOR_ID 0x16d5 #define ANYDATA_PRODUCT_ID 0x6501 @@ -152,7 +151,18 @@ static struct usb_device_id option_ids[] { USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_ETNA_KOI_NETWORK) }, { USB_DEVICE(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E600) }, { USB_DEVICE(HUAWEI_VENDOR_ID, HUAWEI_PRODUCT_E220) }, - { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID,NOVATELWIRELESS_PRODUCT_U740) }, + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1100) }, /* Novatel Merlin XS620/S640 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1110) }, /* Novatel Merlin S620 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1120) }, /* Novatel Merlin EX720 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1130) }, /* Novatel Merlin S720 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1400) }, /* Novatel U730 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1410) }, /* Novatel U740 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1420) }, /* Novatel EU870 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x1430) }, /* Novatel Merlin XU870 HSDPA/3G */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x2100) }, /* Novatel EV620 CDMA/EV-DO */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x2110) }, /* Novatel Merlin ES620 / Merlin ES720 / Ovation U720 */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x2130) }, /* Novatel Merlin ES620 SM Bus */ + { USB_DEVICE(NOVATELWIRELESS_VENDOR_ID, 0x2410) }, /* Novatel EU740 */ { USB_DEVICE(ANYDATA_VENDOR_ID, ANYDATA_PRODUCT_ID) }, { } /* Terminating entry */ }; Index: linux/drivers/usb/serial/usb-serial.c =================================================================== --- linux.orig/drivers/usb/serial/usb-serial.c +++ linux/drivers/usb/serial/usb-serial.c @@ -138,6 +138,11 @@ static void destroy_serial(struct kref * dbg("%s - %s", __FUNCTION__, serial->type->description); + serial->type->shutdown(serial); + + /* return the minor range that this device had */ + return_serial(serial); + for (i = 0; i < serial->num_ports; ++i) serial->port[i]->open_count = 0; @@ -148,12 +153,6 @@ static void destroy_serial(struct kref * serial->port[i] = NULL; } - if (serial->type->shutdown) - serial->type->shutdown(serial); - - /* return the minor range that this device had */ - return_serial(serial); - /* If this is a "fake" port, we have to clean it up here, as it will * not get cleaned up in port_release() as it was never registered with * the driver core */ Index: linux/drivers/usb/storage/unusual_devs.h =================================================================== --- linux.orig/drivers/usb/storage/unusual_devs.h +++ linux/drivers/usb/storage/unusual_devs.h @@ -1411,6 +1411,16 @@ UNUSUAL_DEV( 0x22b8, 0x3010, 0x0001, 0x US_SC_DEVICE, US_PR_DEVICE, NULL, US_FL_FIX_CAPACITY | US_FL_IGNORE_RESIDUE ), +/* + * Patch by Pete Zaitcev + * Report by Mark Patton. Red Hat bz#208928.
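+ * US_FL_FIX_CAPACITY: the device reports a last-sector count that is + * off by one, so the capacity read from it is adjusted down.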
+ */ +UNUSUAL_DEV( 0x22b8, 0x4810, 0x0001, 0x0001, + "Motorola", + "RAZR V3i", + US_SC_DEVICE, US_PR_DEVICE, NULL, + US_FL_FIX_CAPACITY), + /* Reported by Radovan Garabik */ UNUSUAL_DEV( 0x2735, 0x100b, 0x0000, 0x9999, "MPIO", Index: linux/drivers/usb/storage/usb.h =================================================================== --- linux.orig/drivers/usb/storage/usb.h +++ linux/drivers/usb/storage/usb.h @@ -147,7 +147,7 @@ struct us_data { dma_addr_t iobuf_dma; /* mutual exclusion and synchronization structures */ - struct semaphore sema; /* to sleep thread on */ + struct compat_semaphore sema; /* to sleep thread on */ struct completion notify; /* thread begin/end */ wait_queue_head_t delay_wait; /* wait during scan, reset */ Index: linux/drivers/video/Kconfig =================================================================== --- linux.orig/drivers/video/Kconfig +++ linux/drivers/video/Kconfig @@ -1320,7 +1320,7 @@ config FB_AU1100 config FB_AU1200 bool "Au1200 LCD Driver" - depends on FB && MIPS && SOC_AU1200 + depends on (FB = y) && MIPS && SOC_AU1200 select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -1470,7 +1470,7 @@ config FB_G364 config FB_68328 bool "Motorola 68328 native frame buffer support" - depends on FB && (M68328 || M68EZ328 || M68VZ328) + depends on (FB = y) && (M68328 || M68EZ328 || M68VZ328) select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT @@ -1616,7 +1616,7 @@ config FB_IBM_GXT4500 config FB_PS3 bool "PS3 GPU framebuffer driver" - depends on FB && PS3_PS3AV + depends on (FB = y) && PS3_PS3AV select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT Index: linux/drivers/video/console/fbcon.c =================================================================== --- linux.orig/drivers/video/console/fbcon.c +++ linux/drivers/video/console/fbcon.c @@ -1246,7 +1246,6 @@ static void fbcon_clear(struct vc_data * { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1275,10 +1274,11 @@ static void fbcon_putcs(struct vc_data * struct display *p = &fb_display[vc->vc_num]; struct fbcon_ops *ops = info->fbcon_par; - if (!fbcon_is_inactive(vc, info)) + if (!fbcon_is_inactive(vc, info)) { ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, get_color(vc, info, scr_readw(s), 1), get_color(vc, info, scr_readw(s), 0)); + } } static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) @@ -3091,6 +3091,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { Index: linux/drivers/video/console/vgacon.c =================================================================== --- linux.orig/drivers/video/console/vgacon.c +++ linux/drivers/video/console/vgacon.c @@ -51,7 +51,7 @@ #include