Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -1637,6 +1637,12 @@ running once the system is up.
 	time		Show timing data prefixed to each printk message line
 
+	timeout_granularity=
+			[KNL]
+			Timeout granularity: process timer wheel timers every
+			timeout_granularity jiffies. Defaults to 1 (process
+			timers HZ times per second, the most fine-grained setting).
+
 	clocksource=	[GENERIC_TIME] Override the default clocksource
 			Override the default clocksource and use the
 			clocksource with the name specified.
 
Index: linux/Makefile
===================================================================
--- linux.orig/Makefile
+++ linux/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 18
-EXTRAVERSION =
+EXTRAVERSION = -rt4
 NAME=Avast! A bilge rat!
 
 # *DOCUMENTATION*
@@ -485,10 +485,14 @@ endif
 
 include $(srctree)/arch/$(ARCH)/Makefile
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS		+= -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
 else
-CFLAGS		+= -fomit-frame-pointer
+  ifdef CONFIG_FRAME_POINTER
+    CFLAGS	+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+  else
+    CFLAGS	+= -fomit-frame-pointer
+  endif
 endif
 
 ifdef CONFIG_UNWIND_INFO
Index: linux/arch/arm/Kconfig
===================================================================
--- linux.orig/arch/arm/Kconfig
+++ linux/arch/arm/Kconfig
@@ -17,6 +17,10 @@ config ARM
 	  Europe.  There is an ARM Linux project with a web page at
 	  <http://www.arm.linux.org.uk/>.
 
+config GENERIC_TIME
+	bool
+	default y
+
 config MMU
 	bool
 	default y
@@ -51,6 +55,18 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config STACKTRACE_SUPPORT
+	bool
+	default y
+
+config LOCKDEP_SUPPORT
+	bool
+	default y
+
+config TRACE_IRQFLAGS_SUPPORT
+	bool
+	default y
+
 config HARDIRQS_SW_RESEND
 	bool
 	default y
@@ -344,6 +360,15 @@ source "arch/arm/mach-at91rm9200/Kconfig
 
 source "arch/arm/mach-netx/Kconfig"
 
+config IS_TICK_BASED
+	bool
+	depends on GENERIC_TIME
+	default y
+	help
+	  This is used on platforms that have not added a clocksource to
+	  support GENERIC_TIME.  Platforms which have a clocksource
+	  should set this to 'n' in their mach-*/Kconfig.
+
 # Definitions to make life easier
 config ARCH_ACORN
 	bool
@@ -419,6 +444,8 @@ endmenu
 
 menu "Kernel Features"
 
+source "kernel/time/Kconfig"
+
 config SMP
 	bool "Symmetric Multi-Processing (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && REALVIEW_MPCORE
@@ -463,38 +490,7 @@ config LOCAL_TIMERS
 	  accounting to be spread across the timer interval, preventing a
 	  "thundering herd" at every timer tick.
 
-config PREEMPT
-	bool "Preemptible Kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
-
-config NO_IDLE_HZ
-	bool "Dynamic tick timer"
-	help
-	  Select this option if you want to disable continuous timer ticks
-	  and have them programmed to occur as required. This option saves
-	  power as the system can remain in idle state for longer.
- - By default dynamic tick is disabled during the boot, and can be - manually enabled with: - - echo 1 > /sys/devices/system/timer/timer0/dyn_tick - - Alternatively, if you want dynamic tick automatically enabled - during boot, pass "dyntick=enable" via the kernel command string. - - Please note that dynamic tick may affect the accuracy of - timekeeping on some platforms depending on the implementation. - Currently at least OMAP, PXA2xx and SA11x0 platforms are known - to have accurate timekeeping with dynamic tick. +source kernel/Kconfig.preempt config HZ int Index: linux/arch/arm/boot/compressed/head.S =================================================================== --- linux.orig/arch/arm/boot/compressed/head.S +++ linux/arch/arm/boot/compressed/head.S @@ -231,7 +231,8 @@ not_relocated: mov r0, #0 */ cmp r4, r2 bhs wont_overwrite - add r0, r4, #4096*1024 @ 4MB largest kernel size + sub r3, r3, r5 @ compressed kernel size + add r0, r4, r3, lsl #2 @ allow for 4x expansion cmp r0, r5 bls wont_overwrite @@ -822,6 +823,19 @@ memdump: mov r12, r0 mov pc, r10 #endif +#ifdef CONFIG_MCOUNT +/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this + * trampoline + */ + .text + .align 0 + .type mcount %function + .global mcount +mcount: + mov pc, lr @ just return +#endif + + reloc_end: .align Index: linux/arch/arm/common/time-acorn.c =================================================================== --- linux.orig/arch/arm/common/time-acorn.c +++ linux/arch/arm/common/time-acorn.c @@ -77,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i static struct irqaction ioc_timer_irq = { .name = "timer", - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .handler = ioc_timer_interrupt }; Index: linux/arch/arm/kernel/dma.c =================================================================== --- linux.orig/arch/arm/kernel/dma.c +++ linux/arch/arm/kernel/dma.c @@ -20,7 +20,7 @@ #include -DEFINE_SPINLOCK(dma_spin_lock); +DEFINE_RAW_SPINLOCK(dma_spin_lock); EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; Index: linux/arch/arm/kernel/entry-armv.S =================================================================== --- linux.orig/arch/arm/kernel/entry-armv.S +++ linux/arch/arm/kernel/entry-armv.S @@ -191,6 +191,9 @@ __dabt_svc: __irq_svc: svc_entry +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -201,7 +204,7 @@ __irq_svc: irq_handler #ifdef CONFIG_PREEMPT ldr r0, [tsk, #TI_FLAGS] @ get flags - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED blne svc_preempt preempt_return: ldr r0, [tsk, #TI_PREEMPT] @ read preempt value @@ -211,6 +214,10 @@ preempt_return: #endif ldr r0, [sp, #S_PSR] @ irqs are already disabled msr spsr_cxsf, r0 +#ifdef CONFIG_TRACE_IRQFLAGS + tst r0, #PSR_I_BIT + bleq trace_hardirqs_on +#endif ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr .ltorg @@ -228,7 +235,7 @@ svc_preempt: str r7, [tsk, #TI_PREEMPT] @ expects preempt_count == 0 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED beq preempt_return @ go again b 1b #endif @@ -398,6 +405,9 @@ __dabt_usr: __irq_usr: usr_entry +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif get_thread_info tsk #ifdef CONFIG_PREEMPT ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -412,6 +422,9 @@ __irq_usr: teq 
r0, r7 strne r0, [r0, -r0] #endif +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_on +#endif mov why, #0 b ret_to_user Index: linux/arch/arm/kernel/entry-common.S =================================================================== --- linux.orig/arch/arm/kernel/entry-common.S +++ linux/arch/arm/kernel/entry-common.S @@ -3,6 +3,8 @@ * * Copyright (C) 2000 Russell King * + * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. @@ -40,7 +42,7 @@ ret_fast_syscall: fast_work_pending: str r0, [sp, #S_R0+S_OFF]! @ returned r0 work_pending: - tst r1, #_TIF_NEED_RESCHED + tst r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED bne work_resched tst r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING beq no_work_pending @@ -50,7 +52,8 @@ work_pending: b ret_slow_syscall @ Check work again work_resched: - bl schedule + bl __schedule + /* * "slow" syscall return path. "why" tells us if this was a real syscall. */ @@ -387,6 +390,112 @@ ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETE +#endif + +#ifdef CONFIG_FRAME_POINTER + +#ifdef CONFIG_MCOUNT +/* + * At the point where we are in mcount() we maintain the + * frame of the prologue code and keep the call to mcount() + * out of the stack frame list: + + saved pc <---\ caller of instrumented routine + saved lr | + ip/prev_sp | + fp -----^ | + : | + | + -> saved pc | instrumented routine + | saved lr | + | ip/prev_sp | + | fp ---------/ + | : + | + | mcount + | saved pc + | saved lr + | ip/prev sp + -- fp + r3 + r2 + r1 + sp-> r0 + : + */ + + .text + .align 0 + .type mcount %function + .global mcount + +/* gcc -pg generated FUNCTION_PROLOGUE references mcount() + * and has already created the stack frame invocation for + * the routine we have been called to instrument. We create + * a complete frame nevertheless, as we want to use the same + * call to mcount() from c code. + */ +mcount: + + ldr ip, =mcount_enabled @ leave early, if disabled + ldr ip, [ip] + cmp ip, #0 + moveq pc, lr + + mov ip, sp + stmdb sp!, {r0 - r3, fp, ip, lr, pc} @ create stack frame + + ldr r1, [fp, #-4] @ get lr (the return address + @ of the caller of the + @ instrumented function) + mov r0, lr @ get lr - (the return address + @ of the instrumented function) + + sub fp, ip, #4 @ point fp at this frame + + bl __trace +1: + ldmdb fp, {r0 - r3, fp, sp, pc} @ pop entry frame and return + +#endif + +/* ARM replacement for unsupported gcc __builtin_return_address(n) + * where 0 < n. n == 0 is supported here as well. + * + * Walk up the stack frame until the desired frame is found or a NULL + * fp is encountered, return NULL in the latter case. + * + * Note: it is possible under code optimization for the stack invocation + * of an ancestor function (level N) to be removed before calling a + * descendant function (level N+1). No easy means is available to deduce + * this scenario with the result being [for example] caller_addr(0) when + * called from level N+1 returning level N-1 rather than the expected + * level N. This optimization issue appears isolated to the case of + * a call to a level N+1 routine made at the tail end of a level N + * routine -- the level N frame is deleted and a simple branch is made + * to the level N+1 routine. 
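The walk that arm_return_addr performs is simple enough to model in plain C. The sketch below is illustrative only: all names are made up, a synthetic frame chain stands in for a real APCS stack so it runs on any host, and the byte offsets are expressed in words (on ARM a word is 4 bytes, giving the [fp - 4] and [fp - 12] loads used by the assembly that follows).

#include <stdio.h>

#define WORD sizeof(unsigned long)

static unsigned long walk_return_addr(unsigned long fp, int level)
{
	while (fp) {
		if (level-- == 0)
			/* [fp - 4] on ARM: the saved lr */
			return *(unsigned long *)(fp - 1 * WORD);
		/* [fp - 12] on ARM: the caller's fp */
		fp = *(unsigned long *)(fp - 3 * WORD);
	}
	return 0; /* fell off the end of the frame list */
}

int main(void)
{
	/* two fake frames, each { fp, sp, lr, pc }, addressed via
	 * fp = &frame[3] as on ARM/APCS */
	unsigned long outer[4] = { 0, 0, 0x1111, 0 };
	unsigned long inner[4] = { (unsigned long)&outer[3], 0, 0x2222, 0 };
	unsigned long fp = (unsigned long)&inner[3];

	printf("level 0: %#lx\n", walk_return_addr(fp, 0)); /* 0x2222 */
	printf("level 1: %#lx\n", walk_return_addr(fp, 1)); /* 0x1111 */
	return 0;
}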
+ */ + + .text + .align 0 + .type arm_return_addr %function + .global arm_return_addr + +arm_return_addr: + mov ip, r0 + mov r0, fp +3: + cmp r0, #0 + beq 1f @ frame list hit end, bail + cmp ip, #0 + beq 2f @ reached desired frame + ldr r0, [r0, #-12] @ else continue, get next fp + sub ip, ip, #1 + b 3b +2: + ldr r0, [r0, #-4] @ get target return address +1: + mov pc, lr #endif Index: linux/arch/arm/kernel/fiq.c =================================================================== --- linux.orig/arch/arm/kernel/fiq.c +++ linux/arch/arm/kernel/fiq.c @@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign * disable irqs for the duration. Note - these functions are almost * entirely coded in assembly. */ -void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( @@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs : "r" (®s->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE)); } -void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) +void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs) { register unsigned long tmp; asm volatile ( Index: linux/arch/arm/kernel/head.S =================================================================== --- linux.orig/arch/arm/kernel/head.S +++ linux/arch/arm/kernel/head.S @@ -234,18 +234,19 @@ __create_page_tables: /* * Now setup the pagetables for our kernel direct - * mapped region. We round TEXTADDR down to the - * nearest megabyte boundary. It is assumed that - * the kernel fits within 4 contigous 1MB sections. + * mapped region. */ add r0, r4, #(TEXTADDR & 0xff000000) >> 18 @ start of kernel str r3, [r0, #(TEXTADDR & 0x00f00000) >> 18]! - add r3, r3, #1 << 20 - str r3, [r0, #4]! @ KERNEL + 1MB - add r3, r3, #1 << 20 - str r3, [r0, #4]! @ KERNEL + 2MB - add r3, r3, #1 << 20 - str r3, [r0, #4] @ KERNEL + 3MB + + ldr r6, =(_end - PAGE_OFFSET) + sub r6, r6, #1 @ r6 = number of sections + mov r6, r6, lsr #20 @ needed for kernel minus 1 + +1: add r3, r3, #1 << 20 + str r3, [r0, #4]! + subs r6, r6, #1 + bgt 1b /* * Then map first 1MB of ram in case it contains our boot params. Index: linux/arch/arm/kernel/irq.c =================================================================== --- linux.orig/arch/arm/kernel/irq.c +++ linux/arch/arm/kernel/irq.c @@ -101,7 +101,7 @@ unlock: /* Handle bad interrupts */ static struct irq_desc bad_irq_desc = { .handle_irq = handle_bad_irq, - .lock = SPIN_LOCK_UNLOCKED + .lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock) }; /* @@ -109,10 +109,12 @@ static struct irq_desc bad_irq_desc = { * come via this function. Instead, they should provide their * own 'handler' */ -asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) +asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs) { struct irqdesc *desc = irq_desc + irq; + trace_special(instruction_pointer(regs), irq, 0); + /* * Some hardware gives randomly wrong interrupts. Rather * than crashing, do something sensible. 
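One detail in the head.S hunk above deserves a worked example: the new loop sizes the direct mapping from the image instead of assuming four 1MB sections. r6 is loaded with ((_end - PAGE_OFFSET) - 1) >> 20, i.e. "sections needed minus one", and because the subs/bgt loop is bottom-tested it always maps at least one section beyond the first. A hosted sketch of that arithmetic (helper name is hypothetical):

#include <stdio.h>

#define SECTION_SHIFT 20	/* one ARM section covers 1MB */

static unsigned int extra_sections(unsigned long kernel_bytes)
{
	long n = (long)((kernel_bytes - 1) >> SECTION_SHIFT);
	unsigned int mapped = 0;

	/* do/while mirrors the asm's subs/bgt: the body runs before
	 * the test, so even a sub-1MB image maps one extra section */
	do {
		mapped++;
	} while (--n > 0);

	return mapped;
}

int main(void)
{
	unsigned long sizes[] = { 1UL << 20, (1UL << 20) + 1, 4UL << 20 };
	int i;

	for (i = 0; i < 3; i++)
		printf("%8lu bytes -> 1 + %u section(s)\n",
		       sizes[i], extra_sections(sizes[i]));
	return 0;
}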
Index: linux/arch/arm/kernel/process.c =================================================================== --- linux.orig/arch/arm/kernel/process.c +++ linux/arch/arm/kernel/process.c @@ -123,7 +123,7 @@ static void default_idle(void) cpu_relax(); else { local_irq_disable(); - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { timer_dyn_reprogram(); arch_idle(); } @@ -154,12 +154,20 @@ void cpu_idle(void) if (!idle) idle = default_idle; leds_event(led_idle_start); - while (!need_resched()) - idle(); + + if (!need_resched() && !need_resched_delayed() && + !hrtimer_stop_sched_tick()) { + while (!need_resched() && !need_resched_delayed()) + idle(); + } + hrtimer_restart_sched_tick(); + leds_event(led_idle_end); - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux/arch/arm/kernel/semaphore.c =================================================================== --- linux.orig/arch/arm/kernel/semaphore.c +++ linux/arch/arm/kernel/semaphore.c @@ -49,14 +49,16 @@ * we cannot lose wakeup events. */ -void __up(struct semaphore *sem) +fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem) { wake_up(&sem->wait); } +EXPORT_SYMBOL(__compat_up); + static DEFINE_SPINLOCK(semaphore_lock); -void __sched __down(struct semaphore * sem) +fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +EXPORT_SYMBOL(__compat_down); + +fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -140,6 +144,8 @@ int __sched __down_interruptible(struct return retval; } +EXPORT_SYMBOL(__compat_down_interruptible); + /* * Trylock failed - make sure we correct for * having decremented the count. @@ -148,7 +154,7 @@ int __sched __down_interruptible(struct * single "cmpxchg" without failure cases, * but then it wouldn't work on a 386. */ -int __down_trylock(struct semaphore * sem) +fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem) { int sleepers; unsigned long flags; @@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se return 1; } +EXPORT_SYMBOL(__compat_down_trylock); + +fastcall int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} + +EXPORT_SYMBOL(compat_sem_is_locked); + /* * The semaphore operations have a special calling sequence that * allow us to do a simpler in-line version of them. 
These routines @@ -185,7 +200,7 @@ asm(" .section .sched.text,\"ax\",%progb __down_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down \n\ + bl __compat_down \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ .align 5 \n\ @@ -193,7 +208,7 @@ __down_failed: \n\ __down_interruptible_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_interruptible \n\ + bl __compat_down_interruptible \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -202,7 +217,7 @@ __down_interruptible_failed: \n\ __down_trylock_failed: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __down_trylock \n\ + bl __compat_down_trylock \n\ mov ip, r0 \n\ ldmfd sp!, {r0 - r4, pc} \n\ \n\ @@ -211,7 +226,7 @@ __down_trylock_failed: \n\ __up_wakeup: \n\ stmfd sp!, {r0 - r4, lr} \n\ mov r0, ip \n\ - bl __up \n\ + bl __compat_up \n\ ldmfd sp!, {r0 - r4, pc} \n\ "); Index: linux/arch/arm/kernel/signal.c =================================================================== --- linux.orig/arch/arm/kernel/signal.c +++ linux/arch/arm/kernel/signal.c @@ -630,6 +630,14 @@ static int do_signal(sigset_t *oldset, s siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/arm/kernel/smp.c =================================================================== --- linux.orig/arch/arm/kernel/smp.c +++ linux/arch/arm/kernel/smp.c @@ -515,7 +515,7 @@ static void ipi_call_function(unsigned i cpu_clear(cpu, data->unfinished); } -static DEFINE_SPINLOCK(stop_lock); +static DEFINE_RAW_SPINLOCK(stop_lock); /* * ipi_cpu_stop - handle IPI from smp_send_stop() Index: linux/arch/arm/kernel/time.c =================================================================== --- linux.orig/arch/arm/kernel/time.c +++ linux/arch/arm/kernel/time.c @@ -69,10 +69,12 @@ EXPORT_SYMBOL(profile_pc); */ int (*set_rtc)(void); +#ifdef CONFIG_IS_TICK_BASED static unsigned long dummy_gettimeoffset(void) { return 0; } +#endif /* * Scheduler clock - returns current time in nanosec units. @@ -84,34 +86,10 @@ unsigned long long __attribute__((weak)) return (unsigned long long)jiffies * (1000000000 / HZ); } -static unsigned long next_rtc_update; - -/* - * If we have an externally synchronized linux clock, then update - * CMOS clock accordingly every ~11 minutes. set_rtc() has to be - * called as close as possible to 500 ms before the new second - * starts. - */ -static inline void do_set_rtc(void) +void sync_persistent_clock(struct timespec ts) { - if (!ntp_synced() || set_rtc == NULL) - return; - - if (next_rtc_update && - time_before((unsigned long)xtime.tv_sec, next_rtc_update)) - return; - - if (xtime.tv_nsec < 500000000 - ((unsigned) tick_nsec >> 1) && - xtime.tv_nsec >= 500000000 + ((unsigned) tick_nsec >> 1)) - return; - - if (set_rtc()) - /* - * rtc update failed. 
Try again in 60s - */ - next_rtc_update = xtime.tv_sec + 60; - else - next_rtc_update = xtime.tv_sec + 660; + if (set_rtc) + set_rtc(); } #ifdef CONFIG_LEDS @@ -230,68 +208,6 @@ static inline void do_leds(void) #define do_leds() #endif -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - unsigned long seq; - unsigned long usec, sec, lost; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - usec = system_timer->offset(); - - lost = jiffies - wall_jiffies; - if (lost) - usec += lost * USECS_PER_JIFFY; - - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - /* usec may have gone up a lot: be safe */ - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * done, and then undo it! - */ - nsec -= system_timer->offset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * TICK_NSEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - /** * save_time_delta - Save the offset between system time and RTC time * @delta: pointer to timespec to store delta @@ -332,7 +248,6 @@ void timer_tick(struct pt_regs *regs) { profile_tick(CPU_PROFILING, regs); do_leds(); - do_set_rtc(); do_timer(regs); #ifndef CONFIG_SMP update_process_times(user_mode(regs)); @@ -500,8 +415,10 @@ device_initcall(timer_init_sysfs); void __init time_init(void) { +#ifdef CONFIG_IS_TICK_BASED if (system_timer->offset == NULL) system_timer->offset = dummy_gettimeoffset; +#endif system_timer->init(); #ifdef CONFIG_NO_IDLE_HZ Index: linux/arch/arm/kernel/traps.c =================================================================== --- linux.orig/arch/arm/kernel/traps.c +++ linux/arch/arm/kernel/traps.c @@ -176,6 +176,7 @@ void dump_stack(void) { #ifdef CONFIG_DEBUG_ERRORS __backtrace(); + print_traces(current); #endif } @@ -191,7 +192,7 @@ void show_stack(struct task_struct *tsk, if (tsk != current) fp = thread_saved_fp(tsk); else - asm("mov%? %0, fp" : "=r" (fp)); + asm("mov %0, fp" : "=r" (fp) : : "cc"); c_backtrace(fp, 0x10); barrier(); @@ -216,7 +217,7 @@ static void __die(const char *str, int e } } -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); /* * This function is protected against re-entrancy. 
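The die_lock conversion just above (and undef_lock just below) follows the rule applied throughout this patch: under PREEMPT_RT a plain spinlock_t becomes a sleeping, rtmutex-backed lock, so any lock that can be taken where sleeping is forbidden (hard-irq context, the die path, low-level platform code) must become a raw_spinlock_t, which keeps true busy-wait semantics. As a userspace model of what "raw" means here (this is not the kernel implementation, just an illustration built on C11 atomics with made-up names):

#include <stdatomic.h>
#include <stdio.h>

typedef struct {
	atomic_flag locked;
} demo_raw_spinlock_t;

static void demo_raw_spin_lock(demo_raw_spinlock_t *l)
{
	/* pure busy-wait with acquire ordering: no scheduler, no sleep */
	while (atomic_flag_test_and_set_explicit(&l->locked,
						 memory_order_acquire))
		;
}

static void demo_raw_spin_unlock(demo_raw_spinlock_t *l)
{
	atomic_flag_clear_explicit(&l->locked, memory_order_release);
}

int main(void)
{
	demo_raw_spinlock_t l = { ATOMIC_FLAG_INIT };

	demo_raw_spin_lock(&l);
	puts("in critical section");
	demo_raw_spin_unlock(&l);
	return 0;
}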
@@ -252,7 +253,7 @@ void notify_die(const char *str, struct
 }
 
 static LIST_HEAD(undef_hook);
-static DEFINE_SPINLOCK(undef_lock);
+static DEFINE_RAW_SPINLOCK(undef_lock);
 
 void register_undef_hook(struct undef_hook *hook)
 {
Index: linux/arch/arm/lib/Makefile
===================================================================
--- linux.orig/arch/arm/lib/Makefile
+++ linux/arch/arm/lib/Makefile
@@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC)		+= ecard.o io-ac
 lib-$(CONFIG_ARCH_CLPS7500)	+= io-acorn.o
 lib-$(CONFIG_ARCH_L7200)	+= io-acorn.o
 lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o
+lib-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
Index: linux/arch/arm/lib/stacktrace.c
===================================================================
--- /dev/null
+++ linux/arch/arm/lib/stacktrace.c
@@ -0,0 +1,77 @@
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+struct stackframe {
+	unsigned long fp;
+	unsigned long sp;
+	unsigned long lr;
+	unsigned long pc;
+};
+
+int walk_stackframe(unsigned long fp, unsigned long low, unsigned long high,
+		    int (*fn)(struct stackframe *, void *), void *data)
+{
+	struct stackframe *frame;
+
+	do {
+		/*
+		 * Check current frame pointer is within bounds
+		 */
+		if ((fp - 12) < low || fp + 4 >= high)
+			break;
+
+		frame = (struct stackframe *)(fp - 12);
+
+		if (fn(frame, data))
+			break;
+
+		/*
+		 * Update the low bound - the next frame must always
+		 * be at a higher address than the current frame.
+		 */
+		low = fp + 4;
+		fp = frame->fp;
+	} while (fp);
+
+	return 0;
+}
+
+struct stack_trace_data {
+	struct stack_trace *trace;
+	unsigned int skip;
+};
+
+static int save_trace(struct stackframe *frame, void *d)
+{
+	struct stack_trace_data *data = d;
+	struct stack_trace *trace = data->trace;
+
+	if (data->skip) {
+		data->skip--;
+		return 0;
+	}
+
+	trace->entries[trace->nr_entries++] = frame->lr;
+
+	return trace->nr_entries >= trace->max_entries;
+}
+
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task,
+		      int all_contexts, unsigned int skip)
+{
+	struct stack_trace_data data;
+	unsigned long fp, base;
+
+	data.trace = trace;
+	data.skip = skip;
+
+	if (task) {
+		base = (unsigned long)task_stack_page(task);
+		fp = 0;
+	} else {
+		base = (unsigned long)task_stack_page(current);
+		asm("mov %0, fp" : "=r" (fp));
+	}
+
+	walk_stackframe(fp, base, base + THREAD_SIZE, save_trace, &data);
+}
Index: linux/arch/arm/mach-footbridge/netwinder-hw.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-hw.c
+++ linux/arch/arm/mach-footbridge/netwinder-hw.c
@@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int
 /*
  * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE
  */
-DEFINE_SPINLOCK(gpio_lock);
+DEFINE_RAW_SPINLOCK(gpio_lock);
 
 static unsigned int current_gpio_op;
 static unsigned int current_gpio_io;
Index: linux/arch/arm/mach-footbridge/netwinder-leds.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-leds.c
+++ linux/arch/arm/mach-footbridge/netwinder-leds.c
@@ -32,7 +32,7 @@ static char led_state;
 static char hw_led_state;
 
 static DEFINE_SPINLOCK(leds_lock);
-extern spinlock_t gpio_lock;
+extern raw_spinlock_t gpio_lock;
 
 static void netwinder_leds_event(led_event_t evt)
 {
Index: linux/arch/arm/mach-integrator/core.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/core.c
+++ 
linux/arch/arm/mach-integrator/core.c @@ -164,7 +164,7 @@ static struct amba_pl010_data integrator #define CM_CTRL IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET -static DEFINE_SPINLOCK(cm_lock); +static DEFINE_RAW_SPINLOCK(cm_lock); /** * cm_control - update the CM_CTRL register. Index: linux/arch/arm/mach-integrator/pci_v3.c =================================================================== --- linux.orig/arch/arm/mach-integrator/pci_v3.c +++ linux/arch/arm/mach-integrator/pci_v3.c @@ -162,7 +162,7 @@ * 7:2 register number * */ -static DEFINE_SPINLOCK(v3_lock); +static DEFINE_RAW_SPINLOCK(v3_lock); #define PCI_BUS_NONMEM_START 0x00000000 #define PCI_BUS_NONMEM_SIZE SZ_256M Index: linux/arch/arm/mach-integrator/platsmp.c =================================================================== --- linux.orig/arch/arm/mach-integrator/platsmp.c +++ linux/arch/arm/mach-integrator/platsmp.c @@ -31,7 +31,7 @@ extern void integrator_secondary_startup volatile int __cpuinitdata pen_release = -1; unsigned long __cpuinitdata phys_pen_release = 0; -static DEFINE_SPINLOCK(boot_lock); +static DEFINE_RAW_SPINLOCK(boot_lock); void __cpuinit platform_secondary_init(unsigned int cpu) { Index: linux/arch/arm/mach-ixp4xx/Kconfig =================================================================== --- linux.orig/arch/arm/mach-ixp4xx/Kconfig +++ linux/arch/arm/mach-ixp4xx/Kconfig @@ -1,5 +1,9 @@ if ARCH_IXP4XX +config IS_TICK_BASED + bool + default n + config ARCH_SUPPORTS_BIG_ENDIAN bool default y Index: linux/arch/arm/mach-ixp4xx/common-pci.c =================================================================== --- linux.orig/arch/arm/mach-ixp4xx/common-pci.c +++ linux/arch/arm/mach-ixp4xx/common-pci.c @@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0; * these transactions are atomic or we will end up * with corrupt data on the bus or in a driver. 
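A small arithmetic note on the timekeeping rework that follows: the CLOCK_TICKS_PER_USEC macro deleted below uses the classic round-to-nearest division idiom, adding half the divisor before dividing. A standalone sketch (the constant matches the 66.66MHz OSTS clock used further down; the macro name here is made up):

#include <stdio.h>

#define DIV_ROUND_NEAREST(a, b) (((a) + (b) / 2) / (b))

int main(void)
{
	unsigned long clock_tick_rate = 66660000;	/* 66.66 MHz */
	unsigned long usec_per_sec = 1000000;

	/* 66.66 ticks per usec rounds up to 67 instead of
	 * truncating to 66 */
	printf("ticks/usec = %lu\n",
	       DIV_ROUND_NEAREST(clock_tick_rate, usec_per_sec));
	return 0;
}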
*/ -static DEFINE_SPINLOCK(ixp4xx_pci_lock); +static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock); /* * Read from PCI config space Index: linux/arch/arm/mach-ixp4xx/common.c =================================================================== --- linux.orig/arch/arm/mach-ixp4xx/common.c +++ linux/arch/arm/mach-ixp4xx/common.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -38,6 +40,11 @@ #include #include +#ifdef CONFIG_HIGH_RES_TIMERS +static int __init ixp4xx_clockevent_init(void); +static struct clock_event clockevent_ixp4xx; +#endif + /************************************************************************* * IXP4xx chipset I/O mapping *************************************************************************/ @@ -253,25 +260,17 @@ void __init ixp4xx_init_irq(void) static unsigned volatile last_jiffy_time; -#define CLOCK_TICKS_PER_USEC ((CLOCK_TICK_RATE + USEC_PER_SEC/2) / USEC_PER_SEC) - -/* IRQs are disabled before entering here from do_gettimeofday() */ -static unsigned long ixp4xx_gettimeoffset(void) -{ - u32 elapsed; - - elapsed = *IXP4XX_OSTS - last_jiffy_time; - - return elapsed / CLOCK_TICKS_PER_USEC; -} - static irqreturn_t ixp4xx_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - write_seqlock(&xtime_lock); - /* Clear Pending Interrupt by writing '1' to it */ *IXP4XX_OSST = IXP4XX_OSST_TIMER_1_PEND; +#ifdef CONFIG_HIGH_RES_TIMERS + if (clockevent_ixp4xx.event_handler) + clockevent_ixp4xx.event_handler(regs); +#else + write_seqlock(&xtime_lock); + /* * Catch up with the real idea of time */ @@ -281,6 +280,7 @@ static irqreturn_t ixp4xx_timer_interrup } write_sequnlock(&xtime_lock); +#endif return IRQ_HANDLED; } @@ -299,17 +299,18 @@ static void __init ixp4xx_timer_init(voi /* Setup the Timer counter value */ *IXP4XX_OSRT1 = (LATCH & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE; - /* Reset time-stamp counter */ - *IXP4XX_OSTS = 0; last_jiffy_time = 0; /* Connect the interrupt handler and enable the interrupt */ setup_irq(IRQ_IXP4XX_TIMER1, &ixp4xx_timer_irq); + +#ifdef CONFIG_HIGH_RES_TIMERS + ixp4xx_clockevent_init(); +#endif } struct sys_timer ixp4xx_timer = { .init = ixp4xx_timer_init, - .offset = ixp4xx_gettimeoffset, }; static struct resource ixp46x_i2c_resources[] = { @@ -365,3 +366,70 @@ void __init ixp4xx_sys_init(void) ixp4xx_exp_bus_size >> 20); } +cycle_t ixp4xx_get_cycles(void) +{ + return *IXP4XX_OSTS; +} + +static struct clocksource clocksource_ixp4xx = { + .name = "OSTS", + .rating = 200, + .read = ixp4xx_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .is_continuous = 1, +}; + +static int __init ixp4xx_clocksource_init(void) +{ + /* Reset time-stamp counter */ + *IXP4XX_OSTS = 0; + + clocksource_ixp4xx.mult = + clocksource_khz2mult(66660, clocksource_ixp4xx.shift); + clocksource_register(&clocksource_ixp4xx); + + return 0; +} +device_initcall(ixp4xx_clocksource_init); + +#ifdef CONFIG_HIGH_RES_TIMERS +static u32 clockevent_mode = 0; + +static void ixp4xx_set_next_event(unsigned long evt, + struct clock_event *unused) +{ + u32 oneshot = (clockevent_mode == CLOCK_EVT_ONESHOT) ? 
+ IXP4XX_OST_ONE_SHOT : 0; + + *IXP4XX_OSRT1 = (evt & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE | + oneshot; +} + +static void ixp4xx_set_mode(int mode, struct clock_event *evt) +{ + clockevent_mode = mode; +} + +static struct clock_event clockevent_ixp4xx = { + .name = "ixp4xx timer1", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_TICK | + CLOCK_CAP_UPDATE | CLOCK_CAP_PROFILE, + .shift = 32, + .set_mode = ixp4xx_set_mode, + .set_next_event = ixp4xx_set_next_event, +}; + +static int __init ixp4xx_clockevent_init(void) +{ + clockevent_ixp4xx.mult = div_sc(FREQ, NSEC_PER_SEC, + clockevent_ixp4xx.shift); + clockevent_ixp4xx.max_delta_ns = + clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx); + clockevent_ixp4xx.min_delta_ns = + clockevent_delta2ns(0xf, &clockevent_ixp4xx); + register_local_clockevent(&clockevent_ixp4xx); + + return 0; +} +#endif Index: linux/arch/arm/mach-omap1/pm.c =================================================================== --- linux.orig/arch/arm/mach-omap1/pm.c +++ linux/arch/arm/mach-omap1/pm.c @@ -120,7 +120,7 @@ void omap_pm_idle(void) local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; Index: linux/arch/arm/mach-omap2/pm.c =================================================================== --- linux.orig/arch/arm/mach-omap2/pm.c +++ linux/arch/arm/mach-omap2/pm.c @@ -53,7 +53,7 @@ void omap2_pm_idle(void) { local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; Index: linux/arch/arm/mach-sa1100/badge4.c =================================================================== --- linux.orig/arch/arm/mach-sa1100/badge4.c +++ linux/arch/arm/mach-sa1100/badge4.c @@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i /* detect on->off and off->on transitions */ if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { /* was off, now on */ - printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); GPSR = BADGE4_GPIO_PCMEN5V; } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { /* was on, now off */ - printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); GPCR = BADGE4_GPIO_PCMEN5V; } local_irq_restore(flags); + + /* detect on->off and off->on transitions */ + if ((!old_5V_bitmap) && (badge4_5V_bitmap)) { + /* was off, now on */ + printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__); + } else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) { + /* was on, now off */ + printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__); + } } EXPORT_SYMBOL(badge4_set_5V); Index: linux/arch/arm/mach-shark/leds.c =================================================================== --- linux.orig/arch/arm/mach-shark/leds.c +++ linux/arch/arm/mach-shark/leds.c @@ -32,7 +32,7 @@ static char led_state; static short hw_led_state; static short saved_state; -static DEFINE_SPINLOCK(leds_lock); +static DEFINE_RAW_SPINLOCK(leds_lock); short sequoia_read(int addr) { outw(addr,0x24); Index: linux/arch/arm/mach-versatile/Kconfig =================================================================== --- linux.orig/arch/arm/mach-versatile/Kconfig +++ linux/arch/arm/mach-versatile/Kconfig @@ -1,6 +1,10 @@ menu "Versatile platform type" depends on ARCH_VERSATILE +config IS_TICK_BASED + bool + default n + config ARCH_VERSATILE_PB bool "Support Versatile/PB platform" default y Index: linux/arch/arm/mach-versatile/core.c 
=================================================================== --- linux.orig/arch/arm/mach-versatile/core.c +++ linux/arch/arm/mach-versatile/core.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include #include @@ -808,59 +810,50 @@ void __init versatile_init(void) #define TICKS2USECS(x) ((x) / TICKS_PER_uSEC) #endif -/* - * Returns number of ms since last clock interrupt. Note that interrupts - * will have been disabled by do_gettimeoffset() - */ -static unsigned long versatile_gettimeoffset(void) +#ifdef CONFIG_HIGH_RES_TIMERS +static void timer_set_mode(int mode, struct clock_event *clk) { - unsigned long ticks1, ticks2, status; - - /* - * Get the current number of ticks. Note that there is a race - * condition between us reading the timer and checking for - * an interrupt. We get around this by ensuring that the - * counter has not reloaded between our two reads. - */ - ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff; - do { - ticks1 = ticks2; - status = __raw_readl(VA_IC_BASE + VIC_RAW_STATUS); - ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff; - } while (ticks2 > ticks1); - - /* - * Number of ticks since last interrupt. - */ - ticks1 = TIMER_RELOAD - ticks2; - - /* - * Interrupt pending? If so, we've reloaded once already. - * - * FIXME: Need to check this is effectively timer 0 that expires - */ - if (status & IRQMASK_TIMERINT0_1) - ticks1 += TIMER_RELOAD; + if (mode == CLOCK_EVT_PERIODIC) { + writel(TIMER_CTRL_PERIODIC | TIMER_CTRL_32BIT | TIMER_CTRL_IE | + TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL); + } else { + writel(TIMER_CTRL_ONESHOT | TIMER_CTRL_32BIT | TIMER_CTRL_IE | + TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL); + } +} - /* - * Convert the ticks to usecs - */ - return TICKS2USECS(ticks1); +static void timer_set_next_event(unsigned long evt, struct clock_event *unused) +{ + BUG_ON(!evt); + writel(evt, TIMER0_VA_BASE + TIMER_LOAD); } +static struct clock_event timer0_clock = { + .name = "timer0", + .shift = 32, + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_UPDATE | + CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE, + .set_mode = timer_set_mode, + .set_next_event = timer_set_next_event, +}; +#endif + /* * IRQ handler for the timer */ static irqreturn_t versatile_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - write_seqlock(&xtime_lock); - // ...clear the interrupt writel(1, TIMER0_VA_BASE + TIMER_INTCLR); +#ifdef CONFIG_HIGH_RES_TIMERS + if (timer0_clock.event_handler) + timer0_clock.event_handler(regs); +#else + write_seqlock(&xtime_lock); timer_tick(regs); - write_sequnlock(&xtime_lock); +#endif return IRQ_HANDLED; } @@ -893,11 +886,20 @@ static void __init versatile_timer_init( /* * Initialise to a known state (all timers off) */ - writel(0, TIMER0_VA_BASE + TIMER_CTRL); + writel(0, TIMER0_VA_BASE + TIMER_CTRL); writel(0, TIMER1_VA_BASE + TIMER_CTRL); writel(0, TIMER2_VA_BASE + TIMER_CTRL); writel(0, TIMER3_VA_BASE + TIMER_CTRL); +#ifdef CONFIG_HIGH_RES_TIMERS + timer0_clock.mult = div_sc(1000000, NSEC_PER_SEC, timer0_clock.shift); + timer0_clock.max_delta_ns = + clockevent_delta2ns(0xffffffff, &timer0_clock); + timer0_clock.min_delta_ns = + clockevent_delta2ns(0xf, &timer0_clock); + register_global_clockevent(&timer0_clock); +#endif + writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_LOAD); writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_VALUE); writel(TIMER_DIVISOR | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC | @@ -911,5 +913,36 @@ static void __init versatile_timer_init( struct sys_timer versatile_timer = { .init = 
versatile_timer_init, - .offset = versatile_gettimeoffset, }; + +cycle_t versatile_get_cycles(void) +{ + return ~readl(TIMER3_VA_BASE + TIMER_VALUE); +} + +static struct clocksource clocksource_versatile = { + .name = "timer3", + .rating = 200, + .read = versatile_get_cycles, + .mask = 0xFFFFFFFF, + .shift = 20, + .is_continuous = 1, +}; + +static int __init versatile_clocksource_init(void) +{ + writel(0, TIMER3_VA_BASE + TIMER_CTRL); + writel(0xffffffff, TIMER3_VA_BASE + TIMER_LOAD); + writel(0xffffffff, TIMER3_VA_BASE + TIMER_VALUE); + writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC, + TIMER3_VA_BASE + TIMER_CTRL); + + clocksource_versatile.mult = + clocksource_khz2mult(1000, clocksource_versatile.shift); + clocksource_register(&clocksource_versatile); + + return 0; +} + +device_initcall(versatile_clocksource_init); + Index: linux/arch/arm/mm/consistent.c =================================================================== --- linux.orig/arch/arm/mm/consistent.c +++ linux/arch/arm/mm/consistent.c @@ -40,7 +40,7 @@ * These are the page tables (2MB each) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte[NUM_CONSISTENT_PTES]; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux/arch/arm/mm/copypage-v4mc.c =================================================================== --- linux.orig/arch/arm/mm/copypage-v4mc.c +++ linux/arch/arm/mm/copypage-v4mc.c @@ -29,7 +29,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * ARMv4 mini-dcache optimised copy_user_page @@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock); * instruction. If your processor does not supply this, you have to write your * own copy_user_page that does the right thing. */ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { asm volatile( @@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con /* * ARMv4 optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) v4_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/copypage-v6.c =================================================================== --- linux.orig/arch/arm/mm/copypage-v6.c +++ linux/arch/arm/mm/copypage-v6.c @@ -26,7 +26,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(v6_lock); +static DEFINE_RAW_SPINLOCK(v6_lock); /* * Copy the user page. No aliasing to deal with so we can just Index: linux/arch/arm/mm/copypage-xscale.c =================================================================== --- linux.orig/arch/arm/mm/copypage-xscale.c +++ linux/arch/arm/mm/copypage-xscale.c @@ -31,7 +31,7 @@ #define TOP_PTE(x) pte_offset_kernel(top_pmd, x) -static DEFINE_SPINLOCK(minicache_lock); +static DEFINE_RAW_SPINLOCK(minicache_lock); /* * XScale mini-dcache optimised copy_user_page @@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock); * Dcache aliasing issue. The writes will be forwarded to the write buffer, * and merged as appropriate. 
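The clocksource registered just above relies on the generic mult/shift conversion: mult is chosen so that ns = (cycles * mult) >> shift. The sketch below re-derives the versatile numbers (1000 kHz counter, shift 20, as in the hunk above); khz2mult() here is a simplified stand-in for the kernel's clocksource_khz2mult(), not the real implementation:

#include <stdio.h>
#include <stdint.h>

static uint32_t khz2mult(uint32_t khz, uint32_t shift)
{
	/* roughly ((10^6 << shift) / khz), rounded to nearest */
	uint64_t tmp = (uint64_t)1000000 << shift;

	tmp += khz / 2;
	return (uint32_t)(tmp / khz);
}

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	uint32_t mult = khz2mult(1000, 20);

	printf("mult = %u\n", mult);	/* 1048576000 */
	printf("1 cycle = %llu ns\n",	/* 1000: a 1 MHz tick is 1 us */
	       (unsigned long long)cyc2ns(1, mult, 20));
	return 0;
}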
*/ -static void __attribute__((naked)) +static void notrace __attribute__((naked)) mc_copy_user_page(void *from, void *to) { /* @@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto, /* * XScale optimised clear_user_page */ -void __attribute__((naked)) +void notrace __attribute__((naked)) xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr) { asm volatile( Index: linux/arch/arm/mm/fault.c =================================================================== --- linux.orig/arch/arm/mm/fault.c +++ linux/arch/arm/mm/fault.c @@ -215,7 +215,7 @@ out: return fault; } -static int +static notrace int do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk; @@ -315,7 +315,7 @@ no_context: * interrupt or a critical region, and should only copy the information * from the master page table, nothing more. */ -static int +static notrace int do_translation_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { @@ -361,7 +361,7 @@ bad_area: * Some section permission faults need to be handled gracefully. * They can happen due to a __{get,put}_user during an oops. */ -static int +static notrace int do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { struct task_struct *tsk = current; @@ -372,7 +372,7 @@ do_sect_fault(unsigned long addr, unsign /* * This abort handler always returns "fault". */ -static int +static notrace int do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { return 1; @@ -427,7 +427,7 @@ static struct fsr_info { { do_bad, SIGBUS, 0, "unknown 31" } }; -void __init +void __init notrace hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *), int sig, const char *name) { @@ -441,7 +441,7 @@ hook_fault_code(int nr, int (*fn)(unsign /* * Dispatch a data abort to the relevant handler. 
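Further down, do_DataAbort() picks its handler with fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6): the low four status bits select an entry, and FSR bit 10 is folded in as bit 4, a +16 offset into the second half of the 32-entry table. A quick standalone check of that bit arithmetic:

#include <stdio.h>

static unsigned int fsr_index(unsigned int fsr)
{
	/* bit 10 shifted right by 6 lands on bit 4, worth +16 */
	return (fsr & 15) + ((fsr & (1 << 10)) >> 6);
}

int main(void)
{
	printf("fsr 0x005 -> index %u\n", fsr_index(0x005)); /* 5  */
	printf("fsr 0x405 -> index %u\n", fsr_index(0x405)); /* 21 */
	return 0;
}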
*/ -asmlinkage void +asmlinkage notrace void do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) { const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6); @@ -460,7 +460,7 @@ do_DataAbort(unsigned long addr, unsigne notify_die("", regs, &info, fsr, 0); } -asmlinkage void +asmlinkage notrace void do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) { do_translation_fault(addr, 0, regs); Index: linux/arch/arm/mm/init.c =================================================================== --- linux.orig/arch/arm/mm/init.c +++ linux/arch/arm/mm/init.c @@ -25,7 +25,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end; Index: linux/arch/arm/plat-omap/clock.c =================================================================== --- linux.orig/arch/arm/plat-omap/clock.c +++ linux/arch/arm/plat-omap/clock.c @@ -29,7 +29,7 @@ static LIST_HEAD(clocks); static DEFINE_MUTEX(clocks_mutex); -static DEFINE_SPINLOCK(clockfw_lock); +static DEFINE_RAW_SPINLOCK(clockfw_lock); static struct clk_functions *arch_clock; Index: linux/arch/arm/plat-omap/dma.c =================================================================== --- linux.orig/arch/arm/plat-omap/dma.c +++ linux/arch/arm/plat-omap/dma.c @@ -949,7 +949,7 @@ static struct irqaction omap24xx_dma_irq /*----------------------------------------------------------------------------*/ static struct lcd_dma_info { - spinlock_t lock; + raw_spinlock_t lock; int reserved; void (* callback)(u16 status, void *data); void *cb_data; Index: linux/arch/arm/plat-omap/gpio.c =================================================================== --- linux.orig/arch/arm/plat-omap/gpio.c +++ linux/arch/arm/plat-omap/gpio.c @@ -120,7 +120,7 @@ struct gpio_bank { u32 reserved_map; u32 suspend_wakeup; u32 saved_wakeup; - spinlock_t lock; + raw_spinlock_t lock; }; #define METHOD_MPUIO 0 Index: linux/arch/arm/plat-omap/mux.c =================================================================== --- linux.orig/arch/arm/plat-omap/mux.c +++ linux/arch/arm/plat-omap/mux.c @@ -56,7 +56,7 @@ int __init omap_mux_register(struct pin_ */ int __init_or_module omap_cfg_reg(const unsigned long index) { - static DEFINE_SPINLOCK(mux_spin_lock); + static DEFINE_RAW_SPINLOCK(mux_spin_lock); unsigned long flags; struct pin_config *cfg; Index: linux/arch/arm/plat-omap/pm.c =================================================================== --- linux.orig/arch/arm/plat-omap/pm.c +++ linux/arch/arm/plat-omap/pm.c @@ -84,7 +84,7 @@ void omap_pm_idle(void) local_irq_disable(); local_fiq_disable(); - if (need_resched()) { + if (need_resched() || need_resched_delayed()) { local_fiq_enable(); local_irq_enable(); return; Index: linux/arch/h8300/Kconfig =================================================================== --- linux.orig/arch/h8300/Kconfig +++ linux/arch/h8300/Kconfig @@ -41,6 +41,10 @@ config GENERIC_CALIBRATE_DELAY bool default y +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y Index: linux/arch/h8300/kernel/time.c =================================================================== --- linux.orig/arch/h8300/kernel/time.c +++ linux/arch/h8300/kernel/time.c @@ -68,58 +68,6 @@ void time_init(void) platform_timer_setup(timer_interrupt); } -/* - * This version of gettimeofday has near microsecond resolution. 
- */ -void do_gettimeofday(struct timeval *tv) -{ - unsigned long flags; - unsigned long usec, sec; - - read_lock_irqsave(&xtime_lock, flags); - usec = 0; - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - read_unlock_irqrestore(&xtime_lock, flags); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_lock_irq(&xtime_lock); - /* This is revolting. We need to set the xtime.tv_usec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ - while (tv->tv_nsec < 0) { - tv->tv_nsec += NSEC_PER_SEC; - tv->tv_sec--; - } - - xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = tv->tv_nsec; - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - unsigned long long sched_clock(void) { return (unsigned long long)jiffies * (1000000000 / HZ); Index: linux/arch/i386/Kconfig =================================================================== --- linux.orig/arch/i386/Kconfig +++ linux/arch/i386/Kconfig @@ -65,6 +65,8 @@ source "init/Kconfig" menu "Processor type and features" +source "kernel/time/Kconfig" + config SMP bool "Symmetric multi-processing support" ---help--- @@ -261,6 +263,19 @@ config SCHED_MC source "kernel/Kconfig.preempt" +config RWSEM_GENERIC_SPINLOCK + bool + depends on M386 || PREEMPT_RT + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + default y if !RWSEM_GENERIC_SPINLOCK + config X86_UP_APIC bool "Local APIC support on uniprocessors" depends on !SMP && !(X86_VISWS || X86_VOYAGER) @@ -708,6 +723,7 @@ config BOOT_IOREMAP config REGPARM bool "Use register arguments" + depends on !MCOUNT default y help Compile the kernel with -mregparm=3. This instructs gcc to use @@ -791,6 +807,10 @@ config HOTPLUG_CPU enable suspend on SMP systems. CPUs can be controlled through /sys/devices/system/cpu. +config GENERIC_TIME_VSYSCALL + depends on EXPERIMENTAL + bool "VSYSCALL gettimeofday() interface" + config COMPAT_VDSO bool "Compat VDSO support" default y Index: linux/arch/i386/Kconfig.cpu =================================================================== --- linux.orig/arch/i386/Kconfig.cpu +++ linux/arch/i386/Kconfig.cpu @@ -235,11 +235,6 @@ config RWSEM_GENERIC_SPINLOCK depends on M386 default y -config RWSEM_XCHGADD_ALGORITHM - bool - depends on !M386 - default y - config GENERIC_CALIBRATE_DELAY bool default y Index: linux/arch/i386/Kconfig.debug =================================================================== --- linux.orig/arch/i386/Kconfig.debug +++ linux/arch/i386/Kconfig.debug @@ -22,6 +22,7 @@ config EARLY_PRINTK config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL + default y help This option will cause messages to be printed if free stack space drops below a certain limit. @@ -29,6 +30,7 @@ config DEBUG_STACKOVERFLOW config DEBUG_STACK_USAGE bool "Stack utilization instrumentation" depends on DEBUG_KERNEL + default y help Enables the display of the minimum amount of free stack which each task has ever had available in the sysrq-T and sysrq-P debug output. 
@@ -49,6 +51,7 @@ config DEBUG_PAGEALLOC config DEBUG_RODATA bool "Write protect kernel read-only data structures" depends on DEBUG_KERNEL + default y help Mark the kernel read-only data as write-protected in the pagetables, in order to catch accidental (and incorrect) writes to such const @@ -59,6 +62,7 @@ config DEBUG_RODATA config 4KSTACKS bool "Use 4Kb for kernel stacks instead of 8Kb" depends on DEBUG_KERNEL + default y help If you say Y here the kernel will use a 4Kb stacksize for the kernel stack attached to each process/thread. This facilitates Index: linux/arch/i386/boot/compressed/misc.c =================================================================== --- linux.orig/arch/i386/boot/compressed/misc.c +++ linux/arch/i386/boot/compressed/misc.c @@ -15,6 +15,12 @@ #include #include +#ifdef CONFIG_MCOUNT +void notrace mcount(void) +{ +} +#endif + /* * gzip declarations */ @@ -107,7 +113,7 @@ static long free_mem_end_ptr; #define INPLACE_MOVE_ROUTINE 0x1000 #define LOW_BUFFER_START 0x2000 #define LOW_BUFFER_MAX 0x90000 -#define HEAP_SIZE 0x3000 +#define HEAP_SIZE 0x4000 static unsigned int low_buffer_end, low_buffer_size; static int high_loaded =0; static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/; Index: linux/arch/i386/kernel/Makefile =================================================================== --- linux.orig/arch/i386/kernel/Makefile +++ linux/arch/i386/kernel/Makefile @@ -4,7 +4,7 @@ extra-y := head.o init_task.o vmlinux.lds -obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o \ +obj-y := process.o signal.o entry.o traps.o irq.o \ ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o bootflag.o \ quirks.o i8237.o topology.o alternative.o i8253.o tsc.o @@ -12,6 +12,7 @@ obj-y := process.o semaphore.o signal.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_GENERIC_TIME_VSYSCALL) += vsyscall-gtod.o obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca.o obj-$(CONFIG_X86_MSR) += msr.o @@ -20,6 +21,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp.o smpboot.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o +obj-$(CONFIG_MCOUNT) += mcount-wrapper.o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o @@ -30,6 +32,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-y += sysenter.o vsyscall.o obj-$(CONFIG_ACPI_SRAT) += srat.o obj-$(CONFIG_HPET_TIMER) += time_hpet.o Index: linux/arch/i386/kernel/acpi/boot.c =================================================================== --- linux.orig/arch/i386/kernel/acpi/boot.c +++ linux/arch/i386/kernel/acpi/boot.c @@ -53,8 +53,6 @@ static inline int acpi_madt_oem_check(ch #include #endif /* CONFIG_X86_LOCAL_APIC */ -static inline int gsi_irq_sharing(int gsi) { return gsi; } - #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -459,12 +457,7 @@ void __init acpi_pic_sci_set_trigger(uns int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { -#ifdef CONFIG_X86_IO_APIC - if (use_pci_vector() && !platform_legacy_irq(gsi)) - *irq = IO_APIC_VECTOR(gsi); - else -#endif - *irq = gsi_irq_sharing(gsi); + *irq = gsi; return 0; } @@ -575,6 +568,7 @@ static int __init acpi_parse_sbf(unsigne } #ifdef CONFIG_HPET_TIMER +#include static int __init acpi_parse_hpet(unsigned long 
phys, unsigned long size) { @@ -595,21 +589,13 @@ static int __init acpi_parse_hpet(unsign return -1; } #ifdef CONFIG_X86_64 - vxtime.hpet_address = hpet_tbl->addr.addrl | + hpet_address = hpet_tbl->addr.addrl | ((long)hpet_tbl->addr.addrh << 32); - +#else + hpet_address = hpet_tbl->addr.addrl; +#endif printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, vxtime.hpet_address); -#else /* X86 */ - { - extern unsigned long hpet_address; - - hpet_address = hpet_tbl->addr.addrl; - printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", - hpet_tbl->id, hpet_address); - } -#endif /* X86 */ - + hpet_tbl->id, hpet_address); return 0; } #else Index: linux/arch/i386/kernel/apic.c =================================================================== --- linux.orig/arch/i386/kernel/apic.c +++ linux/arch/i386/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -59,6 +60,23 @@ int enable_local_apic __initdata = 0; /* */ int apic_verbosity; +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); static void apic_pm_activate(void); @@ -909,6 +927,11 @@ fake_ioapic_page: */ /* + * FIXME: Move this to i8253.h. There is no need to keep the access to + * the PIT scattered all around the place -tglx + */ + +/* * The timer chip is already set up at HZ interrupts per second here, * but we do not accept timer interrupts yet. We only allow the BP * to calibrate. @@ -966,13 +989,15 @@ void (*wait_timer_tick)(void) __devinitd #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; if (!APIC_INTEGRATED(ver)) lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); @@ -989,23 +1014,31 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); +} + +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write_around(APIC_TMICT, delta); } -static void __devinit setup_APIC_timer(unsigned int clocks) +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); + local_irq_restore(flags); +} - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); - __setup_APIC_LVTT(clocks); + memcpy(levt, &lapic_clockevent, sizeof(*levt)); - local_irq_restore(flags); + register_local_clockevent(levt); } /* @@ -1014,6 +1047,8 @@ static void __devinit setup_APIC_timer(u * to calibrate, since some later bootup code depends on getting * the first irq? Ugh. 
* + * TODO: Fix this rather than saying "Ugh" -tglx + * * We want to do the calibration only once since we * want to have local timer irqs syncron. CPUs connected * by the same APIC bus have the very same bus frequency. @@ -1036,7 +1071,7 @@ static int __init calibrate_APIC_clock(v * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); /* * The timer chip counts down to zero. Let's wait @@ -1073,6 +1108,14 @@ static int __init calibrate_APIC_clock(v result = (tt1-tt2)*APIC_DIVISOR/LOOPS; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + if (cpu_has_tsc) apic_printk(APIC_VERBOSE, "..... CPU clock speed is " "%ld.%04ld MHz.\n", @@ -1087,8 +1130,6 @@ static int __init calibrate_APIC_clock(v return result; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock(void) { unsigned long flags; @@ -1101,14 +1142,14 @@ void __init setup_boot_APIC_clock(void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_restore(flags); } void __devinit setup_secondary_APIC_clock(void) { - setup_APIC_timer(calibration_result); + setup_APIC_timer(); } void disable_APIC_timer(void) @@ -1154,6 +1195,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_bcast_ipi)) { disable_APIC_timer(); cpu_set(cpu, timer_bcast_ipi); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -1190,6 +1238,8 @@ inline void smp_local_timer_interrupt(st update_process_times(user_mode_vm(regs)); #endif + trace_special(regs->eip, 0, 0); + /* * We take the 'long' return path, and there every subsystem * grabs the apropriate locks (kernel lock/ irq lock). @@ -1211,15 +1261,18 @@ inline void smp_local_timer_interrupt(st * interrupt as well. Thus we cannot inline the local irq ... ] */ -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs) { int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. */ per_cpu(irq_stat, cpu).apic_timer_irqs++; + trace_special(regs->eip, 0, 0); + /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. @@ -1231,7 +1284,15 @@ fastcall void smp_apic_timer_interrupt(s * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(regs); + /* + * If the task is currently running in user mode, don't + * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not + * configured, this should be optimized out. + */ + if (user_mode(regs)) + touch_softlockup_watchdog(); + + evt->event_handler(regs); irq_exit(); } @@ -1240,6 +1301,8 @@ static void up_apic_timer_interrupt_call { int cpu = smp_processor_id(); + trace_special(regs->eip, 1, 0); + /* * the NMI deadlock-detector uses this. 
*/ @@ -1323,6 +1386,7 @@ fastcall void smp_error_interrupt(struct */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux/arch/i386/kernel/apm.c =================================================================== --- linux.orig/arch/i386/kernel/apm.c +++ linux/arch/i386/kernel/apm.c @@ -233,7 +233,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1152,26 +1151,6 @@ out: spin_unlock(&user_list_lock); } -static void set_time(void) -{ - if (got_clock_diff) { /* Must know time zone in order to set clock */ - xtime.tv_sec = get_cmos_time() + clock_cmos_diff; - xtime.tv_nsec = 0; - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1211,19 +1190,6 @@ static int suspend(int vetoable) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1232,13 +1198,7 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - write_seqlock(&xtime_lock); - spin_lock(&i8253_lock); reinit_timer(); - set_time(); - - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); if (err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1267,11 +1227,6 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1365,9 +1320,6 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1383,9 +1335,6 @@ static void check_events(void) break; case APM_UPDATE_TIME: - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: Index: linux/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux/arch/i386/kernel/cpu/mtrr/generic.c @@ -234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they Index: linux/arch/i386/kernel/cpu/mtrr/main.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/main.c +++ linux/arch/i386/kernel/cpu/mtrr/main.c @@ -135,8 +135,6 @@ struct set_mtrr_data { mtrr_type smp_type; }; 
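A note on the DEFINE_RAW_SPINLOCK conversions in this patch (set_atomicity_lock above; i8253_lock, i8259A_lock, ioapic_lock, vector_lock, microcode_update_lock, tlbstate_lock, call_lock and nmi_print_lock below): under PREEMPT_RT a plain spinlock_t becomes a sleeping lock, so a lock that is taken from hard-IRQ context or with interrupts hard-disabled must be declared raw to keep the classic spin-with-interrupts-off semantics. A minimal sketch of the pattern, using a hypothetical lock and shared state that are not part of this patch:

	#include <linux/spinlock.h>

	/* Stays a true spinning, IRQ-disabling lock even on -rt: */
	static DEFINE_RAW_SPINLOCK(hw_index_lock);	/* hypothetical */
	static unsigned int hw_shadow;			/* hypothetical shared state */

	static void hw_write_indexed(unsigned int val)
	{
		unsigned long flags;

		/* On a raw lock this really disables interrupts, even on -rt: */
		spin_lock_irqsave(&hw_index_lock, flags);
		hw_shadow = val;
		spin_unlock_irqrestore(&hw_index_lock, flags);
	}

In this series the raw lock is still taken through the ordinary spin_lock_irqsave()/spin_unlock_irqrestore() API; only the declaration changes, and the lock's type selects the non-preemptible implementation (see the converted i8253_lock below, whose spin_lock_irqsave() callers are left untouched).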
-#ifdef CONFIG_SMP - static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. @@ -166,8 +164,6 @@ static void ipi_handler(void *info) local_irq_restore(flags); } -#endif - /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question Index: linux/arch/i386/kernel/cpu/transmeta.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/transmeta.c +++ linux/arch/i386/kernel/cpu/transmeta.c @@ -9,7 +9,8 @@ static void __init init_transmeta(struct { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0 /* shut up gcc warning */, + cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ Index: linux/arch/i386/kernel/entry.S =================================================================== --- linux.orig/arch/i386/kernel/entry.S +++ linux/arch/i386/kernel/entry.S @@ -248,14 +248,18 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + cli + TRACE_IRQS_OFF call preempt_schedule_irq jmp need_resched #endif @@ -311,6 +315,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -325,6 +334,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -341,6 +355,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) testl $TF_MASK,EFLAGS(%esp) jz no_singlestep @@ -430,19 +449,20 @@ ldt_ss: ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + TRACE_IRQS_OFF + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and Index: linux/arch/i386/kernel/head.S =================================================================== --- linux.orig/arch/i386/kernel/head.S +++ linux/arch/i386/kernel/head.S @@ -397,6 +397,7 @@ ignore_int: call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx Index: linux/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux.orig/arch/i386/kernel/i386_ksyms.c +++ linux/arch/i386/kernel/i386_ksyms.c @@ -2,10 +2,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -20,7 +22,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux/arch/i386/kernel/i8253.c =================================================================== --- linux.orig/arch/i386/kernel/i8253.c +++ linux/arch/i386/kernel/i8253.c @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include +#include #include #include #include @@ -16,22 +16,66 @@ #include "io_ports.h" -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +static void init_pit_timer(int mode, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + case CLOCK_EVT_SHUTDOWN: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long delta, struct clock_event *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); +} + /* * Since the PIT overflows every tick, its not very useful * to just read by itself. 
So use jiffies to emulate a free @@ -46,7 +90,7 @@ static cycle_t pit_read(void) static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if Index: linux/arch/i386/kernel/i8259.c =================================================================== --- linux.orig/arch/i386/kernel/i8259.c +++ linux/arch/i386/kernel/i8259.c @@ -34,39 +34,21 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ +DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This contains the irq mask for both 8259A irq controllers, */ @@ -131,7 +113,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); Index: linux/arch/i386/kernel/io_apic.c =================================================================== --- linux.orig/arch/i386/kernel/io_apic.c +++ linux/arch/i386/kernel/io_apic.c #include #include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include @@ -49,8 +51,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -85,14 +87,6 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them.
We are super @@ -136,6 +130,105 @@ static void __init replace_pin_at_irq(un } } +//#define IOAPIC_CACHE + +#ifdef IOAPIC_CACHE +# define MAX_IOAPIC_CACHE 512 + +/* + * Cache register values: + */ +static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE] + ____cacheline_aligned_in_smp; +#endif + +inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + *IO_APIC_BASE(apic) = reg; + return *(IO_APIC_BASE(apic)+4); +} + +unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg) +{ + unsigned int val = __raw_io_apic_read(apic, reg); + +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + return val; +} + +unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + return __raw_io_apic_read(apic, reg); + } + if (io_apic_cache[apic][reg] && !sis_apic_bug) + return io_apic_cache[apic][reg]; +#endif + return raw_io_apic_read(apic, reg); +} + +void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + if (unlikely(reg >= MAX_IOAPIC_CACHE)) { + static int once = 1; + + if (once) { + once = 0; + printk("WARNING: ioapic register cache overflow: %d.\n", + reg); + dump_stack(); + } + } else + io_apic_cache[apic][reg] = val; +#endif + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +} + +/* + * Some systems need a POST flush or else level-triggered interrupts + * generate lots of spurious interrupts due to the POST-ed write not + * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC. + */ +#ifdef CONFIG_SMP +# define IOAPIC_POSTFLUSH +#endif + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val) +{ +#ifdef IOAPIC_CACHE + io_apic_cache[apic][reg] = val; +#endif + if (unlikely(sis_apic_bug)) + *IO_APIC_BASE(apic) = reg; + *(IO_APIC_BASE(apic)+4) = val; +#ifndef IOAPIC_POSTFLUSH + if (unlikely(sis_apic_bug)) +#endif + /* + * Force POST flush by reading: + */ + val = *(IO_APIC_BASE(apic)+4); +} + static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_pin_list *entry = irq_2_pin + irq; @@ -167,18 +260,6 @@ static void __unmask_IO_APIC_irq (unsign __modify_IO_APIC_irq(irq, 0, 0x00010000); } -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq (unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); -} - static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -258,7 +339,7 @@ static void set_ioapic_affinity_irq(unsi break; entry = irq_2_pin + entry->next; } - set_irq_info(irq, cpumask); + set_native_irq_info(irq, cpumask); spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1159,46 +1240,45 @@ static inline int IO_APIC_irq_trigger(in /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs.
*/ u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - unsigned long flags; int vector; - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); + if (IO_APIC_VECTOR(irq) > 0) return IO_APIC_VECTOR(irq); - } -next: + current_vector += 8; if (current_vector == SYSCALL_VECTOR) - goto next; + current_vector += 8; if (current_vector >= FIRST_SYSTEM_VECTOR) { offset++; - if (!(offset%8)) { - spin_unlock_irqrestore(&vector_lock, flags); + if (!(offset % 8)) return -ENOSPC; - } current_vector = FIRST_DEVICE_VECTOR + offset; } vector = current_vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; + IO_APIC_VECTOR(irq) = vector; + return vector; +} + +static int assign_irq_vector(int irq) +{ + unsigned long flags; + int vector; + + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); return vector; } - -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -1206,16 +1286,17 @@ static struct hw_interrupt_type ioapic_e static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + trigger == IOAPIC_LEVEL) { +#ifdef CONFIG_PREEMPT_HARDIRQS + set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq); +#else + set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq); +#endif + } else { + set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq); + } + set_intr_gate(vector, interrupt[irq]); } static void __init setup_IO_APIC_irqs(void) @@ -1326,7 +1407,8 @@ static void __init setup_ExtINT_IRQ0_pin * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... 
*/ - irq_desc[0].chip = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_chip; + set_irq_handler(0, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1445,8 +1527,8 @@ void __init print_IO_APIC(void) struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1467,17 +1549,12 @@ void __init print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1492,7 +1569,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1893,7 +1970,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1913,6 +1990,8 @@ static int __init timer_irq_works(void) */ /* + * Startup quirk: + * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -1920,8 +1999,10 @@ static int __init timer_irq_works(void) * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... + * + * (We do this for level-triggered IRQs too - it cannot hurt.) */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1938,47 +2019,18 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_irq(unsigned int irq) { - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ + move_native_irq(irq); + ack_APIC_irq(); } -static void end_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_quirk_irq(unsigned int irq) { unsigned long v; int i; - move_irq(irq); + move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2007,111 +2059,34 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif -#endif - -static int ioapic_retrigger(unsigned int irq) +static int ioapic_retrigger_irq(unsigned int irq) { send_IPI_self(IO_APIC_VECTOR(irq)); return 1; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. 
- */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_quirk_irq, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, -#endif - .retrigger = ioapic_retrigger, -}; static inline void init_IO_APIC_traps(void) { @@ -2130,11 +2105,6 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -2145,20 +2115,21 @@ static inline void init_IO_APIC_traps(vo make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; +/* + * The local APIC irq-chip implementation: + */ - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +static void ack_apic(unsigned int irq) +{ + ack_APIC_irq(); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq (unsigned int irq) { unsigned long v; @@ -2166,21 +2137,19 @@ static void disable_lapic_irq (unsigned apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void unmask_lapic_irq (unsigned int irq) { - ack_APIC_irq(); -} + unsigned long v; -static void end_lapic_irq (unsigned int i) { /* nothing */ } + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC-edge", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .eoi = ack_apic, }; static void setup_nmi (void) @@ -2361,7 +2330,7 @@ static inline void check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].chip = &lapic_irq_type; + set_irq_chip_and_handler(0, &lapic_chip, handle_fasteoi_irq); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2543,6 +2512,117 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +#ifdef CONFIG_PCI_MSI +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + /* Allocate an unused irq */ 
+ int irq, new, vector; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + set_intr_gate(vector, interrupt[irq]); + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} +#endif /* CONFIG_PCI_MSI */ + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + /* For now, this code always uses physical delivery + * mode. + */ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq); + if (vector >= 0) { + dest = cpu_mask_to_apicid(TARGET_CPUS); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +static void msi_msg_teardown(unsigned int irq) +{ + return; +} + +static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq); + if (vector > 0) { + dest = cpu_mask_to_apicid(mask); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(dest); + } +} + +struct msi_ops arch_msi_ops = { + .needs_64bit_address = 0, + .setup = msi_msg_setup, + .teardown = msi_msg_teardown, + .target = msi_msg_set_affinity, +}; + +#endif /* CONFIG_PCI_MSI */ + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -2697,7 +2777,7 @@ int io_apic_set_pci_routing (int ioapic, spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; Index: linux/arch/i386/kernel/irq.c =================================================================== --- linux.orig/arch/i386/kernel/irq.c +++ linux/arch/i386/kernel/irq.c @@ -51,10 +51,11 @@ static union irq_ctx *softirq_ctx[NR_CPU * SMP cross-CPU interrupts have their own specific * handlers).
*/ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bit used in ret_from_ code */ int irq = ~regs->orig_eax; + struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; u32 *isp; @@ -67,6 +68,11 @@ fastcall unsigned int do_IRQ(struct pt_r } irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -75,12 +81,25 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } } #endif +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif #ifdef CONFIG_4KSTACKS @@ -94,7 +113,7 @@ fastcall unsigned int do_IRQ(struct pt_r * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, arg3, ebx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -110,16 +129,17 @@ fastcall unsigned int do_IRQ(struct pt_r (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call __do_IRQ \n" + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) - : "0" (irq), "1" (regs), "2" (isp) - : "memory", "cc", "ecx" + : "=a" (arg1), "=d" (arg2), "=c" (arg3), "=b" (ebx) + : "0" (irq), "1" (desc), "2" (regs), "3" (isp), + "D" (desc->handle_irq) + : "memory", "cc" ); } else #endif - __do_IRQ(irq, regs); + desc->handle_irq(irq, desc, regs); irq_exit(); @@ -242,8 +262,10 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -253,7 +275,22 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %-14s", irq_desc[i].chip->name); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); + + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) Index: linux/arch/i386/kernel/kprobes.c =================================================================== --- linux.orig/arch/i386/kernel/kprobes.c +++ linux/arch/i386/kernel/kprobes.c @@ -338,7 +338,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -347,7 +347,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -566,7 +566,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -600,7 +600,7 @@ static int __kprobes kprobe_fault_handle restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -734,7 +734,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux/arch/i386/kernel/microcode.c =================================================================== --- linux.orig/arch/i386/kernel/microcode.c +++ linux/arch/i386/kernel/microcode.c @@ -115,7 +115,7 @@ module_param(verbose, int, 0644); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); Index: linux/arch/i386/kernel/mpparse.c =================================================================== --- linux.orig/arch/i386/kernel/mpparse.c +++ linux/arch/i386/kernel/mpparse.c @@ -228,12 +228,17 @@ static void __init MP_bus_info (struct m mpc_oem_bus_info(m, str, translation_table[mpc_record]); + /* + * mpc_busid is char: + */ +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. 
supported is %d\n", m->mpc_busid, str, MAX_MP_BUSSES - 1); return; } +#endif if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) { mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; Index: linux/arch/i386/kernel/nmi.c =================================================================== --- linux.orig/arch/i386/kernel/nmi.c +++ linux/arch/i386/kernel/nmi.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -30,7 +31,7 @@ unsigned int nmi_watchdog = NMI_NONE; extern int unknown_nmi_panic; -static unsigned int nmi_hz = HZ; +static unsigned int nmi_hz = 1000; static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */ static unsigned int nmi_p4_cccr_val; extern void show_registers(struct pt_regs *regs); @@ -99,7 +100,6 @@ int nmi_active; #define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL #define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK -#ifdef CONFIG_SMP /* The performance counters used by NMI_LOCAL_APIC don't trigger when * the CPU is idle. To make sure the NMI watchdog really ticks on all * CPUs during the test make them busy. @@ -107,7 +107,12 @@ int nmi_active; static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; + /* + * avoid a warning, on PREEMPT_RT this won't run in hardirq context: + */ +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -117,7 +122,6 @@ static __init void nmi_cpu_busy(void *da while (*endflag == 0) barrier(); } -#endif static int __init check_nmi_watchdog(void) { @@ -140,7 +144,7 @@ static int __init check_nmi_watchdog(voi for_each_possible_cpu(cpu) prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count; local_irq_enable(); - mdelay((10*1000)/nmi_hz); // wait 10 ticks + mdelay((100*1000)/nmi_hz); // wait 100 ticks for_each_possible_cpu(cpu) { #ifdef CONFIG_SMP @@ -167,7 +171,7 @@ static int __init check_nmi_watchdog(voi /* now that we know it works we can reduce NMI frequency to something more reasonable; makes a difference in some configs */ if (nmi_watchdog == NMI_LOCAL_APIC) - nmi_hz = 1; + nmi_hz = 10000; kfree(prev_nmi_count); return 0; @@ -579,9 +583,34 @@ EXPORT_SYMBOL(touch_nmi_watchdog); extern void die_nmi(struct pt_regs *, const char *msg); -void nmi_watchdog_tick (struct pt_regs * regs) +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) { + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + printk("nmi_show_all_regs(): start on CPU#%d.\n", + raw_smp_processor_id()); + dump_stack(); + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + +void notrace nmi_watchdog_tick (struct pt_regs * regs) +{ /* * Since current_thread_info()-> is always on the stack, and we * always switch the stack NMI-atomically, it's safe to use @@ -590,7 +619,16 @@ void nmi_watchdog_tick (struct pt_regs * unsigned int sum; int cpu = smp_processor_id(); - sum = per_cpu(irq_stat, cpu).apic_timer_irqs; + sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0); + + profile_tick(CPU_PROFILING, regs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + printk("NMI show regs on CPU#%d:\n", cpu); +
show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* @@ -598,11 +636,26 @@ void nmi_watchdog_tick (struct pt_regs * * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", + cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; Index: linux/arch/i386/kernel/process.c =================================================================== --- linux.orig/arch/i386/kernel/process.c +++ linux/arch/i386/kernel/process.c @@ -103,16 +103,20 @@ void default_idle(void) if (!hlt_counter && boot_cpu_data.hlt_works_ok) { current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -125,16 +129,18 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
*/ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -177,7 +183,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -195,9 +203,11 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +250,16 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -363,15 +376,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux/arch/i386/kernel/semaphore.c =================================================================== --- linux.orig/arch/i386/kernel/semaphore.c +++ linux/arch/i386/kernel/semaphore.c @@ -12,6 +12,7 @@ * * rw semaphores implemented November 1999 by Benjamin LaHaise */ +#include #include /* @@ -27,15 +28,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -48,15 +49,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -69,15 +70,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" 
-"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -90,45 +91,13 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: linux/arch/i386/kernel/setup.c =================================================================== --- linux.orig/arch/i386/kernel/setup.c +++ linux/arch/i386/kernel/setup.c @@ -61,7 +61,7 @@ #include #include #include - +#include /* Forward Declaration. */ void __init find_max_pfn(void); @@ -1580,6 +1580,7 @@ void __init setup_arch(char **cmdline_p) #endif #endif tsc_init(); + vsyscall_init(); } static __init int add_pcspkr(void) Index: linux/arch/i386/kernel/signal.c =================================================================== --- linux.orig/arch/i386/kernel/signal.c +++ linux/arch/i386/kernel/signal.c @@ -532,6 +532,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -572,6 +579,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/i386/kernel/smp.c =================================================================== --- linux.orig/arch/i386/kernel/smp.c +++ linux/arch/i386/kernel/smp.c @@ -255,7 +255,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -491,10 +491,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -599,13 +609,14 @@ void smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux/arch/i386/kernel/time.c =================================================================== --- linux.orig/arch/i386/kernel/time.c +++ linux/arch/i386/kernel/time.c @@ -131,7 +131,7 @@ static int set_rtc_mmss(unsigned long no int timer_ack; #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -150,15 +150,6 @@ EXPORT_SYMBOL(profile_pc); */ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -177,7 +168,6 @@ irqreturn_t timer_interrupt(int irq, voi do_timer_interrupt_hook(regs); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. 
You can't turn them off, nor would you want to (any attempt to @@ -192,8 +182,6 @@ irqreturn_t timer_interrupt(int irq, voi outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(regs); @@ -203,7 +191,7 @@ irqreturn_t timer_interrupt(int irq, voi } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -219,7 +207,7 @@ unsigned long get_cmos_time(void) return retval; } -EXPORT_SYMBOL(get_cmos_time); +EXPORT_SYMBOL(read_persistent_clock); static void sync_cmos_clock(unsigned long dummy); @@ -270,75 +258,11 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies_64 += sleep_length; - wall_jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } @@ -359,10 +283,6 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); time_init_hook(); } Index: linux/arch/i386/kernel/traps.c =================================================================== --- linux.orig/arch/i386/kernel/traps.c +++ linux/arch/i386/kernel/traps.c @@ -226,6 +226,7 @@ static void show_trace_log_lvl(struct ta break; printk("%s =======================\n", log_lvl); } + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) @@ -276,6 +277,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void 
show_registers(struct pt_regs *regs) { int i; @@ -302,8 +309,8 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -375,11 +382,11 @@ static void handle_BUG(struct pt_regs *r void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -486,6 +493,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -505,6 +517,7 @@ static void __kprobes do_trap(int trapnr if (ret) goto trap_signal; return; } + print_traces(tsk); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -703,10 +716,11 @@ void die_nmi (struct pt_regs *regs, cons crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -715,9 +729,6 @@ static void default_do_nmi(struct pt_reg reason = get_nmi_reason(); if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; #ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, @@ -725,9 +736,13 @@ static void default_do_nmi(struct pt_reg */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; unknown_nmi_error(reason, regs); return; } @@ -744,18 +759,19 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); Index: linux/arch/i386/kernel/tsc.c =================================================================== --- linux.orig/arch/i386/kernel/tsc.c +++ linux/arch/i386/kernel/tsc.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -333,6 +334,16 @@ static cycle_t read_tsc(void) return ret; } + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, @@ -342,6 +353,7 @@ static struct clocksource clocksource_ts .shift = 22, .update_callback = tsc_update_callback, .is_continuous = 1, + .vread = vread_tsc, }; static int tsc_update_callback(void) Index: linux/arch/i386/kernel/vm86.c 
=================================================================== --- linux.orig/arch/i386/kernel/vm86.c +++ linux/arch/i386/kernel/vm86.c @@ -109,6 +109,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux/arch/i386/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/i386/kernel/vmlinux.lds.S +++ linux/arch/i386/kernel/vmlinux.lds.S @@ -8,6 +8,8 @@ #include #include #include +#include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -71,6 +73,51 @@ SECTIONS .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ +/* VSYSCALL_GTOD data */ +#ifdef CONFIG_GENERIC_TIME_VSYSCALL +#undef VSYSCALL_ADDR +#define VSYSCALL_ADDR VSYSCALL_GTOD_START +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + + . = ALIGN(32); + .vsyscall_gtod_data : AT (VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + . = ALIGN(32); + .vsyscall_gtod_lock : AT (VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif +/* END of VSYSCALL_GTOD data*/ + #ifdef CONFIG_STACK_UNWIND . 
= ALIGN(4); .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { Index: linux/arch/i386/kernel/vsyscall-gtod.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/vsyscall-gtod.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + struct clocksource clock; +}; + +struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; + +seqlock_t vsyscall_gtod_lock = SEQLOCK_UNLOCKED; +seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED; + +int errno; +static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz); + +static int vsyscall_mapped = 0; /* flag variable for remap_vsyscall() */ +extern struct timezone sys_tz; + +static inline void do_vgettimeofday(struct timeval* tv) +{ + cycle_t now, cycle_delta; + s64 nsec_delta; + + if (!__vsyscall_gtod_data.clock.vread) { + gettimeofday(tv, NULL); + return; + } + + /* read the clocksource and calc cycle_delta */ + now = __vsyscall_gtod_data.clock.vread(); + cycle_delta = (now - __vsyscall_gtod_data.clock.cycle_last) + & __vsyscall_gtod_data.clock.mask; + + /* convert cycles to nsecs */ + nsec_delta = cycle_delta * __vsyscall_gtod_data.clock.mult; + nsec_delta = nsec_delta >> __vsyscall_gtod_data.clock.shift; + + /* convert to usecs, add to wall_time_tv and normalize */ + *tv = __vsyscall_gtod_data.wall_time_tv; + do_div(nsec_delta, NSEC_PER_USEC); + tv->tv_usec += (unsigned long)nsec_delta; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + +static inline void do_get_tz(struct timezone *tz) +{ + *tz = __vsyscall_gtod_data.sys_tz; +} + +static int __vsyscall(0) asmlinkage vgettimeofday(struct timeval *tv, struct timezone *tz) +{ + unsigned long seq; + do { + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); + + return 0; +} + +static time_t __vsyscall(1) asmlinkage vtime(time_t * t) +{ + struct timeval tv; + vgettimeofday(&tv,NULL); + if (t) + *t = tv.tv_sec; + return tv.tv_sec; +} + +struct clocksource* curr_clock; + +void update_vsyscall(struct timespec *wall_time, + struct clocksource* clock) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible!
*/ + if (curr_clock != clock) { + curr_clock = clock; + } + + /* save off wall time as timeval */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + + /* copy current clocksource */ + vsyscall_gtod_data.clock = *clock; + + /* save off current timezone */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); + +} +extern char __vsyscall_0; + +static void __init map_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + /* Initially we map the VSYSCALL page w/ PAGE_KERNEL permissions to + * keep the alternate_instruction code from bombing out when it + * changes the seq_lock memory barriers in vgettimeofday() + */ + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL); +} + +static int __init remap_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + if (!vsyscall_mapped) + return 0; + + /* Remap the VSYSCALL page w/ PAGE_KERNEL_VSYSCALL permissions + * after the alternate_instruction code has run + */ + clear_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE); + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + + return 0; +} + +int __init vsyscall_init(void) +{ + printk("VSYSCALL: consistency checks..."); + if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday)) { + printk("vgettimeofday link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)) { + printk("vtime link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if (VSYSCALL_ADDR(0) != __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)) { + printk("fixmap first vsyscall 0x%lx should be 0x%x\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE), + VSYSCALL_ADDR(0)); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + + + printk("passed...mapping..."); + map_vsyscall(); + printk("done.\n"); + vsyscall_mapped = 1; + printk("VSYSCALL: fixmap virt addr: 0x%lx\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)); + + return 0; +} +__initcall(remap_vsyscall); Index: linux/arch/i386/lib/bitops.c =================================================================== --- linux.orig/arch/i386/lib/bitops.c +++ linux/arch/i386/lib/bitops.c @@ -68,3 +68,35 @@ int find_next_zero_bit(const unsigned lo return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +asm( +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret" +); + +asm( +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + LOCK_PREFIX "incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + LOCK_PREFIX "decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret" +); +#endif Index: linux/arch/i386/mach-default/setup.c =================================================================== --- linux.orig/arch/i386/mach-default/setup.c +++ linux/arch/i386/mach-default/setup.c @@ -35,7 +35,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", 
NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -79,7 +79,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; /** * time_init_hook - do any specific initialisations for the system timer. Index: linux/arch/i386/mach-visws/setup.c =================================================================== --- linux.orig/arch/i386/mach-visws/setup.c +++ linux/arch/i386/mach-visws/setup.c @@ -115,7 +115,7 @@ void __init pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer", }; Index: linux/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux.orig/arch/i386/mach-visws/visws_apic.c +++ linux/arch/i386/mach-visws/visws_apic.c @@ -259,11 +259,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: linux/arch/i386/mach-voyager/setup.c =================================================================== --- linux.orig/arch/i386/mach-voyager/setup.c +++ linux/arch/i386/mach-voyager/setup.c @@ -18,7 +18,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: linux/arch/i386/mm/fault.c =================================================================== --- linux.orig/arch/i386/mm/fault.c +++ linux/arch/i386/mm/fault.c @@ -73,6 +73,9 @@ void bust_spinlocks(int yes) int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + user_trace_stop(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -325,8 +328,8 @@ static inline int vmalloc_fault(unsigned * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -337,6 +340,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; Index: linux/arch/i386/mm/highmem.c =================================================================== --- linux.orig/arch/i386/mm/highmem.c +++ linux/arch/i386/mm/highmem.c @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = 
pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,7 +68,7 @@ void *kmap_atomic(struct page *page, enu return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -78,7 +98,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -93,7 +113,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +128,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux/arch/i386/mm/init.c =================================================================== --- linux.orig/arch/i386/mm/init.c +++ linux/arch/i386/mm/init.c @@ -45,7 +45,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -194,7 +194,7 @@ static inline int page_kills_ppro(unsign extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux/arch/i386/mm/pgtable.c =================================================================== --- linux.orig/arch/i386/mm/pgtable.c +++ linux/arch/i386/mm/pgtable.c @@ -182,7 +182,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * recommendations and having no core impact whatsoever. * -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux/arch/i386/oprofile/Kconfig =================================================================== --- linux.orig/arch/i386/oprofile/Kconfig +++ linux/arch/i386/oprofile/Kconfig @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. 
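For reference, the highmem.c hunk above pairs the sleeping kmap() with a new kunmap_virt() that takes the mapped virtual address rather than the struct page. A minimal usage sketch (illustrative only; copy_from_highpage() is a hypothetical helper, not part of the patch):

	#include <linux/highmem.h>
	#include <linux/string.h>

	/* Hypothetical helper: copy out of a (possibly highmem) page without
	 * carrying the struct page across the unmap.  May sleep. */
	static void copy_from_highpage(struct page *page, void *dst, size_t len)
	{
		char *src = kmap(page);

		memcpy(dst, src, len);
		kunmap_virt(src);	/* resolves the page via pkmap_page_table */
	}

For a lowmem page kmap() returns the linear address, which lies below PKMAP_ADDR(0), so kunmap_virt() correctly degenerates to a no-op.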
+config PROFILE_NMI + bool + default y Index: linux/arch/i386/pci/Makefile =================================================================== --- linux.orig/arch/i386/pci/Makefile +++ linux/arch/i386/pci/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux/arch/i386/pci/direct.c =================================================================== --- linux.orig/arch/i386/pci/direct.c +++ linux/arch/i386/pci/direct.c @@ -220,16 +220,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux/arch/i386/pci/irq.c =================================================================== --- linux.orig/arch/i386/pci/irq.c +++ linux/arch/i386/pci/irq.c @@ -981,10 +981,6 @@ static void __init pcibios_fixup_irqs(vo pci_name(bridge), 'A' + pin, irq); } if (irq >= 0) { - if (use_pci_vector() && - !platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; @@ -1169,33 +1165,3 @@ static int pirq_enable_irq(struct pci_de } return 0; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - int next = last; - int offset = (last % 8); - - while (next < FIRST_SYSTEM_VECTOR) { - next += 8; -#ifdef CONFIG_X86_64 - if (next == IA32_SYSCALL_VECTOR) - continue; -#else - if (next == SYSCALL_VECTOR) - continue; -#endif - count++; - if (next >= FIRST_SYSTEM_VECTOR) { - if (offset%8) { - next = FIRST_DEVICE_VECTOR + offset; - offset++; - continue; - } - count--; - } - } - - return count; -} Index: linux/arch/ia64/Kconfig =================================================================== --- linux.orig/arch/ia64/Kconfig +++ linux/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends on !PREEMPT_RT default y config GENERIC_FIND_NEXT_BIT @@ -42,7 +43,11 @@ config GENERIC_CALIBRATE_DELAY bool default y -config TIME_INTERPOLATION +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL bool default y @@ -258,6 +263,69 @@ config SMP If you don't know what to do here, say N. 
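A side note on the arch/i386/pci/direct.c hunks above: pci_sanity_check() goes through the config-space accessors, which themselves take pci_config_lock, so the probe must drop the lock around that call; once spinlocks can sleep under -rt this also keeps the probe preemptible. A condensed sketch of the pattern (example_cfg_lock and the recheck callback are made-up names):

	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(example_cfg_lock);	/* stand-in for pci_config_lock */

	static int probe_with_reentrant_check(int (*recheck)(void))
	{
		unsigned long flags;
		int works = 0;

		spin_lock_irqsave(&example_cfg_lock, flags);
		/* ... poke the probe registers under the lock ... */
		spin_unlock_irqrestore(&example_cfg_lock, flags);

		if (recheck())		/* may take example_cfg_lock itself */
			works = 1;

		return works;
	}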
+ +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled, this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX clocks, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (too small a number) will usually not + be observable due to normal system latencies. For an + 800 MHz processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high-precision interrupts on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all ia64 computers because + it is integrated directly into the processor. However, it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -310,17 +378,15 @@ config FORCE_CPEI_RETARGET This option it useful to enable this feature on older BIOS's as well. You can also enable this by using boot command line option force_cpei=1. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure.
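The CLOCK_REALTIME_HR/CLOCK_MONOTONIC_HR clocks described in the help text above can be probed from user space once the clockids are exported. A hedged sketch (the value 4 is only a placeholder assumption; use whatever the tree's headers define, and link with -lrt on older glibc):

	#include <stdio.h>
	#include <time.h>

	#ifndef CLOCK_REALTIME_HR
	#define CLOCK_REALTIME_HR 4	/* assumed value, not a standard clockid */
	#endif

	int main(void)
	{
		struct timespec res;

		if (clock_getres(CLOCK_REALTIME_HR, &res) == 0)
			printf("CLOCK_REALTIME_HR resolution: %ld ns\n", res.tv_nsec);
		else
			perror("clock_getres(CLOCK_REALTIME_HR)");
		return 0;
	}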
+config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" Index: linux/arch/ia64/configs/bigsur_defconfig =================================================================== --- linux.orig/arch/ia64/configs/bigsur_defconfig +++ linux/arch/ia64/configs/bigsur_defconfig @@ -85,7 +85,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/gensparse_defconfig =================================================================== --- linux.orig/arch/ia64/configs/gensparse_defconfig +++ linux/arch/ia64/configs/gensparse_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/sim_defconfig =================================================================== --- linux.orig/arch/ia64/configs/sim_defconfig +++ linux/arch/ia64/configs/sim_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/sn2_defconfig =================================================================== --- linux.orig/arch/ia64/configs/sn2_defconfig +++ linux/arch/ia64/configs/sn2_defconfig @@ -83,7 +83,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_DMI=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y Index: linux/arch/ia64/configs/tiger_defconfig =================================================================== --- linux.orig/arch/ia64/configs/tiger_defconfig +++ linux/arch/ia64/configs/tiger_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/zx1_defconfig =================================================================== --- linux.orig/arch/ia64/configs/zx1_defconfig +++ linux/arch/ia64/configs/zx1_defconfig @@ -84,7 +84,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/defconfig =================================================================== --- linux.orig/arch/ia64/defconfig +++ linux/arch/ia64/defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/ia64/kernel/asm-offsets.c +++ linux/arch/ia64/kernel/asm-offsets.c @@ -7,6 +7,7 @@ #define ASM_OFFSETS_C 
1 #include +#include #include #include @@ -254,18 +255,13 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ - DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); - DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); - DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift)); - DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc)); - DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter)); - DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter)); - DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask)); - DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); - DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); - DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_CLOCKSOURCE_MASK_OFFSET, offsetof (struct clocksource, mask)); + DEFINE(IA64_CLOCKSOURCE_MULT_OFFSET, offsetof (struct clocksource, mult)); + DEFINE(IA64_CLOCKSOURCE_SHIFT_OFFSET, offsetof (struct clocksource, shift)); + DEFINE(IA64_CLOCKSOURCE_MMIO_PTR_OFFSET, offsetof (struct clocksource, fsys_mmio_ptr)); + DEFINE(IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET, offsetof (struct clocksource, cycle_last)); +#endif } Index: linux/arch/ia64/kernel/cyclone.c =================================================================== --- linux.orig/arch/ia64/kernel/cyclone.c +++ linux/arch/ia64/kernel/cyclone.c @@ -3,6 +3,7 @@ #include #include #include +#include #include /* IBM Summit (EXA) Cyclone counter code*/ @@ -18,13 +19,21 @@ void __init cyclone_setup(void) use_cyclone = 1; } +static void __iomem *cyclone_mc_ptr; -struct time_interpolator cyclone_interpolator = { - .source = TIME_SOURCE_MMIO64, - .shift = 16, - .frequency = CYCLONE_TIMER_FREQ, - .drift = -100, - .mask = (1LL << 40) - 1 +static cycle_t read_cyclone(void) +{ + return (cycle_t)readq((void __iomem *)cyclone_mc_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 300, + .read = read_cyclone, + .mask = (1LL << 40) - 1, + .mult = 0, /*to be calculated*/ + .shift = 16, + .is_continuous = 1, }; int __init init_cyclone_clock(void) @@ -101,8 +110,10 @@ int __init init_cyclone_clock(void) } } /* initialize last tick */ - cyclone_interpolator.addr = cyclone_timer; - register_time_interpolator(&cyclone_interpolator); + clocksource_cyclone.fsys_mmio_ptr = cyclone_mc_ptr = cyclone_timer; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + clocksource_register(&clocksource_cyclone); return 0; } Index: linux/arch/ia64/kernel/entry.S =================================================================== --- linux.orig/arch/ia64/kernel/entry.S +++ linux/arch/ia64/kernel/entry.S @@ -1101,23 +1101,24 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched != 0?
+(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 - ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif +.ret9a: rsm psr.i // disable interrupts + ;; + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux/arch/ia64/kernel/fsys.S =================================================================== --- linux.orig/arch/ia64/kernel/fsys.S +++ linux/arch/ia64/kernel/fsys.S @@ -24,6 +24,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls. * @@ -145,13 +146,6 @@ ENTRY(fsys_set_tid_address) FSYS_RETURN END(fsys_set_tid_address) -/* - * Ensure that the time interpolator structure is compatible with the asm code - */ -#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ - || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 -#error fsys_gettimeofday incompatible with changes to struct time_interpolator -#endif #define CLOCK_REALTIME 0 #define CLOCK_MONOTONIC 1 #define CLOCK_DIVIDE_BY_1000 0x4000 @@ -177,19 +171,18 @@ ENTRY(fsys_gettimeofday) // r11 = preserved: saved ar.pfs // r12 = preserved: memory stack // r13 = preserved: thread pointer - // r14 = address of mask / mask + // r14 = address of mask / mask value // r15 = preserved: system call number // r16 = preserved: current task pointer // r17 = wall to monotonic use - // r18 = time_interpolator->offset - // r19 = address of wall_to_monotonic - // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address - // r21 = shift factor - // r22 = address of time interpolator->last_counter - // r23 = address of time_interpolator->last_cycle - // r24 = adress of time_interpolator->offset - // r25 = last_cycle value - // r26 = last_counter value + // r19 = address of itc_lastcycle + // r20 = struct clocksource / address of first element + // r21 = shift value + // r22 = address of itc_jitter/ wall_to_monotonic + // r23 = address of shift + // r24 = address mult factor / cycle_last value + // r25 = itc_lastcycle value + // r26 = address clocksource cycle_last // r27 = pointer to xtime // r28 = sequence number at the beginning of critcal section // r29 = address of seqlock @@ -199,9 +192,9 @@ ENTRY(fsys_gettimeofday) // p6,p7 short term use // p8 = timesource ar.itc // p9 = timesource mmio64 - // p10 = timesource mmio32 + // p10 = timesource mmio32 - not used // p11 = timesource not to be handled by asm code - // p12 = memory time source ( = p9 | p10) + // p12 = memory time source ( = p9 | p10) - not used // p13 = do cmpxchg with time_interpolator_last_cycle // p14 = Divide by 1000 // p15 = Add monotonic @@ -212,61 +205,55 @@ ENTRY(fsys_gettimeofday) tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure mov pr = r30,0xc000 // Set predicates according to function add r2 = 
TI_FLAGS+IA64_TASK_SIZE,r16 - movl r20 = time_interpolator + movl r20 = fsyscall_clock // load fsyscall clocksource address ;; - ld8 r20 = [r20] // get pointer to time_interpolator structure + add r10 = IA64_CLOCKSOURCE_MMIO_PTR_OFFSET,r20 movl r29 = xtime_lock ld4 r2 = [r2] // process work pending flags movl r27 = xtime ;; // only one bundle here - ld8 r21 = [r20] // first quad with control information + add r14 = IA64_CLOCKSOURCE_MASK_OFFSET,r20 + movl r22 = itc_jitter + add r24 = IA64_CLOCKSOURCE_MULT_OFFSET,r20 and r2 = TIF_ALLWORK_MASK,r2 (p6) br.cond.spnt.few .fail_einval // deferred branch ;; - add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 - extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc - extr r8 = r21,0,16 // time_interpolator->source + ld8 r30 = [r10] // clocksource->mmio_ptr + movl r19 = itc_lastcycle + add r23 = IA64_CLOCKSOURCE_SHIFT_OFFSET,r20 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled (p6) br.cond.spnt.many fsys_fallback_syscall ;; - cmp.eq p8,p12 = 0,r8 // Check for cpu timer - cmp.eq p9,p0 = 1,r8 // MMIO64 ? - extr r2 = r21,24,8 // time_interpolator->jitter - cmp.eq p10,p0 = 2,r8 // MMIO32 ? - cmp.ltu p11,p0 = 2,r8 // function or other clock -(p11) br.cond.spnt.many fsys_fallback_syscall - ;; - setf.sig f7 = r3 // Setup for scaling of counter -(p15) movl r19 = wall_to_monotonic -(p12) ld8 r30 = [r10] - cmp.ne p13,p0 = r2,r0 // need jitter compensation? - extr r21 = r21,16,8 // shift factor + ld8 r14 = [r14] // clocksource mask value + ld4 r2 = [r22] // itc_jitter value + add r26 = IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET,r20 // clock fsyscall_cycle_last + ld4 r3 = [r24] // clocksource->mult value + cmp.eq p8,p9 = 0,r30 // Check for cpu timer, no mmio_ptr, set p8, clear p9 + ;; + setf.sig f7 = r3 // Setup for mult scaling of counter +(p15) movl r22 = wall_to_monotonic + ld4 r21 = [r23] // shift value +(p8) cmp.ne p13,p0 = r2,r0 // need jitter compensation, set p13 +(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .time_redo: .pred.rel.mutex p8,p9,p10 ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! - add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 (p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. -(p10) ld4 r2 = [r30] // readw(ti->address) -(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 +(p13) ld8 r25 = [r19] // get itc_lastcycle value ;; // could be removed by moving the last add upward - ld8 r26 = [r22] // time_interpolator->last_counter -(p13) ld8 r25 = [r23] // time interpolator->last_cycle - add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 -(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET - add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 + ld8 r24 = [r26] // get fsyscall_cycle_last value +(p15) ld8 r17 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET ;; - ld8 r18 = [r24] // time_interpolator->offset ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec -(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) +(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) ;; - ld8 r14 = [r14] // time_interpolator->mask -(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared - sub r10 = r2,r26 // current_counter - last_counter +(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. 
p6,p7 cleared + sub r10 = r2,r24 // current_counter - last_counter ;; -(p6) sub r10 = r25,r26 // time we got was less than last_cycle +(p6) sub r10 = r25,r24 // time we got was less than last_cycle (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg ;; and r10 = r10,r14 // Apply mask @@ -274,22 +261,21 @@ ENTRY(fsys_gettimeofday) setf.sig f8 = r10 nop.i 123 ;; -(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv +(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) (p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs ;; -(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET +(p15) ld8 r17 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo // simulate tbit.nz.or p7,p0 = r28,0 and r28 = ~1,r28 // Make sequence even to force retry if odd getf.sig r2 = f8 mf - add r8 = r8,r18 // Add time interpolator offset ;; ld4 r10 = [r29] // xtime_lock.sequence (p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs - shr.u r2 = r2,r21 + shr.u r2 = r2,r21 // shift by factor ;; // overloaded 3 bundles! // End critical section. add r8 = r8,r2 // Add xtime.nsecs @@ -348,6 +334,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ Index: linux/arch/ia64/kernel/iosapic.c =================================================================== --- linux.orig/arch/ia64/kernel/iosapic.c +++ linux/arch/ia64/kernel/iosapic.c @@ -112,7 +112,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -409,6 +409,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -420,10 +448,12 @@ iosapic_end_level_irq (unsigned int irq) iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct hw_interrupt_type irq_type_iosapic_level = { .typename = "IO-SAPIC-level", Index: linux/arch/ia64/kernel/irq_ia64.c =================================================================== --- linux.orig/arch/ia64/kernel/irq_ia64.c +++ linux/arch/ia64/kernel/irq_ia64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,25 @@ reserve_irq_vector (int vector) return test_and_set_bit(pos, ia64_vector_mask); } +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + int vector = assign_irq_vector(AUTO_ASSIGN); + + if (vector >= 0) + dynamic_irq_init(vector); + + return vector; +} + +void destroy_irq(unsigned int irq) +{ + dynamic_irq_cleanup(irq); + free_irq_vector(irq); +} + #ifdef CONFIG_SMP # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) #else Index: linux/arch/ia64/kernel/mca.c =================================================================== --- linux.orig/arch/ia64/kernel/mca.c +++ linux/arch/ia64/kernel/mca.c @@ -152,7 +152,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: linux/arch/ia64/kernel/perfmon.c =================================================================== --- linux.orig/arch/ia64/kernel/perfmon.c +++ linux/arch/ia64/kernel/perfmon.c @@ -277,7 +277,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) 
*/ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -363,7 +363,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -504,7 +504,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux/arch/ia64/kernel/process.c =================================================================== --- linux.orig/arch/ia64/kernel/process.c +++ linux/arch/ia64/kernel/process.c @@ -96,6 +96,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -199,7 +202,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -275,7 +278,7 @@ cpu_idle (void) else current_thread_info()->status |= TS_POLLING; - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -297,10 +300,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux/arch/ia64/kernel/sal.c =================================================================== --- linux.orig/arch/ia64/kernel/sal.c +++ linux/arch/ia64/kernel/sal.c @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux/arch/ia64/kernel/salinfo.c =================================================================== --- linux.orig/arch/ia64/kernel/salinfo.c +++ linux/arch/ia64/kernel/salinfo.c @@ -141,7 +141,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -157,8 +157,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. 
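The raw_spinlock_t conversions above (perfmon, SAL, salinfo, MCA) all follow one mechanical pattern: a lock that can be taken from contexts that must never sleep keeps true spinning semantics by changing only its type, while the lock/unlock call sites stay untouched thanks to the -rt tree's type-switching lock macros. A minimal sketch under that assumption (hw_state_lock is a made-up name):

	#include <linux/spinlock.h>

	static DEFINE_RAW_SPINLOCK(hw_state_lock);	/* keeps spinning even on -rt */

	static void touch_hw_state(void)
	{
		unsigned long flags;

		spin_lock_irqsave(&hw_state_lock, flags);
		/* ... short, non-sleeping critical section ... */
		spin_unlock_irqrestore(&hw_state_lock, flags);
	}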
Index: linux/arch/ia64/kernel/semaphore.c =================================================================== --- linux.orig/arch/ia64/kernel/semaphore.c +++ linux/arch/ia64/kernel/semaphore.c @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux/arch/ia64/kernel/signal.c =================================================================== --- linux.orig/arch/ia64/kernel/signal.c +++ linux/arch/ia64/kernel/signal.c @@ -487,6 +487,14 @@ ia64_do_signal (sigset_t *oldset, struct long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without Index: linux/arch/ia64/kernel/smp.c =================================================================== --- linux.orig/arch/ia64/kernel/smp.c +++ linux/arch/ia64/kernel/smp.c @@ -222,6 +222,22 @@ smp_send_reschedule (int cpu) platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); + } +} + + void smp_flush_tlb_all (void) { Index: linux/arch/ia64/kernel/smpboot.c =================================================================== --- linux.orig/arch/ia64/kernel/smpboot.c +++ linux/arch/ia64/kernel/smpboot.c @@ -371,6 +371,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __devinit smp_callin (void) { @@ -430,6 +432,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. 
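smp_send_reschedule_allbutself() above is the ia64 counterpart of the i386 version earlier in this patch: kick every other CPU through schedule() so one of them can pull starving RT tasks over. A purely hypothetical caller, to show the intended use (rt_overload_notify() and its arguments are invented for illustration):

	#include <linux/smp.h>

	extern void smp_send_reschedule_allbutself(void);

	static void rt_overload_notify(unsigned int nr_runnable_rt,
				       unsigned int nr_cpus_here)
	{
		/* more runnable RT tasks than this CPU can execute? */
		if (nr_runnable_rt > nr_cpus_here)
			smp_send_reschedule_allbutself();
	}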
Index: linux/arch/ia64/kernel/time.c =================================================================== --- linux.orig/arch/ia64/kernel/time.c +++ linux/arch/ia64/kernel/time.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,10 @@ extern unsigned long wall_jiffies; +static cycle_t itc_get_cycles(void); +cycle_t itc_lastcycle __attribute__((aligned(L1_CACHE_BYTES))); +int itc_jitter __attribute__((aligned(L1_CACHE_BYTES))); + volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -40,11 +45,16 @@ EXPORT_SYMBOL(last_cli_ip); #endif -static struct time_interpolator itc_interpolator = { - .shift = 16, - .mask = 0xffffffffffffffffLL, - .source = TIME_SOURCE_CPU +static struct clocksource clocksource_itc = { + .name = "itc", + .rating = 350, + .read = itc_get_cycles, + .mask = 0xffffffffffffffffLL, + .mult = 0, /*to be caluclated*/ + .shift = 16, + .is_continuous = 1, }; +static struct clocksource *clocksource_itc_p; static irqreturn_t timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) @@ -57,38 +67,57 @@ timer_interrupt (int irq, void *dev_id, platform_timer_interrupt(irq, dev_id, regs); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", ia64_get_itc(), new_itm); - profile_tick(CPU_PROFILING, regs); +#endif - while (1) { - update_process_times(user_mode(regs)); + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - new_itm += local_cpu_data->itm_delta; + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(regs); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + profile_tick(CPU_PROFILING, regs); + + while (1) { + update_process_times(user_mode(regs)); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(regs); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(regs); } do { + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * If we're too close to the next clock tick for * comfort, we increase the safety margin by @@ -98,8 +127,8 @@ timer_interrupt (int irq, void *dev_id, * too fast (with the potentially devastating effect * of losing monotony of time). 
*/ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -118,7 +147,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -127,8 +156,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -186,7 +215,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -206,9 +235,8 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT) + itc_freq/2)/itc_freq; if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { - itc_interpolator.frequency = local_cpu_data->itc_freq; - itc_interpolator.drift = itc_drift; #ifdef CONFIG_SMP /* On IA64 in an SMP configuration ITCs are never accurately synchronized. * Jitter compensation requires a cmpxchg which may limit @@ -220,18 +248,57 @@ ia64_init_itm (void) * even going backward) if the ITC offsets between the individual CPUs * are too large. */ - if (!nojitter) itc_interpolator.jitter = 1; + if (!nojitter) itc_jitter = 1; #endif - register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); + + if (!clocksource_itc_p) { + /* Sort out mult/shift values: */ + clocksource_itc.mult = clocksource_hz2mult(local_cpu_data->itc_freq, + clocksource_itc.shift); + clocksource_register(&clocksource_itc); + clocksource_itc_p = &clocksource_itc; + } } + +static cycle_t itc_get_cycles(void) +{ + if (itc_jitter) { + u64 lcycle; + u64 now; + + do { + lcycle = itc_lastcycle; + now = get_cycles(); + if (lcycle && time_after(lcycle, now)) + return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * forced to retry until the write lock is released. + */ + if (spin_is_locked(&xtime_lock.lock)) { + itc_lastcycle = now; + return now; + } + /* Keep track of the last timer value returned. The use of cmpxchg here + * will cause contention in an SMP environment. + */ + } while (unlikely(cmpxchg(&itc_lastcycle, lcycle, now) != lcycle)); + return now; + } else + return get_cycles(); +} + + static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer" }; @@ -252,6 +319,8 @@ time_init (void) * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC).
*/ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* @@ -304,3 +373,10 @@ ia64_setup_printk_clock(void) if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) ia64_printk_clock = ia64_itc_printk_clock; } + +struct clocksource fsyscall_clock __attribute__((aligned(L1_CACHE_BYTES))); + +void update_vsyscall(struct timespec *wall, struct clocksource *c) +{ + fsyscall_clock = *c; +} Index: linux/arch/ia64/kernel/traps.c =================================================================== --- linux.orig/arch/ia64/kernel/traps.c +++ linux/arch/ia64/kernel/traps.c @@ -24,7 +24,7 @@ #include #include -extern spinlock_t timerlist_lock; +extern raw_spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); @@ -85,11 +85,11 @@ void die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -226,7 +226,7 @@ __kprobes ia64_bad_break (unsigned long * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. */ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -245,7 +245,7 @@ disabled_fph_fault (struct pt_regs *regs = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -265,7 +265,7 @@ disabled_fph_fault (struct pt_regs *regs */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux/arch/ia64/kernel/unwind.c =================================================================== --- linux.orig/arch/ia64/kernel/unwind.c +++ linux/arch/ia64/kernel/unwind.c @@ -81,7 +81,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -145,7 +145,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux/arch/ia64/kernel/unwind_i.h =================================================================== --- linux.orig/arch/ia64/kernel/unwind_i.h +++ linux/arch/ia64/kernel/unwind_i.h @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux/arch/ia64/mm/init.c =================================================================== --- linux.orig/arch/ia64/mm/init.c +++ linux/arch/ia64/mm/init.c @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); 
DEFINE_PER_CPU(long, __pgtable_quicklist_size); @@ -92,15 +92,11 @@ check_pgt_cache(void) if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES)) return; - preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { free_page((unsigned long)pgtable_quicklist_alloc()); } - preempt_enable(); - preempt_disable(); } - preempt_enable(); } void Index: linux/arch/ia64/mm/tlb.c =================================================================== --- linux.orig/arch/ia64/mm/tlb.c +++ linux/arch/ia64/mm/tlb.c @@ -32,7 +32,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .next = 1, .max_ctx = ~0U }; Index: linux/arch/ia64/pci/pci.c =================================================================== --- linux.orig/arch/ia64/pci/pci.c +++ linux/arch/ia64/pci/pci.c @@ -809,12 +809,3 @@ pcibios_prep_mwi (struct pci_dev *dev) } return rc; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - count += (IA64_LAST_DEVICE_VECTOR - last); - - return count; -} Index: linux/arch/ia64/sn/kernel/sn2/timer.c =================================================================== --- linux.orig/arch/ia64/sn/kernel/sn2/timer.c +++ linux/arch/ia64/sn/kernel/sn2/timer.c @@ -11,6 +11,7 @@ #include #include #include +#include <linux/clocksource.h> #include #include @@ -22,11 +23,21 @@ extern unsigned long sn_rtc_cycles_per_second; -static struct time_interpolator sn2_interpolator = { - .drift = -1, - .shift = 10, - .mask = (1LL << 55) - 1, - .source = TIME_SOURCE_MMIO64 +static void __iomem *sn2_mc_ptr; + +static cycle_t read_sn2(void) +{ + return (cycle_t)readq(sn2_mc_ptr); +} + +static struct clocksource clocksource_sn2 = { + .name = "sn2_rtc", + .rating = 300, + .read = read_sn2, + .mask = (1LL << 55) - 1, + .mult = 0, + .shift = 10, + .is_continuous = 1, }; /* @@ -47,9 +58,10 @@ ia64_sn_udelay (unsigned long usecs) void __init sn_timer_init(void) { - sn2_interpolator.frequency = sn_rtc_cycles_per_second; - sn2_interpolator.addr = RTC_COUNTER_ADDR; - register_time_interpolator(&sn2_interpolator); + clocksource_sn2.fsys_mmio_ptr = sn2_mc_ptr = RTC_COUNTER_ADDR; + clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_sn2.shift); + clocksource_register(&clocksource_sn2); ia64_udelay = &ia64_sn_udelay; } Index: linux/arch/mips/Kconfig =================================================================== --- linux.orig/arch/mips/Kconfig +++ linux/arch/mips/Kconfig @@ -417,6 +417,7 @@ config MOMENCO_JAGUAR_ATX config MOMENCO_OCELOT bool "Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -837,6 +838,7 @@ source "arch/mips/cobalt/Kconfig" endmenu + config RWSEM_GENERIC_SPINLOCK bool default y @@ -844,6 +846,10 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool +config ASM_SEMAPHORES + bool + default y + config GENERIC_FIND_NEXT_BIT bool default y @@ -889,6 +895,9 @@ config DMA_NEED_PCI_MAP_STATE config OWN_DMA bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool @@ -1843,12 +1852,17 @@ config MIPS_INSANE_LARGE This will result in additional memory usage, so it is not recommended for normal users.
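Both clocksource registrations above leave .mult zero at compile time and fill it in from the measured frequency with clocksource_hz2mult(), which solves ns = (cycles * mult) >> shift for mult. A stand-alone sketch of that conversion; hz2mult() here is an illustrative re-derivation, not the kernel function itself:

	#include <stdio.h>
	#include <stdint.h>

	/* mult = (10^9 << shift) / hz, rounded: nanoseconds per cycle, pre-scaled
	 * by 2^shift so the read path needs only a multiply and a shift. */
	static uint32_t hz2mult(uint32_t hz, uint32_t shift)
	{
		uint64_t tmp = 1000000000ULL << shift;

		tmp += hz / 2;			/* round to nearest */
		return (uint32_t)(tmp / hz);
	}

	int main(void)
	{
		uint32_t mult = hz2mult(50000000, 10);	/* e.g. a 50 MHz RTC, shift 10 */
		uint64_t cycles = 50000000;		/* one second worth of ticks */

		printf("mult=%u, one second reads as %llu ns\n", mult,
		       (unsigned long long)((cycles * mult) >> 10));
		return 0;
	}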
-endmenu - -config RWSEM_GENERIC_SPINLOCK +config GENERIC_TIME bool default y +source "kernel/time/Kconfig" + +config CPU_SPEED + int "CPU speed used for clocksource/clockevent calculations" + default 600 +endmenu + source "init/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" Index: linux/arch/mips/kernel/Makefile =================================================================== --- linux.orig/arch/mips/kernel/Makefile +++ linux/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -15,6 +15,8 @@ obj-$(CONFIG_MODULES) += mips_ksyms.o m obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux/arch/mips/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/mips/kernel/asm-offsets.c +++ linux/arch/mips/kernel/asm-offsets.c @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include Index: linux/arch/mips/kernel/entry.S =================================================================== --- linux.orig/arch/mips/kernel/entry.S +++ linux/arch/mips/kernel/entry.S @@ -25,7 +25,7 @@ .endm #else .macro preempt_stop - local_irq_disable + raw_local_irq_disable .endm #define resume_kernel restore_all #endif @@ -40,7 +40,7 @@ FEXPORT(ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -50,7 +50,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -60,7 +62,9 @@ need_resched: LONG_L t0, PT_STATUS(sp) # Interrupts off? andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -68,7 +72,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -133,19 +137,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing? 
beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -161,7 +167,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux/arch/mips/kernel/i8259.c =================================================================== --- linux.orig/arch/mips/kernel/i8259.c +++ linux/arch/mips/kernel/i8259.c @@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq) * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { Index: linux/arch/mips/kernel/irq.c =================================================================== --- linux.orig/arch/mips/kernel/irq.c +++ linux/arch/mips/kernel/irq.c @@ -137,7 +137,10 @@ void __init init_IRQ(void) irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].chip = &no_irq_chip; - spin_lock_init(&irq_desc[i].lock); + __raw_spin_lock_init(&irq_desc[i].lock); +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif #ifdef CONFIG_MIPS_MT_SMTC irq_hwmask[i] = 0; #endif /* CONFIG_MIPS_MT_SMTC */ Index: linux/arch/mips/kernel/module.c =================================================================== --- linux.orig/arch/mips/kernel/module.c +++ linux/arch/mips/kernel/module.c @@ -39,7 +39,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux/arch/mips/kernel/process.c =================================================================== --- linux.orig/arch/mips/kernel/process.c +++ linux/arch/mips/kernel/process.c @@ -54,16 +54,18 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_MIPS_MT_SMTC smtc_idle_loop_hook(); #endif /* CONFIG_MIPS_MT_SMTC */ if (cpu_wait) (*cpu_wait)(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux/arch/mips/kernel/scall32-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall32-o32.S +++ linux/arch/mips/kernel/scall32-o32.S @@ -84,7 +84,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-64.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-64.S +++ linux/arch/mips/kernel/scall64-64.S @@ -72,7 +72,7 @@ NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-n32.S 
=================================================================== --- linux.orig/arch/mips/kernel/scall64-n32.S +++ linux/arch/mips/kernel/scall64-n32.S @@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-o32.S +++ linux/arch/mips/kernel/scall64-o32.S @@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux/arch/mips/kernel/semaphore.c =================================================================== --- linux.orig/arch/mips/kernel/semaphore.c +++ linux/arch/mips/kernel/semaphore.c @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
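The rename to compat_semaphore above does not change the counting protocol the comment describes: a positive count means the semaphore is free, and a decrement that takes it negative means the caller must sleep. A condensed sketch of that protocol; the sketch_* names are illustrative, and the real fast path is the ll/sc assembly in __sem_update_count():

	struct sketch_sem { atomic_t count; };

	static void sketch_down(struct sketch_sem *sem)
	{
		if (atomic_dec_return(&sem->count) < 0) {
			/* contended: sleep, the job of __compat_down() */
		}
	}

	static void sketch_up(struct sketch_sem *sem)
	{
		if (atomic_inc_return(&sem->count) <= 0) {
			/* old count was negative: wake a sleeper, as __compat_up() does */
		}
	}

	static int sketch_is_locked(struct sketch_sem *sem)
	{
		return atomic_read(&sem->count) < 0;	/* mirrors compat_sem_is_locked() */
	}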
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/mips/kernel/signal.c =================================================================== --- linux.orig/arch/mips/kernel/signal.c +++ linux/arch/mips/kernel/signal.c @@ -416,6 +416,10 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux/arch/mips/kernel/signal32.c =================================================================== --- linux.orig/arch/mips/kernel/signal32.c +++ linux/arch/mips/kernel/signal32.c @@ -807,6 +807,10 @@ void do_signal32(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux/arch/mips/kernel/smp.c =================================================================== --- linux.orig/arch/mips/kernel/smp.c +++ linux/arch/mips/kernel/smp.c @@ -115,7 +115,22 @@ asmlinkage void start_secondary(void) cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. 
+ */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -303,6 +318,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -360,6 +377,7 @@ static inline void smp_on_each_tlb(void void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, (void *)mm); @@ -369,6 +387,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -392,6 +411,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -405,6 +426,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -435,6 +457,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -447,6 +471,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux/arch/mips/kernel/time.c =================================================================== --- linux.orig/arch/mips/kernel/time.c +++ linux/arch/mips/kernel/time.c @@ -10,6 +10,11 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. + * + * This implementation of High Res Timers uses two timers. One is the system + * timer. The second is used for the high res timers. The high res timers + * require the CPU to have count/compare registers. The mips_set_next_event() + * function schedules the next high res timer interrupt. 
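On count/compare hardware, "schedule the next high res timer interrupt" reduces to writing compare = count + delta, which is exactly what mips_set_next_event() below does. One subtlety worth spelling out: if delta is very small, the count register can pass the compare value before the write lands, so a robust implementation re-reads the counter and reports the miss. A hypothetical variant with that check (the version in this patch omits it):

	#include <linux/errno.h>

	static int sketch_set_next_event(unsigned long delta)
	{
		unsigned int cnt = read_c0_count();

		write_c0_compare(cnt + delta);
		/* did the counter already reach or pass the compare value? */
		return ((int)(read_c0_count() - (cnt + delta)) >= 0) ? -ETIME : 0;
	}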
*/ #include #include @@ -23,6 +28,7 @@ #include #include #include +#include #include #include @@ -49,8 +55,28 @@ */ extern volatile unsigned long wall_jiffies; +/* any missed timer interrupts */ +int missed_timer_count; + DEFINE_SPINLOCK(rtc_lock); +#ifdef CONFIG_HIGH_RES_TIMERS +static void mips_set_next_event(unsigned long evt); +static void mips_set_mode(int mode, void *priv); + +static struct clock_event lapic_clockevent = { + .name = "mips clockevent interface", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_next_event = mips_set_next_event, +}; +#endif + /* * By default we provide the null RTC ops */ @@ -68,6 +94,12 @@ unsigned long (*rtc_mips_get_time)(void) int (*rtc_mips_set_time)(unsigned long) = null_rtc_set_time; int (*rtc_mips_set_mmss)(unsigned long); +u64 read_persistent_clock(void) +{ + unsigned long sec; + sec = rtc_mips_get_time(); + return (u64)sec * NSEC_PER_SEC; +} /* usecs per counter cycle, shifted to left by 32 bits */ static unsigned int sll32_usecs_per_cycle; @@ -75,18 +107,30 @@ static unsigned int sll32_usecs_per_cycl /* how many counter cycles in a jiffy */ static unsigned long cycles_per_jiffy __read_mostly; +static unsigned long hrt_cycles_per_jiffy __read_mostly; + + /* Cycle counter value at the previous timer interrupt.. */ static unsigned int timerhi, timerlo; /* expirelo is the count value for next CPU timer interrupt */ static unsigned int expirelo; - /* * Null timer ack for systems not needing one (e.g. i8254). */ static void null_timer_ack(void) { /* nothing */ } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Set the next event + */ +static void mips_set_next_event(unsigned long evt) +{ + write_c0_compare(read_c0_count() + evt); +} +#endif + /* * Null high precision timer functions for systems lacking one. */ @@ -100,7 +144,6 @@ static void null_hpt_init(unsigned int c /* nothing */ } - /* * Timer ack for an R4k-compatible timer of a known frequency. */ @@ -110,14 +153,15 @@ static void c0_timer_ack(void) #ifndef CONFIG_SOC_PNX8550 /* pnx8550 resets to zero */ /* Ack this timer interrupt and set the next one. */ - expirelo += cycles_per_jiffy; + expirelo += hrt_cycles_per_jiffy; #endif write_c0_compare(expirelo); /* Check to see if we have missed any timer interrupts. */ - while (((count = read_c0_count()) - expirelo) < 0x7fffffff) { - /* missed_timer_count++; */ - expirelo = count + cycles_per_jiffy; + count = read_c0_count(); + if ((count - expirelo) < 0x7fffffff) { + /* missed_timer_count++; */ + expirelo = count + hrt_cycles_per_jiffy; write_c0_compare(expirelo); } } @@ -250,11 +294,9 @@ static unsigned long null_gettimeoffset( return 0; } - /* The function pointer to one of the gettimeoffset funcs. */ unsigned long (*do_gettimeoffset)(void) = null_gettimeoffset; - static unsigned long fixed_rate_gettimeoffset(void) { u32 count; @@ -410,6 +452,7 @@ void local_timer_interrupt(int irq, void { if (current->pid) profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(regs)); } @@ -438,7 +481,7 @@ irqreturn_t timer_interrupt(int irq, voi /* * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. rtc_mips_set_time() has to be + * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. 
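The missed-tick test in c0_timer_ack() above relies on unsigned wraparound: on a 32-bit count register, (count - expirelo) < 0x7fffffff is true exactly when count has reached or passed expirelo modulo 2^32, giving a wrap-safe "deadline passed" predicate with a half-range window. The same test in isolation, as a user-space check:

	#include <stdio.h>
	#include <stdint.h>

	/* wrap-safe: true once 'now' has reached or passed 'deadline' */
	static int passed(uint32_t now, uint32_t deadline)
	{
		return (uint32_t)(now - deadline) < 0x7fffffffu;
	}

	int main(void)
	{
		printf("%d\n", passed(10, 5));		/* 1: already past */
		printf("%d\n", passed(5, 10));		/* 0: still pending */
		printf("%d\n", passed(3, 0xfffffff0u));	/* 1: passed, across the wrap */
		return 0;
	}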
*/ if (ntp_synced() && @@ -518,6 +561,15 @@ int (*perf_irq)(struct pt_regs *regs) = EXPORT_SYMBOL(null_perf_irq); EXPORT_SYMBOL(perf_irq); +#ifdef CONFIG_HIGH_RES_TIMERS +void event_timer_handler(struct pt_regs *regs) +{ + c0_timer_ack(); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +} +#endif + asmlinkage void ll_timer_interrupt(int irq, struct pt_regs *regs) { int r2 = cpu_has_mips_r2; @@ -531,6 +583,15 @@ asmlinkage void ll_timer_interrupt(int i * performance counter interrupt was pending, so we have to run the * performance counter interrupt handler anyway. */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Run the event handler + */ + if (!r2 || (read_c0_cause() & (1 << 26))) + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs,NULL); +#endif + if (!r2 || (read_c0_cause() & (1 << 26))) if (perf_irq(regs)) goto out; @@ -563,7 +624,7 @@ asmlinkage void ll_local_timer_interrupt * b) (optional) calibrate and set the mips_hpt_frequency * (only needed if you intended to use fixed_rate_gettimeoffset * or use cpu counter as timer interrupt source) - * 2) setup xtime based on rtc_mips_get_time(). + * 2) setup xtime based on rtc_get_time(). * 3) choose a appropriate gettimeoffset routine. * 4) calculate a couple of cached variables for later usage * 5) plat_timer_setup() - @@ -578,7 +639,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_NODELAY | IRQF_DISABLED, .name = "timer", }; @@ -627,6 +688,9 @@ static unsigned int __init calibrate_hpt void __init time_init(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + u64 temp; +#endif if (board_time_init) board_time_init(); @@ -688,6 +752,12 @@ void __init time_init(void) /* Calculate cache parameters. 
*/ cycles_per_jiffy = (mips_hpt_frequency + HZ / 2) / HZ; +#ifdef CONFIG_HIGH_RES_TIMERS + hrt_cycles_per_jiffy = ( (CONFIG_CPU_SPEED * 1000000) + HZ / 2) / HZ; +#else + hrt_cycles_per_jiffy = cycles_per_jiffy; +#endif + /* sll32_usecs_per_cycle = 10^6 * 2^32 / mips_counter_freq */ do_div64_32(sll32_usecs_per_cycle, 1000000, mips_hpt_frequency / 2, @@ -776,3 +846,128 @@ unsigned long long sched_clock(void) { return (unsigned long long)jiffies*(1000000000/HZ); } + + +#ifdef CONFIG_SMP +/* + * We have to synchronize the master CPU with all the slave CPUs + */ +static atomic_t cpus_started; +static atomic_t cpus_ready; +static atomic_t cpus_count; +/* + * Master processor inits + */ +static void sync_cpus_init(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + atomic_set(&cpus_ready, v); + mb(); +} + +/* + * Called by the master processor + */ +static void sync_cpus_master(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + /* Wait here till all other CPUs are now ready */ + while (atomic_read(&cpus_count) != (num_online_cpus() -1) ) + mb(); + atomic_set(&cpus_ready, v); + mb(); +} +/* + * Called by the slave processors + */ +static void sync_cpus_slave(int v) +{ + /* Check if the master has been through this */ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); + while (atomic_read(&cpus_ready) != v) + mb(); +} +/* + * Called by the slave CPUs when done syncing the count register + * with the master processor + */ +static void sync_cpus_slave_exit(int v) +{ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); +} + +#define LOOPS 100 +static u32 c0_count[NR_CPUS]; /* Count register per CPU */ +static u32 c[NR_CPUS][LOOPS + 1]; /* Count register per CPU per loop for syncing */ + +/* + * Slave processors execute this via IPI + */ +static void sync_c0_count_slave(void *info) +{ + int cpus = 1, loop, prev_count = 0, cpu = smp_processor_id(); + unsigned long flags; + u32 diff_count; /* CPU count registers are 32-bit */ + local_irq_save(flags); + + for(loop = 0; loop <= LOOPS; loop++) { + /* Sync with the Master processor */ + sync_cpus_slave(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + sync_cpus_slave(cpus++); + diff_count = c0_count[0] - c0_count[cpu]; + diff_count += prev_count; + diff_count += read_c0_count(); + write_c0_count(diff_count); + prev_count = (prev_count >> 1) + + ((int)(c0_count[0] - c0_count[cpu]) >> 1); + } + + /* Slave processor is done syncing count register with Master */ + sync_cpus_slave_exit(cpus++); + printk("SMP: Slave processor %d done syncing count \n", cpu); + local_irq_restore(flags); +} + +/* + * Master kicks off the syncing process + */ +void sync_c0_count_master(void) +{ + int cpus = 0, loop, cpu = smp_processor_id(); + unsigned long flags; + + printk("SMP: Starting to sync the c0 count register ... \n"); + sync_cpus_init(cpus++); + + /* Kick off the slave processors to also start the syncing process */ + smp_call_function(sync_c0_count_slave, NULL, 0, 0); + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Wait for all the CPUs here */ + sync_cpus_master(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + /* Do syncing once more */ + sync_cpus_master(cpus++); + } + sync_cpus_master(cpus++); + local_irq_restore(flags); + + printk("SMP: Syncing process completed across CPUs ...
\n"); +} +#endif /* CONFIG_SMP */ Index: linux/arch/mips/kernel/traps.c =================================================================== --- linux.orig/arch/mips/kernel/traps.c +++ linux/arch/mips/kernel/traps.c @@ -274,7 +274,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { Index: linux/arch/mips/mm/init.c =================================================================== --- linux.orig/arch/mips/mm/init.c +++ linux/arch/mips/mm/init.c @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; Index: linux/arch/mips/sibyte/cfe/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/cfe/smp.c +++ linux/arch/mips/sibyte/cfe/smp.c @@ -107,4 +107,8 @@ void prom_smp_finish(void) */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } Index: linux/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/irq.c +++ linux/arch/mips/sibyte/sb1250/irq.c @@ -85,7 +85,7 @@ static struct irq_chip sb1250_irq_type = /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -262,7 +262,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, @@ -372,6 +372,10 @@ void __init arch_init_irq(void) #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the rest */ change_c0_status(ST0_IM, imask); @@ -465,6 +469,10 @@ asmlinkage void plat_irq_dispatch(struct else #endif +#ifdef CONFIG_HIGH_RES_TIMERS + if (pending & CAUSEF_IP7) + event_timer_handler(regs); +#endif if (pending & CAUSEF_IP4) sb1250_timer_interrupt(regs); Index: linux/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/smp.c +++ linux/arch/mips/sibyte/sb1250/smp.c @@ -59,7 +59,7 @@ void sb1250_smp_finish(void) { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux/arch/mips/sibyte/swarm/setup.c =================================================================== --- linux.orig/arch/mips/sibyte/swarm/setup.c +++ linux/arch/mips/sibyte/swarm/setup.c @@ -131,6 +131,12 @@ void __init plat_mem_setup(void) rtc_mips_set_time = m41t81_set_time; } +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" Index: linux/arch/powerpc/Kconfig =================================================================== --- linux.orig/arch/powerpc/Kconfig +++ linux/arch/powerpc/Kconfig @@ -26,18 +26,15 @@ config MMU bool default y -config GENERIC_HARDIRQS +config GENERIC_TIME bool default y -config IRQ_PER_CPU +config GENERIC_HARDIRQS bool default y -config 
RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM +config IRQ_PER_CPU bool default y @@ -596,6 +593,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux/arch/powerpc/boot/Makefile =================================================================== --- linux.orig/arch/powerpc/boot/Makefile +++ linux/arch/powerpc/boot/Makefile @@ -29,6 +29,14 @@ OBJCOPYFLAGS := contents,alloc,load,r OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000 OBJCOPY_MIB_ARGS := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := inffast.c inflate.c inftrees.c zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -44,7 +52,7 @@ obj-boot := $(addsuffix .o, $(basename $ BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@ Index: linux/arch/powerpc/kernel/Makefile =================================================================== --- linux.orig/arch/powerpc/kernel/Makefile +++ linux/arch/powerpc/kernel/Makefile @@ -10,10 +10,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_power4.o \ Index: linux/arch/powerpc/kernel/entry_32.S =================================================================== --- linux.orig/arch/powerpc/kernel/entry_32.S +++ linux/arch/powerpc/kernel/entry_32.S @@ -638,7 +638,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -978,3 +978,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount().
The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux/arch/powerpc/kernel/irq.c =================================================================== --- linux.orig/arch/powerpc/kernel/irq.c +++ linux/arch/powerpc/kernel/irq.c @@ -91,8 +91,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; #endif /* CONFIG_PPC64 */ Index: linux/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux/arch/powerpc/kernel/ppc_ksyms.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -189,7 +188,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); EXPORT_SYMBOL(cacheable_memcpy); Index: linux/arch/powerpc/kernel/semaphore.c =================================================================== --- linux.orig/arch/powerpc/kernel/semaphore.c +++ linux/arch/powerpc/kernel/semaphore.c @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we 
decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/powerpc/kernel/smp.c =================================================================== --- linux.orig/arch/powerpc/kernel/smp.c +++ linux/arch/powerpc/kernel/smp.c @@ -148,6 +148,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -184,7 +194,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/powerpc/kernel/time.c =================================================================== --- linux.orig/arch/powerpc/kernel/time.c +++ linux/arch/powerpc/kernel/time.c @@ -73,6 +73,9 @@ #endif #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + /* keep track of when we need to update the rtc */ time_t last_rtc_update; #ifdef CONFIG_PPC_ISERIES @@ -115,8 +118,6 @@ EXPORT_SYMBOL_GPL(rtc_lock); u64 tb_to_ns_scale; unsigned tb_to_ns_shift; -struct gettimeofday_struct do_gtod; - extern unsigned long wall_jiffies; extern struct timezone sys_tz; @@ -407,162 +408,8 @@ static __inline__ void timer_check_rtc(v } } -/* - * This version of gettimeofday has microsecond resolution. - */ -static inline void __do_gettimeofday(struct timeval *tv) -{ - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; - - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) - */ - temp_varp = do_gtod.varp; - - /* Sampling the time base must be done after loading - * do_gtod.varp in order to avoid racing with update_gtod. 
- */ - data_barrier(temp_varp); - tb_ticks = get_tb() - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -void do_gettimeofday(struct timeval *tv) -{ - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_jiffy); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000; - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; - } - __do_gettimeofday(tv); -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. - */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) -{ - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. - */ - vdso_data->tb_orig_stamp = new_tb_stamp; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) 
and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - u64 tlen, t2x; - u64 tb, xsec_old, xsec_new; - struct gettimeofday_vars *varp; - - if (__USE_RTC()) - return; - tlen = current_tick_length(); - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) - return; - if (tlen != last_tick_len) { - t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); - last_tick_len = tlen; - } else - t2x = do_gtod.varp->tb_to_xs; - new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Make sure time doesn't go backwards for userspace gettimeofday. - */ - tb = get_tb(); - varp = do_gtod.varp; - xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) - + varp->stamp_xsec; - xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; - if (xsec_new < xsec_old) - new_stamp_xsec += xsec_old - xsec_new; - - update_gtod(cur_tb, new_stamp_xsec, t2x); -} - #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -610,11 +457,7 @@ static void iSeries_tb_recal(void) tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -781,81 +624,6 @@ unsigned long long sched_clock(void) return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; - unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. - * Note that since we don't increment jiffies_64 anywhere other - * than in do_timer (since we don't have a lost tick problem), - * wall_jiffies will always be the same as jiffies, - * and therefore the (jiffies - wall_jiffies) computation - * has been removed. 
- */ - tb_delta = tb_ticks_since(tb_last_jiffy); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); static int __init get_freq(char *name, int cells, unsigned long *val) { @@ -1024,20 +792,6 @@ void __init time_init(void) xtime.tv_sec = tm; xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - - vdso_data->tb_orig_stamp = tb_last_jiffy; - vdso_data->tb_update_count = 0; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; @@ -1050,7 +804,6 @@ void __init time_init(void) set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -1195,3 +948,36 @@ void div128_by_32(u64 dividend_high, u64 dr->result_low = ((u64)y << 32) + z; } + + +/* powerpc clocksource code */ + +#include <linux/clocksource.h> +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, +}; + + +/* XXX - this should be calculated or properly externed!
*/ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return clocksource_register(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); + Index: linux/arch/powerpc/kernel/traps.c =================================================================== --- linux.orig/arch/powerpc/kernel/traps.c +++ linux/arch/powerpc/kernel/traps.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL(unregister_die_notifier); * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -164,6 +164,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux/arch/powerpc/lib/locks.c =================================================================== --- linux.orig/arch/powerpc/lib/locks.c +++ linux/arch/powerpc/lib/locks.c @@ -24,7 +24,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -79,7 +79,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux/arch/powerpc/mm/fault.c =================================================================== --- linux.orig/arch/powerpc/mm/fault.c +++ linux/arch/powerpc/mm/fault.c @@ -149,8 +149,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; Index: linux/arch/powerpc/mm/init_32.c =================================================================== --- linux.orig/arch/powerpc/mm/init_32.c +++ linux/arch/powerpc/mm/init_32.c @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/powerpc/mm/tlb_64.c =================================================================== --- linux.orig/arch/powerpc/mm/tlb_64.c +++ linux/arch/powerpc/mm/tlb_64.c @@ -37,7 +37,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, p /* This is declared as we are using the more or less generic * include/asm-powerpc/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; Index: linux/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/cell/smp.c +++ linux/arch/powerpc/platforms/cell/smp.c @@ -133,7 +133,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/smp.c +++ linux/arch/powerpc/platforms/chrp/smp.c @@ -45,7 +45,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/time.c +++ linux/arch/powerpc/platforms/chrp/time.c @@ -27,7 +27,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/iseries/setup.c +++ linux/arch/powerpc/platforms/iseries/setup.c @@ -594,12 +594,14 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/feature.c +++ linux/arch/powerpc/platforms/powermac/feature.c @@ -59,7 +59,7 @@ extern struct device_node 
*k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/nvram.c +++ linux/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); Index: linux/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/pic.c +++ linux/arch/powerpc/platforms/powermac/pic.c @@ -63,7 +63,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; Index: linux/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/setup.c +++ linux/arch/powerpc/platforms/pseries/setup.c @@ -483,7 +483,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -494,7 +495,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/smp.c +++ linux/arch/powerpc/platforms/pseries/smp.c @@ -344,7 +344,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux/arch/ppc/8260_io/enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/enet.c +++ linux/arch/ppc/8260_io/enet.c @@ -116,7 +116,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/fcc_enet.c +++ linux/arch/ppc/8260_io/fcc_enet.c @@ -376,7 +376,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/8xx_io/commproc.c =================================================================== --- linux.orig/arch/ppc/8xx_io/commproc.c +++ linux/arch/ppc/8xx_io/commproc.c @@ -356,7 +356,7 @@ 
cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... Index: linux/arch/ppc/8xx_io/enet.c =================================================================== --- linux.orig/arch/ppc/8xx_io/enet.c +++ linux/arch/ppc/8xx_io/enet.c @@ -143,7 +143,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8xx_io/fec.c =================================================================== --- linux.orig/arch/ppc/8xx_io/fec.c +++ linux/arch/ppc/8xx_io/fec.c @@ -164,7 +164,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/Kconfig =================================================================== --- linux.orig/arch/ppc/Kconfig +++ linux/arch/ppc/Kconfig @@ -12,13 +12,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_HWEIGHT bool default y @@ -955,6 +948,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux/arch/ppc/boot/Makefile =================================================================== --- linux.orig/arch/ppc/boot/Makefile +++ linux/arch/ppc/boot/Makefile @@ -14,6 +14,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux/arch/ppc/kernel/dma-mapping.c =================================================================== --- linux.orig/arch/ppc/kernel/dma-mapping.c +++ linux/arch/ppc/kernel/dma-mapping.c @@ -70,7 +70,7 @@ int map_page(unsigned long va, phys_addr * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux/arch/ppc/kernel/entry.S =================================================================== --- linux.orig/arch/ppc/kernel/entry.S +++ linux/arch/ppc/kernel/entry.S @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. 
r0,r9,_TIF_SIGPENDING beq restore_user Index: linux/arch/ppc/kernel/semaphore.c =================================================================== --- linux.orig/arch/ppc/kernel/semaphore.c +++ linux/arch/ppc/kernel/semaphore.c @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux/arch/ppc/kernel/smp.c =================================================================== --- linux.orig/arch/ppc/kernel/smp.c +++ linux/arch/ppc/kernel/smp.c @@ -137,6 +137,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -161,7 +171,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
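(Illustrative aside, not part of the patch: the counting protocol that __sem_update_count() preserves can be modelled in plain user-space C. All names below are invented for the sketch; on ppc a free semaphore holds 1, a held one 0, and sleepers drive the count negative.)

    #include <stdio.h>

    /* 1 = free, 0 = held, < 0 = held with sleepers (ppc convention) */
    static int count = 1;

    static int try_down(void)
    {
        int old = count--;
        return old > 0;    /* only a positive old count acquires */
    }

    static void up(void)
    {
        /* mirrors __sem_update_count(sem, 1): a negative count is
         * reset to 0 before the increment, as sleepers re-contend */
        count = (count < 0 ? 0 : count) + 1;
    }

    int main(void)
    {
        int a = try_down();
        int b = try_down();
        printf("acquired: %d then %d, count=%d\n", a, b, count);
        up();
        printf("after up: count=%d\n", count);
        return 0;
    }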
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/ppc/kernel/time.c =================================================================== --- linux.orig/arch/ppc/kernel/time.c +++ linux/arch/ppc/kernel/time.c @@ -65,6 +65,9 @@ #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; @@ -103,7 +106,7 @@ static inline int tb_delta(unsigned *jif } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux/arch/ppc/kernel/traps.c =================================================================== --- linux.orig/arch/ppc/kernel/traps.c +++ linux/arch/ppc/kernel/traps.c @@ -71,7 +71,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -106,6 +106,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux/arch/ppc/lib/locks.c =================================================================== --- linux.orig/arch/ppc/lib/locks.c +++ linux/arch/ppc/lib/locks.c @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
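(Illustrative aside, not part of the patch: the counter encoding described above, 0 unlocked, -1 write-locked, positive read counts, can be modelled with GCC atomics standing in for ppc's lwarx/stwcx. loops. All names here are invented for the sketch.)

    #include <stdio.h>

    static int counter;    /* the rwlock word: 0, -1, or reader count */

    static int model_read_trylock(void)
    {
        int old = __sync_fetch_and_add(&counter, 0);    /* atomic read */
        /* readers may enter unless a writer holds -1 */
        return old >= 0 &&
               __sync_bool_compare_and_swap(&counter, old, old + 1);
    }

    static int model_write_trylock(void)
    {
        /* a writer needs the fully unlocked state, exactly as the
         * patch's cmpxchg(&rw->lock, 0, -1) does */
        return __sync_bool_compare_and_swap(&counter, 0, -1);
    }

    int main(void)
    {
        printf("reader: %d\n", model_read_trylock());   /* 1: 0 -> 1 */
        printf("writer: %d\n", model_write_trylock());  /* 0: a reader is in */
        return 0;
    }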
*/ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux/arch/ppc/mm/fault.c =================================================================== --- linux.orig/arch/ppc/mm/fault.c +++ linux/arch/ppc/mm/fault.c @@ -89,7 +89,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault.
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; Index: linux/arch/ppc/mm/init.c =================================================================== --- linux.orig/arch/ppc/mm/init.c +++ linux/arch/ppc/mm/init.c @@ -55,7 +55,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/ppc/platforms/apus_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/apus_setup.c +++ linux/arch/ppc/platforms/apus_setup.c @@ -275,6 +275,7 @@ void apus_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; Index: linux/arch/ppc/platforms/ev64260.c =================================================================== --- linux.orig/arch/ppc/platforms/ev64260.c +++ linux/arch/ppc/platforms/ev64260.c @@ -550,6 +550,7 @@ ev64260_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/ppc/platforms/gemini_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/gemini_setup.c +++ linux/arch/ppc/platforms/gemini_setup.c @@ -459,6 +459,7 @@ void __init gemini_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) Index: linux/arch/ppc/platforms/hdpu.c =================================================================== --- linux.orig/arch/ppc/platforms/hdpu.c +++ linux/arch/ppc/platforms/hdpu.c @@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux/arch/ppc/platforms/powerpmc250.c =================================================================== --- linux.orig/arch/ppc/platforms/powerpmc250.c +++ linux/arch/ppc/platforms/powerpmc250.c @@ -163,6 +163,7 @@ powerpmc250_calibrate_decr(void) tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void Index: linux/arch/ppc/platforms/prep_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/prep_setup.c +++ linux/arch/ppc/platforms/prep_setup.c @@ -940,6 +940,7 @@ prep_calibrate_decr(void) (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: linux/arch/ppc/platforms/prpmc750.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc750.c +++ linux/arch/ppc/platforms/prpmc750.c @@ -268,6 +268,7 @@ static void __init prpmc750_calibrate_de tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = 
mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) Index: linux/arch/ppc/platforms/prpmc800.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc800.c +++ linux/arch/ppc/platforms/prpmc800.c @@ -327,6 +327,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -367,6 +368,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) Index: linux/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux.orig/arch/ppc/platforms/sbc82xx.c +++ linux/arch/ppc/platforms/sbc82xx.c @@ -65,7 +65,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux/arch/ppc/platforms/spruce.c =================================================================== --- linux.orig/arch/ppc/platforms/spruce.c +++ linux/arch/ppc/platforms/spruce.c @@ -147,6 +147,7 @@ spruce_calibrate_decr(void) freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int Index: linux/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux.orig/arch/ppc/syslib/cpm2_common.c +++ linux/arch/ppc/syslib/cpm2_common.c @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
*/ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux/arch/ppc/syslib/ibm44x_common.c =================================================================== --- linux.orig/arch/ppc/syslib/ibm44x_common.c +++ linux/arch/ppc/syslib/ibm44x_common.c @@ -63,6 +63,7 @@ void __init ibm44x_calibrate_decr(unsign { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/m8260_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8260_setup.c +++ linux/arch/ppc/syslib/m8260_setup.c @@ -79,6 +79,7 @@ m8260_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that Index: linux/arch/ppc/syslib/m8xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8xx_setup.c +++ linux/arch/ppc/syslib/m8xx_setup.c @@ -218,6 +218,7 @@ void __init m8xx_calibrate_decr(void) printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. This used * to be done elsewhere, but other changes caused it to get Index: linux/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/mpc52xx_setup.c +++ linux/arch/ppc/syslib/mpc52xx_setup.c @@ -215,6 +215,7 @@ mpc52xx_calibrate_decr(void) tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } Index: linux/arch/ppc/syslib/ocp.c =================================================================== --- linux.orig/arch/ppc/syslib/ocp.c +++ linux/arch/ppc/syslib/ocp.c @@ -44,11 +44,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: linux/arch/ppc/syslib/open_pic.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic.c +++ linux/arch/ppc/syslib/open_pic.c @@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux/arch/ppc/syslib/open_pic2.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic2.c +++ linux/arch/ppc/syslib/open_pic2.c @@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc4xx_setup.c +++ linux/arch/ppc/syslib/ppc4xx_setup.c @@ -172,6 +172,7 @@ ppc4xx_calibrate_decr(void) freq = bip->bi_tbfreq; tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. 
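(Side note on the recurring hunks above, for illustration only: every calibrate_decr() variant now also fills in cpu_khz from the same frequency it already used for tb_ticks_per_jiffy, so the -rt latency tracer can convert timebase deltas into wall time. The sketch below shows the conversion such a tracer relies on; the function name is invented.)

    #include <stdio.h>

    static unsigned long cpu_khz = 33000;    /* e.g. a 33 MHz timebase */

    /* cpu_khz ticks elapse per millisecond, so scale by 1000/cpu_khz */
    static unsigned long ticks_to_usecs(unsigned long long ticks)
    {
        return (unsigned long)(ticks * 1000ULL / cpu_khz);
    }

    int main(void)
    {
        /* 33,000 ticks at 33 MHz is exactly one millisecond */
        printf("%lu us\n", ticks_to_usecs(33000));
        return 0;
    }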
Index: linux/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc85xx_setup.c +++ linux/arch/ppc/syslib/ppc85xx_setup.c @@ -57,6 +57,7 @@ mpc85xx_calibrate_decr(void) divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/todc_time.c =================================================================== --- linux.orig/arch/ppc/syslib/todc_time.c +++ linux/arch/ppc/syslib/todc_time.c @@ -506,6 +506,7 @@ todc_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/sparc64/Kconfig =================================================================== --- linux.orig/arch/sparc64/Kconfig +++ linux/arch/sparc64/Kconfig @@ -26,7 +26,7 @@ config MMU bool default y -config TIME_INTERPOLATION +config GENERIC_TIME bool default y Index: linux/arch/sparc64/defconfig =================================================================== --- linux.orig/arch/sparc64/defconfig +++ linux/arch/sparc64/defconfig @@ -7,7 +7,7 @@ CONFIG_SPARC=y CONFIG_SPARC64=y CONFIG_64BIT=y CONFIG_MMU=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_SPARC64_PAGE_SIZE_8KB=y # CONFIG_SPARC64_PAGE_SIZE_64KB is not set Index: linux/arch/sparc64/kernel/time.c =================================================================== --- linux.orig/arch/sparc64/kernel/time.c +++ linux/arch/sparc64/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -621,7 +622,7 @@ static void __init set_system_time(void) if (!mregs && !dregs) { prom_printf("Something wrong, clock regs not mapped yet.\n"); prom_halt(); - } + } if (mregs) { spin_lock_irq(&mostek_lock); @@ -821,7 +822,7 @@ static int __devinit clock_probe(struct } set_system_time(); - + local_irq_restore(flags); return 0; @@ -976,22 +977,33 @@ static struct notifier_block sparc64_cpu #endif /* CONFIG_CPU_FREQ */ -static struct time_interpolator sparc64_cpu_interpolator = { - .source = TIME_SOURCE_CPU, - .shift = 16, - .mask = 0xffffffffffffffffLL +static cycle_t read_itc(void) +{ + return (cycle_t)get_cycles(); +} + +static struct clocksource clocksource_sparc64_itc = { + .name = "sparc64_itc", + .rating = 300, + .read = read_itc, + .mask = 0xffffffffffffffffLL, + .mult = 0, /* to be calculated */ + .shift = 16, + .is_continuous = 1, }; + /* The quotient formula is taken from the IA64 port. */ #define SPARC64_NSEC_PER_CYC_SHIFT 30UL void __init time_init(void) { unsigned long clock = sparc64_init_timers(); - sparc64_cpu_interpolator.frequency = clock; - register_time_interpolator(&sparc64_cpu_interpolator); + clocksource_sparc64_itc.mult = clocksource_hz2mult(clock, + clocksource_sparc64_itc.shift); + clocksource_register(&clocksource_sparc64_itc); - /* Now that the interpolator is registered, it is + /* Now that the clocksource is registered, it is * safe to start the timer ticking. */ sparc64_start_timers(); @@ -1026,11 +1038,11 @@ static int set_rtc_mmss(unsigned long no unsigned long flags; u8 tmp; - /* + /* * Not having a register set can lead to trouble. * Also starfire doesn't have a tod clock.
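(Illustration of the mult/shift arithmetic behind the clocksource conversion above, not part of the patch: a clocksource reports nanoseconds as ns = (cycles * mult) >> shift, and clocksource_hz2mult() picks mult so that this holds for a counter of the given frequency. The helper below is a simplified stand-in for the kernel one.)

    #include <stdio.h>
    #include <stdint.h>

    /* simplified clocksource_hz2mult(): mult = (1e9 << shift) / hz */
    static uint32_t hz2mult(uint32_t hz, uint32_t shift)
    {
        uint64_t tmp = 1000000000ULL << shift;
        tmp += hz / 2;               /* round to nearest */
        return (uint32_t)(tmp / hz);
    }

    int main(void)
    {
        uint32_t mult = hz2mult(1000000000, 16);   /* a 1 GHz %tick */
        uint64_t cycles = 2000000000ULL;           /* two seconds worth */
        printf("mult=%u ns=%llu\n", (unsigned)mult,
               (unsigned long long)((cycles * mult) >> 16));
        return 0;    /* prints ns=2000000000 */
    }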
*/ - if (!mregs && !dregs) + if (!mregs && !dregs) return -1; if (mregs) { Index: linux/arch/v850/Kconfig =================================================================== --- linux.orig/arch/v850/Kconfig +++ linux/arch/v850/Kconfig @@ -34,6 +34,10 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y Index: linux/arch/v850/kernel/time.c =================================================================== --- linux.orig/arch/v850/kernel/time.c +++ linux/arch/v850/kernel/time.c @@ -99,81 +99,6 @@ static irqreturn_t timer_interrupt (int return IRQ_HANDLED; } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday (struct timeval *tv) -{ -#if 0 /* DAVIDM later if possible */ - extern volatile unsigned long lost_ticks; - unsigned long lost; -#endif - unsigned long flags; - unsigned long usec, sec; - unsigned long seq; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - -#if 0 - usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; -#else - usec = 0; -#endif -#if 0 /* DAVIDM later if possible */ - lost = lost_ticks; - if (lost) - usec += lost * (1000000/HZ); -#endif - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq (&xtime_lock); - - /* This is revolting. We need to set the xtime.tv_nsec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ -#if 0 - tv->tv_nsec -= mach_gettimeoffset() * 1000; -#endif - - while (tv->tv_nsec < 0) { - tv->tv_nsec += NSEC_PER_SEC; - tv->tv_sec--; - } - - xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = tv->tv_nsec; - - ntp_clear(); - - write_sequnlock_irq (&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int timer_dev_id; static struct irqaction timer_irqaction = { timer_interrupt, Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config LOCKDEP_SUPPORT bool default y @@ -46,13 +54,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y @@ -289,6 +290,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. 
+config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA @@ -659,3 +668,6 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/time/Kconfig" + Index: linux/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux.orig/arch/x86_64/ia32/ia32entry.S +++ linux/arch/x86_64/ia32/ia32entry.S @@ -119,7 +119,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -227,7 +229,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -320,8 +324,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -390,7 +396,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -713,4 +719,7 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages +#ifdef CONFIG_LATENCY_TRACE +.globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux/arch/x86_64/kernel/Makefile =================================================================== --- linux.orig/arch/x86_64/kernel/Makefile +++ linux/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o trap ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -36,9 +37,9 @@ #include #include #include +#include int apic_verbosity; -int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; @@ -52,6 +53,25 @@ static cpumask_t timer_interrupt_broadca /* Using APIC to generate smp_local_timer_interrupt? 
*/ int using_apic_timer __read_mostly = 0; + +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); + static void apic_pm_activate(void); void enable_NMI_through_LVT0 (void * dummy) @@ -527,8 +547,7 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; @@ -696,13 +715,16 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) lvtt_value |= APIC_LVT_MASKED; @@ -717,48 +739,34 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -static void setup_APIC_timer(unsigned int clocks) +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write(APIC_TMICT, delta); +} + +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); - - /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. */ - if (vxtime.mode == VXTIME_PMTMR && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; - } + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); local_irq_restore(flags); } + +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + + register_local_clockevent(levt); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. 
Unfortunately we cannot use jiffies and the timer irq @@ -778,12 +786,13 @@ static int __init calibrate_APIC_clock(v { int apic, apic_start, tsc, tsc_start; int result; + u64 wallclock_nsecs; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -791,6 +800,8 @@ static int __init calibrate_APIC_clock(v pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); result = (apic_start - apic) * 1000L / 5; + printk("using pmtimer for lapic calibration\n"); + wallclock_nsecs = 5000000; } else #endif { @@ -804,6 +815,8 @@ static int __init calibrate_APIC_clock(v result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + wallclock_nsecs = ((u64)tsc - (u64)tsc_start) * 1000000 / (u64)cpu_khz; + } printk("result %d\n", result); @@ -811,11 +824,22 @@ static int __init calibrate_APIC_clock(v printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); + + + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(apic_start - apic, wallclock_nsecs, 32); + + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -832,7 +856,7 @@ void __init setup_boot_APIC_clock (void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -840,7 +864,7 @@ void __init setup_boot_APIC_clock (void) void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -887,6 +911,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { disable_APIC_timer(); cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -945,8 +976,6 @@ void smp_local_timer_interrupt(struct pt #ifdef CONFIG_SMP update_process_times(user_mode(regs)); #endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(regs); /* * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). @@ -969,6 +998,8 @@ void smp_local_timer_interrupt(struct pt */ void smp_apic_timer_interrupt(struct pt_regs *regs) { + int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. 
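(Worked example of the clockevent scaling above, for illustration only: div_sc() computes mult = (ticks << shift) / nsecs, after which a next-event delta in nanoseconds becomes timer ticks via ticks = (ns * mult) >> shift. The numbers below are invented.)

    #include <stdio.h>
    #include <stdint.h>

    /* what div_sc(ticks, nsecs, 32) boils down to */
    static uint32_t div_sc32(uint64_t ticks, uint64_t nsecs)
    {
        return (uint32_t)((ticks << 32) / nsecs);
    }

    int main(void)
    {
        /* suppose calibration saw 125,000 bus ticks in 5,000,000 ns */
        uint32_t mult = div_sc32(125000, 5000000);    /* a 25 MHz bus */
        uint64_t delta_ns = 1000000;                  /* event 1 ms out */
        uint64_t ticks = (delta_ns * (uint64_t)mult) >> 32;
        printf("mult=%u ticks=%llu\n", (unsigned)mult,
               (unsigned long long)ticks);            /* ~25000 ticks */
        return 0;
    }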
*/ @@ -986,7 +1017,7 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(regs); + evt->event_handler(regs); irq_exit(); } @@ -1161,26 +1192,11 @@ static __init int setup_noapictimer(char return 1; } -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - nohpet = 1; - return 1; -} -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; - return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); - static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return setup_apicmaintimer(NULL); + return 1; } __setup("apicpmtimer", setup_apicpmtimer); Index: linux/arch/x86_64/kernel/early_printk.c =================================================================== --- linux.orig/arch/x86_64/kernel/early_printk.c +++ linux/arch/x86_64/kernel/early_printk.c @@ -203,7 +203,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux/arch/x86_64/kernel/entry.S =================================================================== --- linux.orig/arch/x86_64/kernel/entry.S +++ linux/arch/x86_64/kernel/entry.S @@ -45,6 +45,47 @@ .code64 +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -221,7 +262,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -255,8 +298,8 @@ sysret_check: /* edx: work, edi: workmask */ sysret_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -279,7 +322,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -303,7 +346,9 @@ tracesys: cmpq $__NR_syscall_max,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call @@ -349,8 +394,8 @@ int_with_check: /* First do a reschedule test. 
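(Background sketch for the mcount hook above, not part of the patch: building with -pg makes gcc call mcount() on every function entry, which is how the tracer sees call flow. User space can observe the same mechanism with gcc's -finstrument-functions, a close cousin of -pg; everything below is a standalone demo.)

    /* build with:  gcc -finstrument-functions demo.c -o demo */
    #include <stdio.h>

    __attribute__((no_instrument_function))
    void __cyg_profile_func_enter(void *func, void *caller)
    {
        fprintf(stderr, "enter %p from %p\n", func, caller);
    }

    __attribute__((no_instrument_function))
    void __cyg_profile_func_exit(void *func, void *caller)
    {
        fprintf(stderr, "exit  %p\n", func);
    }

    static int work(int x) { return x * 2; }

    int main(void)
    {
        return work(21) - 42;    /* hooks fire for main() and work() */
    }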
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -387,7 +432,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -585,8 +630,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -612,7 +657,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check Index: linux/arch/x86_64/kernel/head64.c =================================================================== --- linux.orig/arch/x86_64/kernel/head64.c +++ linux/arch/x86_64/kernel/head64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,7 @@ static void __init setup_boot_cpu_data(v boot_cpu_data.x86_mask = eax & 0xf; } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { char *s; int i; @@ -99,6 +100,7 @@ void __init x86_64_start_kernel(char * r cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); + copy_bootdata(real_mode_data); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); @@ -120,5 +122,6 @@ void __init x86_64_start_kernel(char * r panic("Kernel too big for kernel mapping\n"); setup_boot_cpu_data(); + start_kernel(); } Index: linux/arch/x86_64/kernel/hpet.c =================================================================== --- /dev/null +++ linux/arch/x86_64/kernel/hpet.c @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int nohpet __initdata = 0; + +unsigned long hpet_address; +static unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ +int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ + +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +/* + * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing + * it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 + +unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + local_irq_disable(); + + hpet_start = hpet_readl(HPET_COUNTER); + rdtscl(tsc_start); + + do { + local_irq_disable(); + hpet_now = hpet_readl(HPET_COUNTER); + tsc_now = get_cycles_sync(); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + + + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!hpet_address) + return 0; + + memset(&hd, 0, sizeof (hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 is used by platform. 
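(Checking the calibration formula above with concrete numbers, for illustration only: hpet_period is femtoseconds per HPET tick, so (hpet_ticks * hpet_period / 1000) is the elapsed time in picoseconds, and the quotient comes out in kHz, which is what hpet_calibrate_tsc() returns.)

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t hpet_period = 69841279;    /* fs/tick, a 14.318 MHz HPET */
        uint64_t hpet_ticks  = 1431818;     /* ~100 ms of HPET counting */
        uint64_t tsc_ticks   = 200000000;   /* what a 2 GHz TSC would show */

        /* same expression as hpet_calibrate_tsc() */
        uint64_t khz = tsc_ticks * 1000000000ULL /
                       (hpet_ticks * hpet_period / 1000);
        printf("tsc ~= %llu kHz\n", (unsigned long long)khz);
        return 0;    /* ~2000000 kHz, i.e. 2 GHz */
    }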
+ */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +static int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + +/* + * Stop the timers and reset the main counter. + */ + + cfg = hpet_readl(HPET_CFG); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ + cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +int hpet_arch_init(void) +{ + unsigned int id; + + if (!hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; + + hpet_use_timer = (id & HPET_ID_LEGSUP); + + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_stop(void) +{ + return hpet_timer_stop_set_go(0); +} + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher.
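(Worked example of the hpet_tick computation in hpet_arch_init() above, illustration only; HZ=250 is an assumed configuration.)

    #include <stdio.h>

    #define FSEC_PER_SEC 1000000000000000ULL
    #define HZ 250

    int main(void)
    {
        unsigned long long hpet_period = 69841279;   /* fs per HPET tick */
        unsigned long long fsec_per_tick = FSEC_PER_SEC / HZ;
        /* same rounding as the patch: nearest tick count */
        unsigned long long hpet_tick =
            (fsec_per_tick + hpet_period / 2) / hpet_period;
        printf("hpet_tick = %llu\n", hpet_tick);   /* ~57273 per 4 ms */
        return 0;
    }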
+ */ +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ + +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + +/* + * Timer 1 for RTC, we do not use periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason being, to set up a periodic interrupt in HPET, we need to + * stop the main counter. And if we do that every time someone disables/enables + * RTC, we will have an adverse effect on main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. + * + * hpet_rtc_timer_init() is called for the first time and during subsequent + * interrupts reinit happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set the counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + local_irq_restore(flags); + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt; + + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than current count.*/ + cnt = hpet_t1_cmp; + cnt += hpet_tick*HZ/hpet_rtc_int_freq; + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; +} + +/* + * The functions below are called from rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1.
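(The comparator stepping used by the reinit path above, with illustrative numbers: hpet_tick*HZ is roughly the HPET frequency in ticks per second, so dividing by the requested RTC rate gives the ticks per emulated interrupt.)

    #include <stdio.h>

    int main(void)
    {
        unsigned long hpet_tick = 57273;   /* HPET ticks per tick, HZ=250 */
        unsigned long hz = 250, rtc_freq = 64;
        /* the advance applied to HPET_T1_CMP each interrupt */
        unsigned long step = hpet_tick * hz / rtc_freq;
        printf("advance comparator by %lu ticks (~1/64 s)\n", step);
        return 0;
    }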
+ */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id, regs); + } + return IRQ_HANDLED; +} +#endif + +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femtoseconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift
= mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux/arch/x86_64/kernel/i8259.c =================================================================== --- linux.orig/arch/x86_64/kernel/i8259.c +++ linux/arch/x86_64/kernel/i8259.c @@ -43,17 +43,10 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_15_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x20-0x2f) */ -BUILD_16_IRQS(0x0) #ifdef CONFIG_X86_LOCAL_APIC /* @@ -66,19 +59,14 @@ BUILD_16_IRQS(0x0) * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) - -#ifdef CONFIG_PCI_MSI - BUILD_15_IRQS(0xe) -#endif +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) #endif #undef BUILD_16_IRQS -#undef BUILD_15_IRQS #undef BI @@ -91,26 +79,11 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_15(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e) - void (*interrupt[NR_IRQS])(void) = { - IRQLIST_16(0x0), - -#ifdef CONFIG_X86_IO_APIC - IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd) - -#ifdef CONFIG_PCI_MSI - , IRQLIST_15(0xe) -#endif - -#endif + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) }; #undef IRQ @@ -126,46 +99,21 @@ void (*interrupt[NR_IRQS])(void) = { * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (irq > 256) { - char var; - printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, task_thread_info(current)); - - BUG(); - } - - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -static unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ +DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This 
contains the irq mask for both 8259A irq controllers, */ @@ -234,7 +182,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<= NR_IRQS) - break; if (vector != IA32_SYSCALL_VECTOR) set_intr_gate(vector, interrupt[i]); } @@ -557,7 +525,7 @@ void __init init_IRQ(void) * IRQ0 must be given a fixed assignment and initialized, * because it's used before the IO-APIC is set up. */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + __get_cpu_var(vector_irq)[FIRST_DEVICE_VECTOR] = 0; /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper Index: linux/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/io_apic.c +++ linux/arch/x86_64/kernel/io_apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,9 @@ #include #include #include +#include + +static int assign_irq_vector(int irq, cpumask_t mask); #define __apicdebuginit __init @@ -55,8 +59,8 @@ int timer_over_8254 __initdata = 0; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -81,14 +85,6 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - #define __DO_ACTION(R, ACTION, FINAL) \ \ { \ @@ -104,6 +100,9 @@ int vector_irq[NR_VECTORS] __read_mostly reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -112,11 +111,35 @@ int vector_irq[NR_VECTORS] __read_mostly } #ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + for (;;) { + unsigned int reg; + apic = entry->apic; + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~0x000000ff; + reg |= vector; + io_apic_modify(apic, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; unsigned int dest; cpumask_t tmp; + int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -124,7 +147,13 @@ static void set_ioapic_affinity_irq(unsi cpus_and(mask, tmp, CPU_MASK_ALL); - dest = cpu_mask_to_apicid(mask); + vector = assign_irq_vector(irq, mask); + if (vector < 0) + return; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. 
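(Encoding note for the affinity path above, illustration only: assign_irq_vector() returns the vector in the low 8 bits with the owning CPU packed above it, which is why the caller extracts vector >> 8 for the destination mask and vector & 0xff for the RTE. The helpers below just restate that convention.)

    #include <stdio.h>

    /* low 8 bits = vector, upper bits = cpu (as in __assign_irq_vector) */
    static int pack(int cpu, int vector) { return (cpu << 8) | vector; }
    static int vec_of(int v) { return v & 0xff; }
    static int cpu_of(int v) { return v >> 8; }

    int main(void)
    {
        int v = pack(3, 0x31);
        printf("cpu=%d vector=0x%x\n", cpu_of(v), vec_of(v));
        return 0;    /* cpu=3 vector=0x31 */
    }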
@@ -132,14 +161,12 @@ static void set_ioapic_affinity_irq(unsi dest = SET_APIC_LOGICAL_ID(dest); spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - set_irq_info(irq, mask); + __target_IO_APIC_irq(irq, dest, vector & 0xff); + set_native_irq_info(irq, mask); spin_unlock_irqrestore(&ioapic_lock, flags); } #endif -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -170,10 +197,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -695,64 +720,6 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } -static int next_irq = 16; - -/* - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number - * from ACPI, which can reach 800 in large boxen. - * - * Compact the sparse GSI space into a sequential IRQ series and reuse - * vectors if possible. - */ -int gsi_irq_sharing(int gsi) -{ - int i, tries, vector; - - BUG_ON(gsi >= NR_IRQ_VECTORS); - - if (platform_legacy_irq(gsi)) - return gsi; - - if (gsi_2_irq[gsi] != 0xFF) - return (int)gsi_2_irq[gsi]; - - tries = NR_IRQS; - try_again: - vector = assign_irq_vector(gsi); - - /* - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous - * use of vector and if found, return that IRQ. However, we never want - * to share legacy IRQs, which usually have a different trigger mode - * than PCI. - */ - for (i = 0; i < NR_IRQS; i++) - if (IO_APIC_VECTOR(i) == vector) - break; - if (platform_legacy_irq(i)) { - if (--tries >= 0) { - IO_APIC_VECTOR(i) = 0; - goto try_again; - } - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); - } - if (i < NR_IRQS) { - gsi_2_irq[gsi] = i; - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; - } - - i = next_irq++; - BUG_ON(i >= NR_IRQS); - gsi_2_irq[gsi] = i; - IO_APIC_VECTOR(i) = vector; - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; -} - static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -782,7 +749,6 @@ static int pin_2_irq(int idx, int apic, while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - irq = gsi_irq_sharing(irq); break; } default: @@ -830,46 +796,83 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +unsigned int irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_EXTERNAL_VECTOR, 0 }; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq, cpumask_t mask) { - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - unsigned long flags; - int vector; - - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); - return IO_APIC_VECTOR(irq); - } + /* + * NOTE! 
The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static struct { + int vector; + int offset; + } pos[NR_CPUS] = { [ 0 ... NR_CPUS - 1] = {FIRST_DEVICE_VECTOR, 0} }; + int old_vector = -1; + int cpu; + + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + + if (IO_APIC_VECTOR(irq) > 0) + old_vector = IO_APIC_VECTOR(irq); + if ((old_vector > 0) && cpu_isset(old_vector >> 8, mask)) { + return old_vector; + } + + for_each_cpu_mask(cpu, mask) { + int vector, offset; + vector = pos[cpu].vector; + offset = pos[cpu].offset; next: - current_vector += 8; - if (current_vector == IA32_SYSCALL_VECTOR) - goto next; - - if (current_vector >= FIRST_SYSTEM_VECTOR) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - current_vector = FIRST_DEVICE_VECTOR + offset; + vector += 8; + if (vector >= FIRST_SYSTEM_VECTOR) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(pos[cpu].vector == vector)) + continue; + if (vector == IA32_SYSCALL_VECTOR) + goto next; + if (per_cpu(vector_irq, cpu)[vector] != -1) + goto next; + /* Found one! */ + pos[cpu].vector = vector; + pos[cpu].offset = offset; + if (old_vector >= 0) { + int old_cpu = old_vector >> 8; + old_vector &= 0xff; + per_cpu(vector_irq, old_cpu)[old_vector] = -1; + } + per_cpu(vector_irq, cpu)[vector] = irq; + vector |= cpu << 8; + IO_APIC_VECTOR(irq) = vector; + return vector; } + return -ENOSPC; +} - vector = current_vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int vector; + unsigned long flags; + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return vector; } extern void (*interrupt[NR_IRQS])(void); -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; + +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -877,16 +880,16 @@ static struct hw_interrupt_type ioapic_e static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + trigger == IOAPIC_LEVEL) { +#ifdef CONFIG_PREEMPT_HARDIRQS + set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq); +#else + set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq); +#endif + } else { + set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq); + } } static void __init setup_IO_APIC_irqs(void) @@ -936,8 +939,15 @@ static void __init setup_IO_APIC_irqs(vo continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + continue; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.vector = vector & 0xff; ioapic_register_intr(irq, vector, IOAPIC_AUTO); if (!apic && (irq < 16)) @@ -987,7 +997,7 @@ static void __init setup_ExtINT_IRQ0_pin * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].chip = &ioapic_edge_type; + set_irq_chip_and_handler(0, &ioapic_chip, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1106,17 +1116,12 @@ void __apicdebuginit print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1502,7 +1507,7 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1519,107 +1524,16 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
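The rewritten ioapic_register_intr() above no longer installs per-chip ->startup/->ack/->end methods; it picks one generic flow handler per trigger type: handle_edge_irq for edge pins, handle_fasteoi_irq for level pins, or handle_level_irq when CONFIG_PREEMPT_HARDIRQS forces handlers into threads. A compilable sketch of just that selection step (the handler bodies are stand-ins, not the kernel implementations):

        #include <stdio.h>

        /* illustrative stand-ins for the generic flow handlers */
        static void handle_edge_irq(unsigned int irq)    { printf("edge: ack early, handle IRQ %u\n", irq); }
        static void handle_fasteoi_irq(unsigned int irq) { printf("fasteoi: handle IRQ %u, one EOI at the end\n", irq); }
        static void handle_level_irq(unsigned int irq)   { printf("level: mask+ack, handle IRQ %u, unmask\n", irq); }

        typedef void (*flow_handler_t)(unsigned int irq);

        /* mirrors the choice made in ioapic_register_intr() */
        static flow_handler_t pick_flow_handler(int level_triggered, int forced_threading)
        {
                if (level_triggered)
                        return forced_threading ? handle_level_irq : handle_fasteoi_irq;
                return handle_edge_irq;
        }

        int main(void)
        {
                pick_flow_handler(1, 0)(19);    /* level-triggered PCI pin */
                pick_flow_handler(0, 0)(4);     /* edge-triggered ISA pin */
                return 0;
        }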
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - move_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) +static int ioapic_retrigger_irq(unsigned int irq) { - int irq = vector_to_irq(vector); + cpumask_t mask; + unsigned vector; - return startup_edge_ioapic_irq(irq); -} + vector = irq_vector[irq]; + cpus_clear(mask); + cpu_set(vector >> 8, mask); -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif // CONFIG_SMP -#endif // CONFIG_PCI_MSI - -static int ioapic_retrigger(unsigned int irq) -{ - send_IPI_self(IO_APIC_VECTOR(irq)); + send_IPI_mask(mask, vector & 0xff); return 1; } @@ -1633,32 +1547,47 @@ static int ioapic_retrigger(unsigned int * races. */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, +static void ack_apic_edge(unsigned int irq) +{ + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - .retrigger = ioapic_retrigger, -}; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly.
+ */ + ack_APIC_irq(); + + /* Now we can move and re-enable the irq */ + move_masked_irq(irq); + if (unlikely(do_unmask_irq)) + unmask_IO_APIC_irq(irq); +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -1678,11 +1607,6 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -1693,7 +1617,7 @@ static inline void init_IO_APIC_traps(vo make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } @@ -1812,8 +1736,6 @@ static inline void unlock_ExtINT_logic(v spin_unlock_irqrestore(&ioapic_lock, flags); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1831,8 +1753,7 @@ static inline void check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + vector = assign_irq_vector(0, TARGET_CPUS); /* * Subtle, code in do_timer_interrupt() expects an AEOI @@ -1851,9 +1772,6 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2069,6 +1987,124 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +/* + * Dynamic irq allocation and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq; + int new; + int vector = 0; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new, TARGET_CPUS); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + /* For now this code always uses physical delivery + * mode. + */ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector >= 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +static void msi_msg_teardown(unsigned int irq) +{ + return; +} + +static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, mask); + if (vector > 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(dest); + } +} + +struct msi_ops arch_msi_ops = { + .needs_64bit_address = 0, + .setup = msi_msg_setup, + .teardown = msi_msg_teardown, + .target = msi_msg_set_affinity, +}; + +#endif + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -2107,6 +2143,8 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int vector; + cpumask_t mask; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -2115,6 +2153,20 @@ int io_apic_set_pci_routing (int ioapic, } /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + return vector; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + + /* * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. * Note that we mask (disable) IRQs now -- these get enabled when the * corresponding device driver registers for this IRQ. @@ -2124,19 +2176,11 @@ int io_apic_set_pci_routing (int ioapic, entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ - - irq = gsi_irq_sharing(irq); - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); + entry.vector = vector & 0xff; apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", ioapic, @@ -2151,7 +2195,7 @@ int io_apic_set_pci_routing (int ioapic, spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; Index: linux/arch/x86_64/kernel/irq.c =================================================================== --- linux.orig/arch/x86_64/kernel/irq.c +++ linux/arch/x86_64/kernel/irq.c @@ -79,7 +79,8 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -116,7 +117,18 @@ skip: asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { /* high bit used in ret_from_ code */ - unsigned irq = ~regs->orig_rax; + unsigned vector = ~regs->orig_rax; + unsigned irq; + + exit_idle(); + irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; + +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); if (unlikely(irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -124,12 +136,24 @@ asmlinkage unsigned int do_IRQ(struct pt BUG(); } - exit_idle(); - irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif - __do_IRQ(irq, regs); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif + + generic_handle_irq(irq, regs); irq_exit(); return 1; Index: linux/arch/x86_64/kernel/mpparse.c =================================================================== --- linux.orig/arch/x86_64/kernel/mpparse.c +++ linux/arch/x86_64/kernel/mpparse.c @@ -909,20 +909,11 @@ void __init mp_config_acpi_legacy_irqs ( return; } -#define MAX_GSI_NUM 4096 - int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, to the IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -955,42 +946,11 @@ int mp_register_gsi(u32 gsi, int trigger if ((1< 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_fadt.sci_int) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); Index: linux/arch/x86_64/kernel/nmi.c =================================================================== --- linux.orig/arch/x86_64/kernel/nmi.c +++ linux/arch/x86_64/kernel/nmi.c @@ -37,7 +37,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -127,7 +127,9 @@ void __cpuinit nmi_watchdog_default(void static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -526,12 +528,42 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -549,6 +581,11 @@ void __kprobes nmi_watchdog_tick(struct */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); Index: linux/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux.orig/arch/x86_64/kernel/pmtimer.c +++ linux/arch/x86_64/kernel/pmtimer.c @@ -24,15 +24,6 @@ #include #include -/* The I/O port the PMTMR resides at. 
- * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles) return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us) } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; Index: linux/arch/x86_64/kernel/process.c =================================================================== --- linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -113,11 +113,15 @@ static void default_idle(void) current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; @@ -131,6 +135,14 @@ static void default_idle(void) static void poll_idle (void) { local_irq_enable(); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } asm volatile( "2:" @@ -206,7 +218,9 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -218,14 +232,16 @@ void cpu_idle (void) idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + stop_critical_timing(); enter_idle(); idle(); __exit_idle(); } - - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +256,16 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void 
*)&current_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() && !need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -346,13 +365,14 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -116,7 +116,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -185,7 +185,7 @@ void __cpuinit check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); Index: linux/arch/x86_64/kernel/signal.c =================================================================== --- linux.orig/arch/x86_64/kernel/signal.c +++ linux/arch/x86_64/kernel/signal.c @@ -431,6 +431,13 @@ int do_signal(struct pt_regs *regs, sigs siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/x86_64/kernel/smp.c =================================================================== --- linux.orig/arch/x86_64/kernel/smp.c +++ linux/arch/x86_64/kernel/smp.c @@ -57,7 +57,7 @@ union smp_flush_state { struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -296,10 +296,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux/arch/x86_64/kernel/smpboot.c =================================================================== --- linux.orig/arch/x86_64/kernel/smpboot.c +++ linux/arch/x86_64/kernel/smpboot.c @@ -204,7 +204,7 @@ static void __cpuinit smp_store_cpu_info latency and low latency is the primary objective here.
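The idle-path changes above (default_idle, poll_idle, mwait_idle) all share one shape: check need_resched() and the -rt-specific need_resched_delayed(), try to stop the periodic tick via hrtimer_stop_sched_tick(), and halt only when that succeeds. A condensed sketch of that loop, with every kernel call stubbed out and the irq-disable/re-check dance omitted:

        /* stub predicates standing in for the kernel APIs; shape only */
        static int need_resched(void)                { return 1; /* stub */ }
        static int need_resched_delayed(void)        { return 0; /* stub: -rt delayed wakeups */ }
        static int hrtimer_stop_sched_tick(void)     { return 0; /* 0: tick stopped, halt is safe */ }
        static void hrtimer_restart_sched_tick(void) { }
        static void safe_halt(void)                  { /* sti; hlt on real hardware */ }

        /* condensed shape of the modified default_idle() */
        static void idle_loop(void)
        {
                while (!need_resched() && !need_resched_delayed()) {
                        if (!hrtimer_stop_sched_tick())
                                safe_halt();    /* sleep until the next interrupt */
                        hrtimer_restart_sched_tick();
                }
        }

        int main(void) { idle_loop(); return 0; }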
-AK */ #define no_cpu_relax() barrier() -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock); static volatile __cpuinitdata unsigned long go[SLAVE + 1]; static int notscsync __cpuinitdata; @@ -530,7 +530,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -39,149 +39,29 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; +extern struct clock_event pit_clockevent; -static char *time_init_gtod(void); DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); - -int nohpet __initdata = 0; -static int notsc __initdata = 0; +DEFINE_RAW_SPINLOCK(i8253_lock); #define USEC_PER_TICK (USEC_PER_SEC / HZ) #define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. 
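The do_gettimeoffset_tsc() path being removed here is fixed-point interpolation: vxtime.tsc_quot is precomputed as (USEC_PER_MSEC << US_SCALE) / cpu_khz, so multiplying a TSC delta by it and shifting right by US_SCALE yields microseconds. A standalone check of that arithmetic (the 2 GHz figure is only an example):

        #include <stdio.h>
        #include <stdint.h>

        #define US_SCALE 32
        #define USEC_PER_MSEC 1000ULL

        int main(void)
        {
                uint64_t cpu_khz = 2000000;      /* assume a 2 GHz TSC */
                uint64_t tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz;
                uint64_t delta_cycles = 2000000; /* 1 ms worth of cycles */

                /* same fixed-point step as the removed do_gettimeoffset_tsc() */
                printf("offset = %llu usec\n",
                       (unsigned long long)((delta_cycles * tsc_quot) >> US_SCALE));
                return 0;   /* prints 999: ~1 ms, fixed point rounds down */
        }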
- */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * USEC_PER_TICK + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC + - (jiffies - wall_jiffies) * NSEC_PER_TICK; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); +volatile unsigned long jiffies = INITIAL_JIFFIES; unsigned long profile_pc(struct pt_regs *regs) { @@ -277,84 +157,9 @@ static void set_rtc_mmss(unsigned long n } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! 
", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -362,92 +167,11 @@ void main_timer_handler(struct pt_regs * * variables, because both do_timer() and us change them -arca+vojtech */ - write_seqlock(&xtime_lock); - - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - + pit_clockevent.event_handler(regs); /* * If we have an externally synchronized Linux clock, then update CMOS clock * accordingly every ~11 minutes. 
set_rtc_mmss() will be called in the jiffy @@ -462,13 +186,10 @@ void main_timer_handler(struct pt_regs * rtc_update = xtime.tv_sec + 660; } - write_sequnlock(&xtime_lock); } static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; main_timer_handler(regs); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) @@ -477,39 +198,6 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} static unsigned long get_cmos_time(void) { @@ -562,142 +250,6 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - hpet_start = hpet_readl(HPET_COUNTER); - rdtscl(tsc_start); - - do { - local_irq_disable(); - hpet_now = hpet_readl(HPET_COUNTER); - tsc_now = get_cycles_sync(); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -728,137 +280,84 @@ static unsigned int __init pit_calibrate return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. 
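time_cpufreq_notifier() (moved into tsc.c by this patch) rescales cpu_khz and loops_per_jiffy with cpufreq_scale(), which is essentially old * new_freq / ref_freq. A simplified stand-in showing the proportionality (the frequencies are invented):

        #include <stdio.h>
        #include <stdint.h>

        /* simplified stand-in for the kernel's cpufreq_scale() helper */
        static uint64_t scale(uint64_t old, uint32_t div, uint32_t mult)
        {
                return old * mult / div;
        }

        int main(void)
        {
                /* e.g. a 2200 MHz CPU throttled to 1100 MHz */
                uint64_t cpu_khz = scale(2200000, 2200000, 1100000);

                printf("cpu_khz after transition: %llu\n",
                       (unsigned long long)cpu_khz);   /* 1100000 */
                return 0;
        }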
- */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 - } +static void __init __pit_init(int val, u8 mode) +{ + unsigned long flags; - hpet_alloc(&hd); - return 0; + spin_lock_irqsave(&i8253_lock, flags); + outb_p(mode, PIT_MODE); + outb_p(val & 0xff, PIT_CH0); /* LSB */ + outb_p(val >> 8, PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -fs_initcall(late_hpet_init); -#endif -static int hpet_timer_stop_set_go(unsigned long tick) +static void init_pit_timer(int mode, struct clock_event *evt) { - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ + unsigned long flags; - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); + spin_lock_irqsave(&i8253_lock, flags); -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + case CLOCK_EVT_SHUTDOWN: + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + disable_irq(0); + break; } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_init(void) +static void pit_next_event(unsigned long delta, struct clock_event *evt) { - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. 
- */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); + unsigned long flags; - return hpet_timer_stop_set_go(hpet_tick); + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_reenable(void) +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) { - return hpet_timer_stop_set_go(hpet_tick); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); } -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) -{ - unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} void __init pit_init(void) { @@ -873,9 +372,9 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; - hpet_timer_stop_set_go(0); + hpet_stop(); } else { name = "PIT"; pit_stop_interrupt(); @@ -890,119 +389,47 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; void __init time_init(void) { char *timename; - char *gtod; if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; + + setup_pit_timer(); if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); + if (unsynchronized_tsc()) + mark_tsc_unstable(); - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); } -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. 
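setup_pit_timer() above derives the clockevent scaling factor with div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32), i.e. (ticks_per_sec << 32) / ns_per_sec, and clockevent_delta2ns() inverts it to bound the programmable range. The arithmetic, checked standalone (assuming the canonical 1.193182 MHz PIT input clock):

        #include <stdio.h>
        #include <stdint.h>

        #define CLOCK_TICK_RATE 1193182ULL      /* i8254 input frequency, Hz */
        #define NSEC_PER_SEC    1000000000ULL

        int main(void)
        {
                /* same math as div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32) */
                uint64_t mult = (CLOCK_TICK_RATE << 32) / NSEC_PER_SEC;

                /* inverse step, as in clockevent_delta2ns(0x7FFF, ...) */
                uint64_t max_ns = ((uint64_t)0x7FFF << 32) / mult;

                printf("mult = %llu\n", (unsigned long long)mult);           /* ~5.12e6 */
                printf("max_delta_ns = %llu\n", (unsigned long long)max_ns); /* ~27.5 ms */
                return 0;
        }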
- */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -__init static char *time_init_gtod(void) -{ - char *timetype; - - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - return timetype; -} __setup("report_lost_ticks", time_setup); @@ -1033,7 +460,7 @@ static int timer_resume(struct sys_devic unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1042,21 +469,9 @@ static int timer_resume(struct sys_devic write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1083,243 +498,3 @@ static int time_init_device(void) device_initcall(time_init_device); -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. 
- */ -#include - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - local_irq_restore(flags); - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. 
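The software RTC emulation being deleted below advances the HPET timer-1 comparator by hpet_tick * HZ / hpet_rtc_int_freq on every interrupt, i.e. one simulated RTC period expressed in HPET clocks. The same computation standalone, with an assumed 14.318180 MHz HPET:

        #include <stdio.h>
        #include <stdint.h>

        #define HZ 1000ULL

        int main(void)
        {
                uint64_t hpet_freq = 14318180;       /* assumed HPET rate, Hz */
                uint64_t hpet_tick = hpet_freq / HZ; /* HPET clocks per kernel tick */
                uint64_t rtc_freq  = 64;             /* simulated RTC interrupt rate */

                /* comparator increment used by hpet_rtc_timer_reinit() */
                uint64_t step = hpet_tick * HZ / rtc_freq;

                printf("comparator += %llu HPET clocks per RTC irq\n",
                       (unsigned long long)step);    /* ~ hpet_freq / rtc_freq */
                return 0;
        }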
- */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id, regs); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); Index: linux/arch/x86_64/kernel/traps.c =================================================================== --- linux.orig/arch/x86_64/kernel/traps.c +++ linux/arch/x86_64/kernel/traps.c @@ -368,6 +368,7 @@ void show_trace(struct task_struct *tsk, #undef HANDLE_STACK printk("\n"); + print_traces(tsk); } static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) @@ -497,7 +498,7 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; static unsigned int die_nest_count; Index: linux/arch/x86_64/kernel/tsc.c =================================================================== --- /dev/null +++ linux/arch/x86_64/kernel/tsc.c @@ -0,0 +1,229 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NS_SCALE 10 /* 2^10, carefully chosen */ +#define US_SCALE 32 /* 2^32, arbitrarily chosen */ + +static int notsc __initdata = 0; + +unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz); + +static unsigned int cyc2ns_scale __read_mostly; + +void set_cyc2ns_scale(unsigned long khz) +{ + cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz; +} + +static inline unsigned long long cycles_2_ns(unsigned long long cyc) +{ + return (cyc * cyc2ns_scale) >> NS_SCALE; +} + +unsigned long long sched_clock(void) +{ + unsigned long a = 0; + + /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, + which means it is not completely exact and may not be monotonic between + CPUs. But the errors should be too small to matter for scheduling + purposes. */ + + rdtscll(a); + return cycles_2_ns(a); +} + +static int tsc_unstable; + +static inline int check_tsc_unstable(void) +{ + return tsc_unstable; +} + +void mark_tsc_unstable(void) +{ + tsc_unstable = 1; +} +EXPORT_SYMBOL_GPL(mark_tsc_unstable); + +#ifdef CONFIG_CPU_FREQ + +/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency + changes. + + RED-PEN: On SMP we assume all CPUs run with the same frequency. It's + not that important because current Opteron setups do not support + scaling on SMP anyway. + + Should fix up last_tsc too. Currently gettimeofday in the + first tick after the change will be slightly wrong. */ + +#include + +static unsigned int cpufreq_delayed_issched = 0; +static unsigned int cpufreq_init = 0; +static struct work_struct cpufreq_delayed_get_work; + +static void handle_cpufreq_delayed_get(void *v) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + cpufreq_get(cpu); + } + cpufreq_delayed_issched = 0; +} + +static unsigned int ref_freq = 0; +static unsigned long loops_per_jiffy_ref = 0; + +static unsigned long cpu_khz_ref = 0; + +static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long *lpj, dummy; + + if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) + return 0; + + lpj = &dummy; + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) +#ifdef CONFIG_SMP + lpj = &cpu_data[freq->cpu].loops_per_jiffy; +#else + lpj = &boot_cpu_data.loops_per_jiffy; +#endif + + if (!ref_freq) { + ref_freq = freq->old; + loops_per_jiffy_ref = *lpj; + cpu_khz_ref = cpu_khz; + } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || + (val == CPUFREQ_RESUMECHANGE)) { + *lpj = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + + cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); + if (!(freq->flags & CPUFREQ_CONST_LOOPS)) + mark_tsc_unstable(); + } + + set_cyc2ns_scale(cpu_khz_ref); + + return 0; +} + +static struct notifier_block time_cpufreq_notifier_block = { + .notifier_call = time_cpufreq_notifier +}; + +static int __init cpufreq_tsc(void) +{ + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER)) + cpufreq_init = 1; + return 0; +} + +core_initcall(cpufreq_tsc); + +#endif +/* + * Make an educated guess if the TSC is trustworthy and synchronized + * over all CPUs.
+ */ +__cpuinit int unsynchronized_tsc(void) +{ +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; +} + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup); + + +/* clock source code: */ + +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, + .vread = vread_tsc, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc) { + clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.shift); + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); Index: linux/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux/arch/x86_64/kernel/vmlinux.lds.S @@ -93,27 +93,11 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); - - .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } - wall_jiffies = VVIRT(.wall_jiffies); - - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } Index: linux/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.orig/arch/x86_64/kernel/vsyscall.c +++ linux/arch/x86_64/kernel/vsyscall.c @@ -26,65 +26,50 @@ #include #include #include +#include #include #include #include +#include #include #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) - -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace -#include - -static __always_inline void timeval_normalize(struct timeval * tv) -{ - time_t __sec; +struct vsyscall_gtod_data_t { + raw_seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), + .sysctl_enabled = 1, +}; -static __always_inline void do_vgettimeofday(struct timeval * tv) +void update_vsyscall(struct timespec* wall_time, struct clocksource* clock) { - long sequence, t; - unsigned long sec, usec; + unsigned long flags; - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } /* RED-PEN may want to readd seq locking, but then the variable should be write-once. 
*/ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __sys_tz; + *tz = __vsyscall_gtod_data.sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -105,10 +90,44 @@ static __always_inline long time_syscall return secs; } +static __always_inline void do_vgettimeofday(struct timeval * tv) +{ + cycle_t now, base, mask, cycle_delta; + unsigned long seq, mult, shift, nsec_delta; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + vread = __vsyscall_gtod_data.clock.vread; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + gettimeofday(tv,0); + return; + } + now = vread(); + base = __vsyscall_gtod_data.clock.cycle_last; + mask = __vsyscall_gtod_data.clock.mask; + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + + *tv = __vsyscall_gtod_data.wall_time_tv; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ + nsec_delta = (cycle_delta * mult) >> shift; + + /* convert to usecs and add to timespec: */ + tv->tv_usec += nsec_delta / NSEC_PER_USEC; + while (tv->tv_usec >= USEC_PER_SEC) { + tv->tv_sec += 1; + tv->tv_usec -= USEC_PER_SEC; + } +} + int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) { - if (!__sysctl_vsyscall) - return gettimeofday(tv,tz); if (tv) do_vgettimeofday(tv); if (tz) @@ -120,11 +139,11 @@ int __vsyscall(0) vgettimeofday(struct t * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - if (!__sysctl_vsyscall) + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); else if (t) - *t = __xtime.tv_sec; - return __xtime.tv_sec; + *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; + return __vsyscall_gtod_data.wall_time_tv.tv_sec; } long __vsyscall(2) venosys_0(void) @@ -163,7 +182,7 @@ static int vsyscall_sysctl_change(ctl_ta ret = -ENOMEM; goto out; } - if (!sysctl_vsyscall) { + if (!vsyscall_gtod_data.sysctl_enabled) { *map1 = SYSCALL; *map2 = SYSCALL; } else { @@ -186,7 +205,7 @@ static int vsyscall_sysctl_nostrat(ctl_t static ctl_table kernel_table2[] = { { .ctl_name = 99, .procname = "vsyscall64", - .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644, + .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, .strategy = vsyscall_sysctl_nostrat, .proc_handler = vsyscall_sysctl_change }, { 0, } Index: linux/arch/x86_64/kernel/x8664_ksyms.c =================================================================== --- linux.orig/arch/x86_64/kernel/x8664_ksyms.c +++ linux/arch/x86_64/kernel/x8664_ksyms.c @@ -12,10 +12,12 @@ EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif EXPORT_SYMBOL(__get_user_1); EXPORT_SYMBOL(__get_user_2); Index: linux/arch/x86_64/lib/thunk.S =================================================================== --- linux.orig/arch/x86_64/lib/thunk.S +++ linux/arch/x86_64/lib/thunk.S @@ -42,11 +42,13 @@ thunk rwsem_wake_thunk,rwsem_wake thunk rwsem_downgrade_thunk,rwsem_downgrade_wake #endif - - thunk __down_failed,__down - thunk_retrax __down_failed_interruptible,__down_interruptible - thunk_retrax
__down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS thunk trace_hardirqs_on_thunk,trace_hardirqs_on Index: linux/arch/x86_64/mm/fault.c =================================================================== --- linux.orig/arch/x86_64/mm/fault.c +++ linux/arch/x86_64/mm/fault.c @@ -79,6 +79,7 @@ void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT Index: linux/arch/x86_64/mm/init.c =================================================================== --- linux.orig/arch/x86_64/mm/init.c +++ linux/arch/x86_64/mm/init.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the Index: linux/block/cfq-iosched.c =================================================================== --- linux.orig/block/cfq-iosched.c +++ linux/block/cfq-iosched.c @@ -1283,7 +1283,7 @@ static void cfq_exit_single_io_context(s q = cfqd->queue; - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); Index: linux/block/ll_rw_blk.c =================================================================== --- linux.orig/block/ll_rw_blk.c +++ linux/block/ll_rw_blk.c @@ -1547,7 +1547,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1570,7 +1570,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -3584,13 +3584,15 @@ void exit_io_context(void) struct io_context *ioc; struct cfq_io_context *cic; - local_irq_save(flags); + // FIXME: unsafe upstream too? 
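The local_irq_*_nort() helpers used in this hunk and in the IDE hunks further down are defined elsewhere in the -rt tree. A sketch of the assumed semantics (reconstructed, not quoted from this patch): they map to the ordinary IRQ primitives on mainline and become (near) no-ops on PREEMPT_RT, leaving the section preemptible:

	#ifdef CONFIG_PREEMPT_RT
	# define local_irq_disable_nort()	do { } while (0)
	# define local_irq_enable_nort()	do { } while (0)
	# define local_irq_save_nort(flags)	local_save_flags(flags)
	# define local_irq_restore_nort(flags)	(void)(flags)
	#else
	# define local_irq_disable_nort()	local_irq_disable()
	# define local_irq_enable_nort()	local_irq_enable()
	# define local_irq_save_nort(flags)	local_irq_save(flags)
	# define local_irq_restore_nort(flags)	local_irq_restore(flags)
	#endif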
+ + local_irq_save_nort(flags); task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; task_unlock(current); - local_irq_restore(flags); + local_irq_restore_nort(flags); if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); Index: linux/drivers/acpi/executer/exmutex.c =================================================================== --- linux.orig/drivers/acpi/executer/exmutex.c +++ linux/drivers/acpi/executer/exmutex.c @@ -267,9 +267,9 @@ acpi_ex_release_mutex(union acpi_operand && (obj_desc->mutex.os_mutex != ACPI_GLOBAL_LOCK)) { ACPI_ERROR((AE_INFO, "Thread %X cannot release Mutex [%4.4s] acquired by thread %X", - (u32) walk_state->thread->thread_id, + (u32)(long) walk_state->thread->thread_id, acpi_ut_get_node_name(obj_desc->mutex.node), - (u32) obj_desc->mutex.owner_thread->thread_id)); + (u32)(long) obj_desc->mutex.owner_thread->thread_id)); return_ACPI_STATUS(AE_AML_NOT_OWNER); } Index: linux/drivers/acpi/osl.c =================================================================== --- linux.orig/drivers/acpi/osl.c +++ linux/drivers/acpi/osl.c @@ -676,13 +676,13 @@ void acpi_os_delete_lock(acpi_spinlock h acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -705,7 +705,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem) @@ -733,7 +733,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; @@ -820,7 +820,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem || (units < 1)) Index: linux/drivers/acpi/processor_idle.c =================================================================== --- linux.orig/drivers/acpi/processor_idle.c +++ linux/drivers/acpi/processor_idle.c @@ -38,9 +38,11 @@ #include #include #include /* need_resched() */ +#include #include #include +#include #include #include @@ -368,10 +370,12 @@ static void acpi_processor_idle(void) /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -412,10 +416,12 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -453,7 +459,8 @@ static void acpi_processor_idle(void) */ if (cx->promotion.state && ((cx->promotion.state - pr->power.states) <= max_cstate)) { - if 
(sleep_ticks > cx->promotion.threshold.ticks) { + if (sleep_ticks > cx->promotion.threshold.ticks && + cx->promotion.state->latency <= system_latency_constraint()) { cx->promotion.count++; cx->demotion.count = 0; if (cx->promotion.count >= @@ -494,8 +501,10 @@ static void acpi_processor_idle(void) end: /* * Demote if current state exceeds max_cstate + * or if the latency of the current state is unacceptable */ - if ((pr->power.state - pr->power.states) > max_cstate) { + if ((pr->power.state - pr->power.states) > max_cstate || + pr->power.state->latency > system_latency_constraint()) { if (cx->demotion.state) next_state = cx->demotion.state; } @@ -1009,9 +1018,10 @@ static int acpi_processor_power_seq_show seq_printf(seq, "active state: C%zd\n" "max_cstate: C%d\n" - "bus master activity: %08x\n", + "bus master activity: %08x\n" + "maximum allowed latency: %d usec\n", pr->power.state ? pr->power.state - pr->power.states : 0, - max_cstate, (unsigned)pr->power.bm_activity); + max_cstate, (unsigned)pr->power.bm_activity, system_latency_constraint()); seq_puts(seq, "states:\n"); @@ -1077,6 +1087,29 @@ static const struct file_operations acpi .release = single_release, }; + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency requirement. + * This means we need to get all processors out of their C-state, and then recalculate + * a new suitable C-state. Just do a cross-cpu IPI; that wakes them all right up. + */ +static int acpi_processor_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 0, 1); + return NOTIFY_OK; +} + +static struct notifier_block acpi_processor_latency_notifier = { + .notifier_call = acpi_processor_latency_notify, +}; + + int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) { @@ -1093,6 +1126,7 @@ int acpi_processor_power_init(struct acp "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; + register_latency_notifier(&acpi_processor_latency_notifier); } if (!pr) @@ -1164,6 +1198,7 @@ int acpi_processor_power_exit(struct acp * copies of pm_idle before proceeding. 
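For context, a hypothetical consumer of the latency-constraint interface wired up above; only register_latency_notifier() and the notifier calling convention are taken from this hunk, the rest is illustrative. A subsystem with a tighter deadline registers a notifier, and the cross-CPU IPI in smp_callback() kicks every processor out of its current C-state so the new bound is honored on the next idle entry:

	static int my_latency_notify(struct notifier_block *nb,
				     unsigned long new_latency_usec, void *v)
	{
		/* re-evaluate cached state against the tightened bound */
		return NOTIFY_OK;
	}

	static struct notifier_block my_latency_nb = {
		.notifier_call = my_latency_notify,
	};

	/* ... register_latency_notifier(&my_latency_nb); ... */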
*/ cpu_idle_wait(); + unregister_latency_notifier(&acpi_processor_latency_notifier); } return 0; Index: linux/drivers/acpi/tables/tbget.c =================================================================== --- linux.orig/drivers/acpi/tables/tbget.c +++ linux/drivers/acpi/tables/tbget.c @@ -325,7 +325,7 @@ acpi_tb_get_this_table(struct acpi_point if (header->length < sizeof(struct acpi_table_header)) { ACPI_ERROR((AE_INFO, "Table length (%X) is smaller than minimum (%X)", - header->length, sizeof(struct acpi_table_header))); + header->length, (int)sizeof(struct acpi_table_header))); return_ACPI_STATUS(AE_INVALID_TABLE_LENGTH); } Index: linux/drivers/acpi/tables/tbrsdt.c =================================================================== --- linux.orig/drivers/acpi/tables/tbrsdt.c +++ linux/drivers/acpi/tables/tbrsdt.c @@ -189,7 +189,7 @@ acpi_status acpi_tb_validate_rsdt(struct ACPI_ERROR((AE_INFO, "RSDT/XSDT length (%X) is smaller than minimum (%X)", table_ptr->length, - sizeof(struct acpi_table_header))); + (int)sizeof(struct acpi_table_header))); return (AE_INVALID_TABLE_LENGTH); } Index: linux/drivers/acpi/utilities/utmutex.c =================================================================== --- linux.orig/drivers/acpi/utilities/utmutex.c +++ linux/drivers/acpi/utilities/utmutex.c @@ -259,7 +259,7 @@ acpi_status acpi_ut_acquire_mutex(acpi_m } else { ACPI_EXCEPTION((AE_INFO, status, "Thread %X could not acquire Mutex [%X]", - (u32) this_thread_id, mutex_id)); + (u32)(long) this_thread_id, mutex_id)); } return (status); Index: linux/drivers/block/paride/pseudo.h =================================================================== --- linux.orig/drivers/block/paride/pseudo.h +++ linux/drivers/block/paride/pseudo.h @@ -43,7 +43,7 @@ static unsigned long ps_timeout; static int ps_tq_active = 0; static int ps_nice = 0; -static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused))); +static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock); static DECLARE_WORK(ps_tq, ps_tq_int, NULL); Index: linux/drivers/char/Kconfig =================================================================== --- linux.orig/drivers/char/Kconfig +++ linux/drivers/char/Kconfig @@ -741,6 +741,46 @@ config RTC To compile this driver as a module, choose M here: the module will be called rtc. +config RTC_HISTOGRAM + bool "Real Time Clock Histogram Support" + default n + depends on RTC + ---help--- + If you say Y here then the kernel will track the delivery and + wakeup latency of /dev/rtc using tasks and will report a + histogram to the kernel log when the application closes /dev/rtc. + +config BLOCKER + tristate "Priority Inheritance Debugging (Blocker) Device Support" + depends on X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + pi_test suite uses to test and measure kernel locking primitives. + +config LPPTEST + tristate "Parallel Port Based Latency Measurement Device" + depends on !PARPORT && X86 + default y + ---help--- + If you say Y here then a device will be created that the userspace + testlpp utility uses to measure IRQ latencies of a target system + from an independent measurement system. + + NOTE: this code assumes x86 PCs and that the parallel port is + bidirectional and is on IRQ 7. + + To use the device, both the target and the source system need to + run a kernel with CONFIG_LPPTEST enabled.
To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 Index: linux/drivers/char/Makefile =================================================================== --- linux.orig/drivers/char/Makefile +++ linux/drivers/char/Makefile @@ -89,6 +89,9 @@ obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu. obj-$(CONFIG_TANBAC_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o + obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ Index: linux/drivers/char/blocker.c =================================================================== --- /dev/null +++ linux/drivers/char/blocker.c @@ -0,0 +1,107 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux/drivers/char/hangcheck-timer.c =================================================================== --- linux.orig/drivers/char/hangcheck-timer.c +++ linux/drivers/char/hangcheck-timer.c @@ -117,7 +117,7 @@ __setup("hcheck_reboot", hangcheck_parse __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_S390) +#ifdef CONFIG_S390 # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) Index: linux/drivers/char/hpet.c 
=================================================================== --- linux.orig/drivers/char/hpet.c +++ linux/drivers/char/hpet.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -50,8 +51,34 @@ #define HPET_RANGE_SIZE 1024 /* from HPET spec */ +#if BITS_PER_LONG == 64 +#define write_counter(V, MC) writeq(V, MC) +#define read_counter(MC) readq(MC) +#else +#define write_counter(V, MC) writel(V, MC) +#define read_counter(MC) readl(MC) +#endif + static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ; +static void __iomem *hpet_mc_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)read_counter((void __iomem *)hpet_mc_ptr); +} + +static struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 300, + .read = read_hpet, + .mask = 0xffffffffffffffffLL, + .mult = 0, /* to be calculated */ + .shift = 10, + .is_continuous = 1, +}; +static struct clocksource *hpet_clocksource_p; + /* A lock for concurrent access by app and isr hpet activity. */ static DEFINE_SPINLOCK(hpet_lock); /* A lock for concurrent intermodule access to hpet and isr hpet activity. */ @@ -78,7 +105,7 @@ struct hpets { struct hpets *hp_next; struct hpet __iomem *hp_hpet; unsigned long hp_hpet_phys; - struct time_interpolator *hp_interpolator; + struct clocksource *hp_clocksource; unsigned long long hp_tick_freq; unsigned long hp_delta; unsigned int hp_ntimer; @@ -93,13 +120,6 @@ static struct hpets *hpets; #define HPET_PERIODIC 0x0004 #define HPET_SHARED_IRQ 0x0008 -#if BITS_PER_LONG == 64 -#define write_counter(V, MC) writeq(V, MC) -#define read_counter(MC) readq(MC) -#else -#define write_counter(V, MC) writel(V, MC) -#define read_counter(MC) readl(MC) -#endif #ifndef readq static inline unsigned long long readq(void __iomem *addr) @@ -736,27 +756,6 @@ static ctl_table dev_root[] = { static struct ctl_table_header *sysctl_header; -static void hpet_register_interpolator(struct hpets *hpetp) -{ -#ifdef CONFIG_TIME_INTERPOLATION - struct time_interpolator *ti; - - ti = kzalloc(sizeof(*ti), GFP_KERNEL); - if (!ti) - return; - - ti->source = TIME_SOURCE_MMIO64; - ti->shift = 10; - ti->addr = &hpetp->hp_hpet->hpet_mc; - ti->frequency = hpetp->hp_tick_freq; - ti->drift = HPET_DRIFT; - ti->mask = -1; - - hpetp->hp_interpolator = ti; - register_time_interpolator(ti); -#endif -} - /* * Adjustment for when arming the timer with * initial conditions.
That is, main counter @@ -908,7 +907,16 @@ int hpet_alloc(struct hpet_data *hdp) } hpetp->hp_delta = hpet_calibrate(hpetp); - hpet_register_interpolator(hpetp); + + if (!hpet_clocksource_p) { +#ifdef CONFIG_IA64 + clocksource_hpet.fsys_mmio_ptr = hpet_mc_ptr = &hpetp->hp_hpet->hpet_mc; +#endif + clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq, + clocksource_hpet.shift); + clocksource_register(&clocksource_hpet); + hpet_clocksource_p = hpetp->hp_clocksource = &clocksource_hpet; + } return 0; } @@ -994,7 +1002,7 @@ static int hpet_acpi_add(struct acpi_dev static int hpet_acpi_remove(struct acpi_device *device, int type) { - /* XXX need to unregister interpolator, dealloc mem, etc */ + /* XXX need to unregister clocksource, dealloc mem, etc */ return -EINVAL; } Index: linux/drivers/char/lpptest.c =================================================================== --- /dev/null +++ linux/drivers/char/lpptest.c @@ -0,0 +1,179 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. 
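A hypothetical user-space fragment showing how the testlpp utility mentioned in the Kconfig help is assumed to drive this device, provided the LPPTEST_* ioctl definitions above are made visible to user space; the device path and error handling are illustrative:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>

	int main(void)
	{
		unsigned long long cycles;
		int fd = open("/dev/lpptest", O_RDONLY);

		/* trigger one parallel-port IRQ round trip and read its cost */
		if (fd >= 0 && ioctl(fd, LPPTEST_TEST, &cycles) == 0)
			printf("response took %llu cycles\n", cycles);
		return 0;
	}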
+ */ +static irqreturn_t lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux/drivers/char/random.c =================================================================== --- linux.orig/drivers/char/random.c +++ linux/drivers/char/random.c @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux/drivers/char/rtc.c =================================================================== --- linux.orig/drivers/char/rtc.c +++ linux/drivers/char/rtc.c @@ -82,10 +82,36 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif + #if defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz /
1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -217,7 +243,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -261,7 +426,7 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); + wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); @@ -375,6 +540,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -607,6 +774,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -616,6 +788,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -714,6 +887,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -769,6 +943,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1152,6 +1327,7 @@ static void rtc_dropped_irq(unsigned lon printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); /* Now we have new data */ + rtc_wake_event(); wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); Index: linux/drivers/char/sysrq.c =================================================================== --- linux.orig/drivers/char/sysrq.c +++ linux/drivers/char/sysrq.c @@ -176,6 +176,23 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { @@ -301,7 +318,7 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n */ /* This will often be registered as 'Off' at init time */ Index: linux/drivers/char/tty_io.c =================================================================== --- linux.orig/drivers/char/tty_io.c +++ linux/drivers/char/tty_io.c @@ -254,6 +254,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif Index: linux/drivers/ide/ide-floppy.c =================================================================== --- linux.orig/drivers/ide/ide-floppy.c +++ linux/drivers/ide/ide-floppy.c @@ -1666,9 +1666,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + 
local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 0 : 0x10000; } Index: linux/drivers/ide/ide-io.c =================================================================== --- linux.orig/drivers/ide/ide-io.c +++ linux/drivers/ide/ide-io.c @@ -1173,7 +1173,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1434,7 +1434,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { Index: linux/drivers/ide/ide-iops.c =================================================================== --- linux.orig/drivers/ide/ide-iops.c +++ linux/drivers/ide/ide-iops.c @@ -244,10 +244,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -266,10 +266,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -564,12 +564,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
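The WARN_ON_NONRT()/BUG_ON_NONRT() assertions introduced in the block and IDE hunks follow the same pattern as the _nort helpers. A sketch of the assumed definitions (reconstructed, not quoted from this patch): the checks compile away on PREEMPT_RT, where these paths legitimately run with hardirqs enabled:

	#ifdef CONFIG_PREEMPT_RT
	# define BUG_ON_NONRT(condition)	do { } while (0)
	# define WARN_ON_NONRT(condition)	do { } while (0)
	#else
	# define BUG_ON_NONRT(condition)	BUG_ON(condition)
	# define WARN_ON_NONRT(condition)	WARN_ON(condition)
	#endif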
@@ -731,17 +731,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -821,7 +819,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Index: linux/drivers/ide/ide-lib.c =================================================================== --- linux.orig/drivers/ide/ide-lib.c +++ linux/drivers/ide/ide-lib.c @@ -445,15 +445,16 @@ EXPORT_SYMBOL_GPL(ide_set_xfer_rate); static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { @@ -481,10 +482,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -544,7 +543,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -559,14 +558,11 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -592,7 +588,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux/drivers/ide/ide-probe.c =================================================================== --- linux.orig/drivers/ide/ide-probe.c +++ linux/drivers/ide/ide-probe.c @@ -143,7 +143,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -325,14 +325,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -804,7 +804,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) 
changed by probe * code above Index: linux/drivers/ide/ide-taskfile.c =================================================================== --- linux.orig/drivers/ide/ide-taskfile.c +++ linux/drivers/ide/ide-taskfile.c @@ -274,7 +274,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -294,7 +294,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -460,7 +460,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux/drivers/ide/pci/alim15x3.c =================================================================== --- linux.orig/drivers/ide/pci/alim15x3.c +++ linux/drivers/ide/pci/alim15x3.c @@ -322,7 +322,7 @@ static void ali15x3_tune_drive (ide_driv if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -344,7 +344,7 @@ static void ali15x3_tune_drive (ide_driv pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -600,7 +600,7 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -613,7 +613,7 @@ static unsigned int __devinit init_chips * clear bit 7 */ pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -638,7 +638,7 @@ static unsigned int __devinit init_chips * 0:0.0 so if we didn't find one we know what is cooking. */ if (north && north->vendor != PCI_VENDOR_ID_AL) { - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -661,7 +661,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -685,7 +685,7 @@ static unsigned int __devinit ata66_ali1 unsigned long flags; u8 tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -737,7 +737,7 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); + local_irq_restore_nort(flags); return(ata66); } Index: linux/drivers/ide/pci/cs5530.c =================================================================== --- linux.orig/drivers/ide/pci/cs5530.c +++ linux/drivers/ide/pci/cs5530.c @@ -241,8 +241,8 @@ static unsigned int __devinit init_chips return 0; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. 
*/ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -294,7 +294,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); return 0; } Index: linux/drivers/ide/pci/hpt366.c =================================================================== --- linux.orig/drivers/ide/pci/hpt366.c +++ linux/drivers/ide/pci/hpt366.c @@ -1496,7 +1496,7 @@ static void __devinit init_dma_hpt366(id dma_old = hwif->INB(dmabase+2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(hwif->pci_dev, primary, &masterdma); @@ -1507,7 +1507,7 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase+2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } Index: linux/drivers/ieee1394/ieee1394_types.h =================================================================== --- linux.orig/drivers/ieee1394/ieee1394_types.h +++ linux/drivers/ieee1394/ieee1394_types.h @@ -19,7 +19,7 @@ struct hpsb_tlabel_pool { spinlock_t lock; u8 next; u32 allocations; - struct semaphore count; + struct compat_semaphore count; }; #define HPSB_TPOOL_INIT(_tp) \ Index: linux/drivers/ieee1394/nodemgr.c =================================================================== --- linux.orig/drivers/ieee1394/nodemgr.c +++ linux/drivers/ieee1394/nodemgr.c @@ -167,7 +167,7 @@ struct host_info { struct hpsb_host *host; struct list_head list; struct completion exited; - struct semaphore reset_sem; + struct compat_semaphore reset_sem; int pid; char daemon_name[15]; int kill_me; Index: linux/drivers/ieee1394/raw1394-private.h =================================================================== --- linux.orig/drivers/ieee1394/raw1394-private.h +++ linux/drivers/ieee1394/raw1394-private.h @@ -29,7 +29,7 @@ struct file_info { struct list_head req_pending; struct list_head req_complete; - struct semaphore complete_sem; + struct compat_semaphore complete_sem; spinlock_t reqlists_lock; wait_queue_head_t poll_wait_complete; Index: linux/drivers/input/gameport/gameport.c =================================================================== --- linux.orig/drivers/input/gameport/gameport.c +++ linux/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include @@ -101,12 +102,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -125,11 +126,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } Index: linux/drivers/input/serio/i8042.c =================================================================== --- linux.orig/drivers/input/serio/i8042.c +++ linux/drivers/input/serio/i8042.c @@ -1084,7 +1084,7 @@ static int __devinit i8042_probe(struct goto err_controller_cleanup; } - mod_timer(&i8042_timer, jiffies + I8042_POLL_PERIOD); + 
mod_timer(&i8042_timer, jiffies + 2); //I8042_POLL_PERIOD); return 0; err_unregister_ports: Index: linux/drivers/input/serio/i8042.h =================================================================== --- linux.orig/drivers/input/serio/i8042.h +++ linux/drivers/input/serio/i8042.h @@ -43,7 +43,7 @@ * polling. */ -#define I8042_POLL_PERIOD HZ/20 +#define I8042_POLL_PERIOD (10*HZ) /* * Status register bits. Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.c =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.c +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -97,7 +97,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; pid_t thread_pid; Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.h =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.h +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.h @@ -138,7 +138,7 @@ struct dvb_fe_events { int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { Index: linux/drivers/net/3c527.c =================================================================== --- linux.orig/drivers/net/3c527.c +++ linux/drivers/net/3c527.c @@ -182,7 +182,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; Index: linux/drivers/net/3c59x.c =================================================================== --- linux.orig/drivers/net/3c59x.c +++ linux/drivers/net/3c59x.c @@ -793,9 +793,9 @@ static void poll_vortex(struct net_devic struct vortex_private *vp = netdev_priv(dev); unsigned long flags; local_save_flags(flags); - local_irq_disable(); + local_irq_disable_nort(); (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1724,6 +1724,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1731,7 +1732,7 @@ vortex_timer(unsigned long data) printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1754,9 +1755,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. 
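A note on the struct semaphore to struct compat_semaphore changes that recur through these driver hunks: on PREEMPT_RT a plain semaphore is converted into a sleeping, mutex-like lock, which breaks semaphores used as completion-style counters and signalled from IRQ, timer or other atomic context. compat_semaphore preserves the classic counting behaviour. A hedged sketch of the pattern the converted drivers depend on; the assumption (consistent with the call sites above being left untouched) is that the -rt core routes the ordinary semaphore API by type, so only the declaration changes:

	static struct compat_semaphore dead_sem;	/* stays a counting semaphore on -rt */

	static void signal_from_irq(void)	/* hypothetical IRQ/timer path */
	{
		up(&dead_sem);			/* must not sleep */
	}

	static void wait_in_process_ctx(void)	/* hypothetical task path */
	{
		/* sema_init(&dead_sem, 0) is assumed to run at driver init */
		down(&dead_sem);		/* may sleep */
	}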
*/ @@ -1812,7 +1811,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1845,13 +1844,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: linux/drivers/net/e1000/e1000_main.c =================================================================== --- linux.orig/drivers/net/e1000/e1000_main.c +++ linux/drivers/net/e1000/e1000_main.c @@ -2965,10 +2965,8 @@ e1000_xmit_frame(struct sk_buff *skb, st (adapter->hw.mac_type == e1000_82573)) e1000_transfer_dhcp_info(adapter, skb); - local_irq_save(flags); - if (!spin_trylock(&tx_ring->tx_lock)) { + if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } Index: linux/drivers/net/hamradio/6pack.c =================================================================== --- linux.orig/drivers/net/hamradio/6pack.c +++ linux/drivers/net/hamradio/6pack.c @@ -123,7 +123,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; Index: linux/drivers/net/hamradio/mkiss.c =================================================================== --- linux.orig/drivers/net/hamradio/mkiss.c +++ linux/drivers/net/hamradio/mkiss.c @@ -84,7 +84,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ Index: linux/drivers/net/ibm_emac/ibm_emac_core.c =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.c +++ linux/drivers/net/ibm_emac/ibm_emac_core.c @@ -1061,6 +1061,8 @@ static inline int emac_xmit_finish(struc ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ -1074,6 +1076,7 @@ static int emac_start_xmit(struct sk_buf u16 ctrl = EMAC_TX_CTRL_GFCS | EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY | MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb); + spin_lock(&dev->tx_lock); slot = dev->tx_slot++; if (dev->tx_slot == NUM_TX_BUFF) { dev->tx_slot = 0; @@ -1243,6 +1246,7 @@ static void emac_poll_tx(void *param) DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt, dev->ack_slot); + spin_lock(&dev->tx_lock); if (dev->tx_cnt) { u16 ctrl; int slot = dev->ack_slot, n = 0; @@ -1252,6 +1256,7 @@ static void emac_poll_tx(void *param) struct sk_buff *skb = dev->tx_skb[slot]; ++n; + spin_unlock(&dev->tx_lock); if (skb) { dev_kfree_skb(skb); dev->tx_skb[slot] = NULL; @@ -1261,6 +1266,7 @@ static void emac_poll_tx(void *param) if (unlikely(EMAC_IS_BAD_TX(ctrl))) emac_parse_tx_error(dev, ctrl); + spin_lock(&dev->tx_lock); if (--dev->tx_cnt) goto again; } @@ -1273,6 +1279,7 @@ static void emac_poll_tx(void *param) DBG2("%d: tx %d pkts" NL, dev->def->index, n); } } + spin_unlock(&dev->tx_lock); } static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot, @@ -1966,6 
+1973,7 @@ static int __init emac_probe(struct ocp_ dev->ldev = &ocpdev->dev; dev->def = ocpdev->def; SET_MODULE_OWNER(ndev); + spin_lock_init(&dev->tx_lock); /* Find MAL device we are connected to */ maldev = Index: linux/drivers/net/ibm_emac/ibm_emac_core.h =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.h +++ linux/drivers/net/ibm_emac/ibm_emac_core.h @@ -193,6 +193,8 @@ struct ocp_enet_private { struct ibm_emac_error_stats estats; struct net_device_stats nstats; + spinlock_t tx_lock; + struct device* ldev; }; Index: linux/drivers/net/netconsole.c =================================================================== --- linux.orig/drivers/net/netconsole.c +++ linux/drivers/net/netconsole.c @@ -74,16 +74,22 @@ static void write_msg(struct console *co if (!np.dev) return; - local_irq_save(flags); + /* + * A bit hairy. Netconsole uses mutexes (indirectly) and + * thus must have interrupts enabled: + */ + local_irq_save_nort(flags); for(left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); + WARN_ON_RT(irqs_disabled()); netpoll_send_udp(&np, msg, frag); + WARN_ON_RT(irqs_disabled()); msg += frag; left -= frag; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } static struct console netconsole = { Index: linux/drivers/net/plip.c =================================================================== --- linux.orig/drivers/net/plip.c +++ linux/drivers/net/plip.c @@ -227,7 +227,10 @@ struct net_local { struct hh_cache *hh); spinlock_t lock; atomic_t kill_timer; - struct semaphore killed_timer_sem; + /* + * PREEMPT_RT: this isnt a mutex, it should be struct completion. + */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) Index: linux/drivers/net/ppp_async.c =================================================================== --- linux.orig/drivers/net/ppp_async.c +++ linux/drivers/net/ppp_async.c @@ -67,7 +67,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: linux/drivers/net/ppp_synctty.c =================================================================== --- linux.orig/drivers/net/ppp_synctty.c +++ linux/drivers/net/ppp_synctty.c @@ -70,7 +70,7 @@ struct syncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: linux/drivers/net/tulip/tulip_core.c =================================================================== --- linux.orig/drivers/net/tulip/tulip_core.c +++ linux/drivers/net/tulip/tulip_core.c @@ -1804,6 +1804,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: linux/drivers/net/wireless/ipw2100.c =================================================================== --- linux.orig/drivers/net/wireless/ipw2100.c +++ linux/drivers/net/wireless/ipw2100.c @@ -163,6 +163,7 @@ that only one external action is invoked #include #include #include +#include #include "ipw2100.h" @@ -1697,6 +1698,11 @@ static int ipw2100_up(struct ipw2100_pri return 0; } + /* the ipw2100 hardware really doesn't want power management delays + * longer than 175usec + */ + 
modify_acceptable_latency("ipw2100", 175); + /* If the interrupt is enabled, turn it off... */ spin_lock_irqsave(&priv->low_lock, flags); ipw2100_disable_interrupts(priv); @@ -1849,6 +1855,8 @@ static void ipw2100_down(struct ipw2100_ ipw2100_disable_interrupts(priv); spin_unlock_irqrestore(&priv->low_lock, flags); + modify_acceptable_latency("ipw2100", INFINITE_LATENCY); + #ifdef ACPI_CSTATE_LIMIT_DEFINED if (priv->config & CFG_C3_DISABLED) { IPW_DEBUG_INFO(": Resetting C3 transitions.\n"); @@ -6533,6 +6541,7 @@ static int __init ipw2100_init(void) ret = pci_module_init(&ipw2100_pci_driver); + set_acceptable_latency("ipw2100", INFINITE_LATENCY); #ifdef CONFIG_IPW2100_DEBUG ipw2100_debug_level = debug; driver_create_file(&ipw2100_pci_driver.driver, @@ -6553,6 +6562,7 @@ static void __exit ipw2100_exit(void) &driver_attr_debug_level); #endif pci_unregister_driver(&ipw2100_pci_driver); + remove_acceptable_latency("ipw2100"); } module_init(ipw2100_init); Index: linux/drivers/oprofile/oprofilefs.c =================================================================== --- linux.orig/drivers/oprofile/oprofilefs.c +++ linux/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux/drivers/pci/Makefile =================================================================== --- linux.orig/drivers/pci/Makefile +++ linux/drivers/pci/Makefile @@ -27,7 +27,8 @@ obj-$(CONFIG_PPC64) += setup-bus.o obj-$(CONFIG_MIPS) += setup-bus.o setup-irq.o obj-$(CONFIG_X86_VISWS) += setup-irq.o -msiobj-y := msi.o msi-apic.o +msiobj-y := msi.o +msiobj-$(CONFIG_IA64) += msi-apic.o msiobj-$(CONFIG_IA64_GENERIC) += msi-altix.o msiobj-$(CONFIG_IA64_SGI_SN2) += msi-altix.o obj-$(CONFIG_PCI_MSI) += $(msiobj-y) Index: linux/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux/drivers/pci/hotplug/cpci_hotplug_core.c @@ -59,8 +59,8 @@ static int slots; static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); Index: linux/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux/drivers/pci/hotplug/cpqphp_ctrl.c @@ -44,8 +44,8 @@ static int configure_new_function(struct u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int 
event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: linux/drivers/pci/hotplug/ibmphp_hpc.c =================================================================== --- linux.orig/drivers/pci/hotplug/ibmphp_hpc.c +++ linux/drivers/pci/hotplug/ibmphp_hpc.c @@ -106,7 +106,7 @@ static int tid_poll; static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- Index: linux/drivers/pci/hotplug/pciehp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/pciehp_ctrl.c +++ linux/drivers/pci/hotplug/pciehp_ctrl.c @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ static unsigned long surprise_rm_pending; /* = 0 */ Index: linux/drivers/pci/msi-altix.c =================================================================== --- linux.orig/drivers/pci/msi-altix.c +++ linux/drivers/pci/msi-altix.c @@ -26,7 +26,7 @@ struct sn_msi_info { static struct sn_msi_info *sn_msi_info; static void -sn_msi_teardown(unsigned int vector) +sn_msi_teardown(unsigned int irq) { nasid_t nasid; int widget; @@ -36,7 +36,7 @@ sn_msi_teardown(unsigned int vector) struct pcibus_bussoft *bussoft; struct sn_pcibus_provider *provider; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -45,9 +45,9 @@ sn_msi_teardown(unsigned int vector) provider = SN_PCIDEV_BUSPROVIDER(pdev); (*provider->dma_unmap)(pdev, - sn_msi_info[vector].pci_addr, + sn_msi_info[irq].pci_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; bussoft = SN_PCIDEV_BUSSOFT(pdev); nasid = NASID_GET(bussoft->bs_base); @@ -56,14 +56,13 @@ sn_msi_teardown(unsigned int vector) SWIN_WIDGETNUM(bussoft->bs_base); sn_intr_free(nasid, widget, sn_irq_info); - sn_msi_info[vector].sn_irq_info = NULL; + sn_msi_info[irq].sn_irq_info = NULL; return; } int -sn_msi_setup(struct pci_dev *pdev, unsigned int vector, - u32 *addr_hi, u32 *addr_lo, u32 *data) +sn_msi_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { int widget; int status; @@ -93,7 +92,7 @@ sn_msi_setup(struct pci_dev *pdev, unsig if (! 
sn_irq_info) return -ENOMEM; - status = sn_intr_alloc(nasid, widget, sn_irq_info, vector, -1, -1); + status = sn_intr_alloc(nasid, widget, sn_irq_info, irq, -1, -1); if (status) { kfree(sn_irq_info); return -ENOMEM; @@ -119,28 +118,27 @@ sn_msi_setup(struct pci_dev *pdev, unsig return -ENOMEM; } - sn_msi_info[vector].sn_irq_info = sn_irq_info; - sn_msi_info[vector].pci_addr = bus_addr; + sn_msi_info[irq].sn_irq_info = sn_irq_info; + sn_msi_info[irq].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); /* * In the SN platform, bit 16 is a "send vector" bit which * must be present in order to move the vector through the system. */ - *data = 0x100 + (unsigned int)vector; + msg->data = 0x100 + irq; #ifdef CONFIG_SMP - set_irq_affinity_info((vector & 0xff), sn_irq_info->irq_cpuid, 0); + set_irq_affinity_info(irq, sn_irq_info->irq_cpuid, 0); #endif return 0; } static void -sn_msi_target(unsigned int vector, unsigned int cpu, - u32 *addr_hi, u32 *addr_lo) +sn_msi_target(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { int slice; nasid_t nasid; @@ -150,8 +148,10 @@ sn_msi_target(unsigned int vector, unsig struct sn_irq_info *sn_irq_info; struct sn_irq_info *new_irq_info; struct sn_pcibus_provider *provider; + unsigned int cpu; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + cpu = first_cpu(cpu_mask); + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -163,15 +163,15 @@ sn_msi_target(unsigned int vector, unsig pdev = sn_pdev->pdi_linux_pcidev; provider = SN_PCIDEV_BUSPROVIDER(pdev); - bus_addr = (u64)(*addr_hi) << 32 | (u64)(*addr_lo); + bus_addr = (u64)(msg->address_hi) << 32 | (u64)(msg->address_lo); (*provider->dma_unmap)(pdev, bus_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; nasid = cpuid_to_nasid(cpu); slice = cpuid_to_slice(cpu); new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); - sn_msi_info[vector].sn_irq_info = new_irq_info; + sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) return; @@ -184,12 +184,13 @@ sn_msi_target(unsigned int vector, unsig sizeof(new_irq_info->irq_xtalkaddr), SN_DMA_MSI|SN_DMA_ADDR_XIO); - sn_msi_info[vector].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + sn_msi_info[irq].pci_addr = bus_addr; + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); } struct msi_ops sn_msi_ops = { + .needs_64bit_address = 1, .setup = sn_msi_setup, .teardown = sn_msi_teardown, #ifdef CONFIG_SMP @@ -201,7 +202,7 @@ int sn_msi_init(void) { sn_msi_info = - kzalloc(sizeof(struct sn_msi_info) * NR_VECTORS, GFP_KERNEL); + kzalloc(sizeof(struct sn_msi_info) * NR_IRQS, GFP_KERNEL); if (! 
sn_msi_info) return -ENOMEM; Index: linux/drivers/pci/msi-apic.c =================================================================== --- linux.orig/drivers/pci/msi-apic.c +++ linux/drivers/pci/msi-apic.c @@ -46,37 +46,36 @@ static void -msi_target_apic(unsigned int vector, - unsigned int dest_cpu, - u32 *address_hi, /* in/out */ - u32 *address_lo) /* in/out */ +msi_target_apic(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { - u32 addr = *address_lo; + u32 addr = msg->address_lo; addr &= MSI_ADDR_DESTID_MASK; - addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(dest_cpu)); + addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(first_cpu(cpu_mask))); - *address_lo = addr; + msg->address_lo = addr; } static int msi_setup_apic(struct pci_dev *pdev, /* unused in generic */ - unsigned int vector, - u32 *address_hi, - u32 *address_lo, - u32 *data) + unsigned int irq, + struct msi_msg *msg) { unsigned long dest_phys_id; + unsigned int vector; dest_phys_id = cpu_physical_id(first_cpu(cpu_online_map)); + vector = irq; - *address_hi = 0; - *address_lo = MSI_ADDR_HEADER | - MSI_ADDR_DESTMODE_PHYS | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DESTID_CPU(dest_phys_id); + msg->address_hi = 0; + msg->address_lo = + MSI_ADDR_HEADER | + MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DESTID_CPU(dest_phys_id); - *data = MSI_DATA_TRIGGER_EDGE | + msg->data = + MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(vector); @@ -85,7 +84,7 @@ msi_setup_apic(struct pci_dev *pdev, /* } static void -msi_teardown_apic(unsigned int vector) +msi_teardown_apic(unsigned int irq) { return; /* no-op */ } @@ -95,6 +94,7 @@ msi_teardown_apic(unsigned int vector) */ struct msi_ops msi_apic_ops = { + .needs_64bit_address = 0, .setup = msi_setup_apic, .teardown = msi_teardown_apic, .target = msi_target_apic, Index: linux/drivers/pci/msi.c =================================================================== --- linux.orig/drivers/pci/msi.c +++ linux/drivers/pci/msi.c @@ -6,6 +6,7 @@ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ +#include #include #include #include @@ -22,19 +23,11 @@ #include "pci.h" #include "msi.h" -static DEFINE_SPINLOCK(msi_lock); +static DEFINE_RAW_SPINLOCK(msi_lock); static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL }; static kmem_cache_t* msi_cachep; static int pci_msi_enable = 1; -static int last_alloc_vector; -static int nr_released_vectors; -static int nr_reserved_vectors = NR_HP_RESERVED_VECTORS; -static int nr_msix_devices; - -#ifndef CONFIG_X86_IO_APIC -int vector_irq[NR_VECTORS] = { [0 ... 
NR_VECTORS - 1] = -1}; -#endif static struct msi_ops *msi_ops; @@ -61,11 +54,11 @@ static int msi_cache_init(void) return 0; } -static void msi_set_mask_bit(unsigned int vector, int flag) +static void msi_set_mask_bit(unsigned int irq, int flag) { struct msi_desc *entry; - entry = (struct msi_desc *)msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev || !entry->mask_base) return; switch (entry->msi_attrib.type) { @@ -93,84 +86,119 @@ static void msi_set_mask_bit(unsigned in } } -#ifdef CONFIG_SMP -static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) +static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { - struct msi_desc *entry; - u32 address_hi, address_lo; - unsigned int irq = vector; - unsigned int dest_cpu = first_cpu(cpu_mask); + switch(entry->msi_attrib.type) { + case PCI_CAP_ID_MSI: + { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u16 data; + + pci_read_config_dword(dev, msi_lower_address_reg(pos), + &msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_read_config_dword(dev, msi_upper_address_reg(pos), + &msg->address_hi); + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } else { + msg->address_hi = 0; + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } + msg->data = data; + break; + } + case PCI_CAP_ID_MSIX: + { + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - entry = (struct msi_desc *)msi_desc[vector]; - if (!entry || !entry->dev) - return; + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + break; + } + default: + BUG(); + } +} +static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: { - int pos = pci_find_capability(entry->dev, PCI_CAP_ID_MSI); + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; - if (!pos) - return; - - pci_read_config_dword(entry->dev, msi_upper_address_reg(pos), - &address_hi); - pci_read_config_dword(entry->dev, msi_lower_address_reg(pos), - &address_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - pci_write_config_dword(entry->dev, msi_upper_address_reg(pos), - address_hi); - pci_write_config_dword(entry->dev, msi_lower_address_reg(pos), - address_lo); - set_native_irq_info(irq, cpu_mask); + pci_write_config_dword(dev, msi_lower_address_reg(pos), + msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_write_config_dword(dev, msi_upper_address_reg(pos), + msg->address_hi); + pci_write_config_word(dev, msi_data_reg(pos, 1), + msg->data); + } else { + pci_write_config_word(dev, msi_data_reg(pos, 0), + msg->data); + } break; } case PCI_CAP_ID_MSIX: { - int offset_hi = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET; - int offset_lo = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET; - - address_hi = readl(entry->mask_base + offset_hi); - address_lo = readl(entry->mask_base + offset_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - writel(address_hi, entry->mask_base + offset_hi); - writel(address_lo, entry->mask_base + offset_lo); - set_native_irq_info(irq, cpu_mask); + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + 
writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } default: - break; + BUG(); } } + +#ifdef CONFIG_SMP +static void set_msi_affinity(unsigned int irq, cpumask_t cpu_mask) +{ + struct msi_desc *entry; + struct msi_msg msg; + + entry = msi_desc[irq]; + if (!entry || !entry->dev) + return; + + read_msi_msg(entry, &msg); + msi_ops->target(irq, cpu_mask, &msg); + write_msi_msg(entry, &msg); + set_native_irq_info(irq, cpu_mask); +} #else #define set_msi_affinity NULL #endif /* CONFIG_SMP */ -static void mask_MSI_irq(unsigned int vector) +static void mask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 1); + msi_set_mask_bit(irq, 1); } -static void unmask_MSI_irq(unsigned int vector) +static void unmask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 0); + msi_set_mask_bit(irq, 0); } -static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_wo_maskbit(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev) { spin_unlock_irqrestore(&msi_lock, flags); return 0; @@ -181,39 +209,39 @@ static unsigned int startup_msi_irq_wo_m return 0; /* never anything pending */ } -static unsigned int startup_msi_irq_w_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_w_maskbit(unsigned int irq) { - startup_msi_irq_wo_maskbit(vector); - unmask_MSI_irq(vector); + startup_msi_irq_wo_maskbit(irq); + unmask_MSI_irq(irq); return 0; /* never anything pending */ } -static void shutdown_msi_irq(unsigned int vector) +static void shutdown_msi_irq(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (entry && entry->dev) entry->msi_attrib.state = 0; /* Mark it not active */ spin_unlock_irqrestore(&msi_lock, flags); } -static void end_msi_irq_wo_maskbit(unsigned int vector) +static void end_msi_irq_wo_maskbit(unsigned int irq) { - move_native_irq(vector); + move_native_irq(irq); ack_APIC_irq(); } -static void end_msi_irq_w_maskbit(unsigned int vector) +static void end_msi_irq_w_maskbit(unsigned int irq) { - move_native_irq(vector); - unmask_MSI_irq(vector); + move_native_irq(irq); + unmask_MSI_irq(irq); ack_APIC_irq(); } -static void do_nothing(unsigned int vector) +static void do_nothing(unsigned int irq) { } @@ -264,86 +292,7 @@ static struct hw_interrupt_type msi_irq_ .set_affinity = set_msi_affinity }; -static int msi_free_vector(struct pci_dev* dev, int vector, int reassign); -static int assign_msi_vector(void) -{ - static int new_vector_avail = 1; - int vector; - unsigned long flags; - - /* - * msi_lock is provided to ensure that successful allocation of MSI - * vector is assigned unique among drivers. - */ - spin_lock_irqsave(&msi_lock, flags); - - if (!new_vector_avail) { - int free_vector = 0; - - /* - * vector_irq[] = -1 indicates that this specific vector is: - * - assigned for MSI (since MSI have no associated IRQ) or - * - assigned for legacy if less than 16, or - * - having no corresponding 1:1 vector-to-IOxAPIC IRQ mapping - * vector_irq[] = 0 indicates that this vector, previously - * assigned for MSI, is freed by hotplug removed operations. - * This vector will be reused for any subsequent hotplug added - * operations. - * vector_irq[] > 0 indicates that this vector is assigned for - * IOxAPIC IRQs. 
This vector and its value provides a 1-to-1 - * vector-to-IOxAPIC IRQ mapping. - */ - for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) { - if (vector_irq[vector] != 0) - continue; - free_vector = vector; - if (!msi_desc[vector]) - break; - else - continue; - } - if (!free_vector) { - spin_unlock_irqrestore(&msi_lock, flags); - return -EBUSY; - } - vector_irq[free_vector] = -1; - nr_released_vectors--; - spin_unlock_irqrestore(&msi_lock, flags); - if (msi_desc[free_vector] != NULL) { - struct pci_dev *dev; - int tail; - - /* free all linked vectors before re-assign */ - do { - spin_lock_irqsave(&msi_lock, flags); - dev = msi_desc[free_vector]->dev; - tail = msi_desc[free_vector]->link.tail; - spin_unlock_irqrestore(&msi_lock, flags); - msi_free_vector(dev, tail, 1); - } while (free_vector != tail); - } - - return free_vector; - } - vector = assign_irq_vector(AUTO_ASSIGN); - last_alloc_vector = vector; - if (vector == LAST_DEVICE_VECTOR) - new_vector_avail = 0; - - spin_unlock_irqrestore(&msi_lock, flags); - return vector; -} - -static int get_new_vector(void) -{ - int vector = assign_msi_vector(); - - if (vector > 0) - set_intr_gate(vector, interrupt[vector]); - - return vector; -} - +static int msi_free_irq(struct pci_dev* dev, int irq); static int msi_init(void) { static int status = -ENOMEM; @@ -367,13 +316,13 @@ static int msi_init(void) } if (! msi_ops) { + pci_msi_enable = 0; printk(KERN_WARNING "PCI: MSI ops not registered. MSI disabled.\n"); status = -EINVAL; return status; } - last_alloc_vector = assign_irq_vector(AUTO_ASSIGN); status = msi_cache_init(); if (status < 0) { pci_msi_enable = 0; @@ -381,23 +330,9 @@ static int msi_init(void) return status; } - if (last_alloc_vector < 0) { - pci_msi_enable = 0; - printk(KERN_WARNING "PCI: No interrupt vectors available for MSI\n"); - status = -EBUSY; - return status; - } - vector_irq[last_alloc_vector] = 0; - nr_released_vectors++; - return status; } -static int get_msi_vector(struct pci_dev *dev) -{ - return get_new_vector(); -} - static struct msi_desc* alloc_msi_entry(void) { struct msi_desc *entry; @@ -413,29 +348,45 @@ static struct msi_desc* alloc_msi_entry( return entry; } -static void attach_msi_entry(struct msi_desc *entry, int vector) +static void attach_msi_entry(struct msi_desc *entry, int irq) { unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - msi_desc[vector] = entry; + msi_desc[irq] = entry; spin_unlock_irqrestore(&msi_lock, flags); } -static void irq_handler_init(int cap_id, int pos, int mask) +static int create_msi_irq(struct hw_interrupt_type *handler) { - unsigned long flags; + struct msi_desc *entry; + int irq; + + entry = alloc_msi_entry(); + if (!entry) + return -ENOMEM; - spin_lock_irqsave(&irq_desc[pos].lock, flags); - if (cap_id == PCI_CAP_ID_MSIX) - irq_desc[pos].chip = &msix_irq_type; - else { - if (!mask) - irq_desc[pos].chip = &msi_irq_wo_maskbit_type; - else - irq_desc[pos].chip = &msi_irq_w_maskbit_type; + irq = create_irq(); + if (irq < 0) { + kmem_cache_free(msi_cachep, entry); + return -EBUSY; } - spin_unlock_irqrestore(&irq_desc[pos].lock, flags); + + set_irq_chip(irq, handler); + set_irq_data(irq, entry); + + return irq; +} + +static void destroy_msi_irq(unsigned int irq) +{ + struct msi_desc *entry; + + entry = get_irq_data(irq); + set_irq_chip(irq, NULL); + set_irq_data(irq, NULL); + destroy_irq(irq); + kmem_cache_free(msi_cachep, entry); } static void enable_msi_mode(struct pci_dev *dev, int pos, int type) @@ -480,21 +431,21 @@ void disable_msi_mode(struct pci_dev *de } } 
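Taken together, the msi.c hunks above replace the old vector_irq[] bookkeeping with the generic irq layer: an MSI interrupt is now an ordinary dynamically created irq with an msi_desc hung off it. A sketch of the resulting lifecycle, using only the helpers introduced in this hunk (error handling omitted):

	/* allocation side, as used by msi_capability_init(): */
	int irq = create_msi_irq(&msi_irq_w_maskbit_type); /* create_irq() + chip + data */
	struct msi_desc *entry = get_irq_data(irq);        /* the descriptor for this irq */
	attach_msi_entry(entry, irq);                      /* publish it in msi_desc[]    */

	/* teardown side, as used by msi_free_irq(): */
	destroy_msi_irq(irq);   /* clears chip/data, destroy_irq(), frees the entry */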
-static int msi_lookup_vector(struct pci_dev *dev, int type) +static int msi_lookup_irq(struct pci_dev *dev, int type) { - int vector; + int irq; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) { - if (!msi_desc[vector] || msi_desc[vector]->dev != dev || - msi_desc[vector]->msi_attrib.type != type || - msi_desc[vector]->msi_attrib.default_vector != dev->irq) + for (irq = 0; irq < NR_IRQS; irq++) { + if (!msi_desc[irq] || msi_desc[irq]->dev != dev || + msi_desc[irq]->msi_attrib.type != type || + msi_desc[irq]->msi_attrib.default_irq != dev->irq) continue; spin_unlock_irqrestore(&msi_lock, flags); - /* This pre-assigned MSI vector for this device - already exits. Override dev->irq with this vector */ - dev->irq = vector; + /* This pre-assigned MSI irq for this device + already exists. Override dev->irq with this irq */ + dev->irq = irq; return 0; } spin_unlock_irqrestore(&msi_lock, flags); @@ -506,11 +457,6 @@ void pci_scan_msi_device(struct pci_dev { if (!dev) return; - - if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0) - nr_msix_devices++; - else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0) - nr_reserved_vectors++; } #ifdef CONFIG_PM @@ -584,7 +530,7 @@ int pci_save_msix_state(struct pci_dev * { int pos; int temp; - int vector, head, tail = 0; + int irq, head, tail = 0; u16 control; struct pci_cap_saved_state *save_state; @@ -606,33 +552,20 @@ int pci_save_msix_state(struct pci_dev * /* save the table */ temp = dev->irq; - if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { + if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { kfree(save_state); return -EINVAL; } - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { - int j; - void __iomem *base; struct msi_desc *entry; - entry = msi_desc[vector]; - base = entry->mask_base; - j = entry->msi_attrib.entry_nr; - - entry->address_lo_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - entry->address_hi_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - entry->data_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); + entry = msi_desc[irq]; + read_msi_msg(entry, &entry->msg_save); - tail = msi_desc[vector]->link.tail; - vector = tail; + tail = msi_desc[irq]->link.tail; + irq = tail; } dev->irq = temp; @@ -645,9 +578,7 @@ void pci_restore_msix_state(struct pci_d { u16 save; int pos; - int vector, head, tail = 0; - void __iomem *base; - int j; + int irq, head, tail = 0; struct msi_desc *entry; int temp; struct pci_cap_saved_state *save_state; @@ -665,26 +596,15 @@ void pci_restore_msix_state(struct pci_d /* route the table */ temp = dev->irq; - if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) + if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) return; - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { - entry = msi_desc[vector]; - base = entry->mask_base; - j = entry->msi_attrib.entry_nr; - - writel(entry->address_lo_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(entry->address_hi_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(entry->data_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); + entry = msi_desc[irq]; + write_msi_msg(entry, &entry->msg_save); - tail = msi_desc[vector]->link.tail; - vector = tail; + tail = msi_desc[irq]->link.tail; + irq = tail; } dev->irq = temp; @@ -696,29 +616,19 @@ void pci_restore_msix_state(struct pci_d static int 
msi_register_init(struct pci_dev *dev, struct msi_desc *entry) { int status; - u32 address_hi; - u32 address_lo; - u32 data; - int pos, vector = dev->irq; + struct msi_msg msg; + int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = entry->msi_attrib.pos; pci_read_config_word(dev, msi_control_reg(pos), &control); /* Configure MSI capability structure */ - status = msi_ops->setup(dev, vector, &address_hi, &address_lo, &data); + status = msi_ops->setup(dev, dev->irq, &msg); if (status < 0) return status; - pci_write_config_dword(dev, msi_lower_address_reg(pos), address_lo); - if (is_64bit_address(control)) { - pci_write_config_dword(dev, - msi_upper_address_reg(pos), address_hi); - pci_write_config_word(dev, - msi_data_reg(pos, 1), data); - } else - pci_write_config_word(dev, - msi_data_reg(pos, 0), data); + write_msi_msg(entry, &msg); if (entry->msi_attrib.maskbit) { unsigned int maskbits, temp; /* All MSIs are unmasked by default, Mask them all */ @@ -741,53 +651,54 @@ static int msi_register_init(struct pci_ * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with a single - * MSI vector, regardless of device function is capable of handling + * MSI irq, regardless of device function is capable of handling * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI vector or non-zero for otherwise. + * of an entry zero with the new MSI irq or non-zero for otherwise. **/ static int msi_capability_init(struct pci_dev *dev) { int status; struct msi_desc *entry; - int pos, vector; + int pos, irq; u16 control; + struct hw_interrupt_type *handler; pos = pci_find_capability(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); /* MSI Entry Initialization */ - entry = alloc_msi_entry(); - if (!entry) - return -ENOMEM; - - vector = get_msi_vector(dev); - if (vector < 0) { - kmem_cache_free(msi_cachep, entry); - return -EBUSY; - } - entry->link.head = vector; - entry->link.tail = vector; + handler = &msi_irq_wo_maskbit_type; + if (is_mask_bit_support(control)) + handler = &msi_irq_w_maskbit_type; + + irq = create_msi_irq(handler); + if (irq < 0) + return irq; + + entry = get_irq_data(irq); + entry->link.head = irq; + entry->link.tail = irq; entry->msi_attrib.type = PCI_CAP_ID_MSI; entry->msi_attrib.state = 0; /* Mark it not active */ + entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.default_vector = dev->irq; /* Save IOAPIC IRQ */ - dev->irq = vector; + entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ + entry->msi_attrib.pos = pos; + dev->irq = irq; entry->dev = dev; if (is_mask_bit_support(control)) { entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos, is_64bit_address(control)); } - /* Replace with MSI handler */ - irq_handler_init(PCI_CAP_ID_MSI, vector, entry->msi_attrib.maskbit); /* Configure MSI capability structure */ status = msi_register_init(dev, entry); if (status != 0) { - dev->irq = entry->msi_attrib.default_vector; - kmem_cache_free(msi_cachep, entry); + dev->irq = entry->msi_attrib.default_irq; + destroy_msi_irq(irq); return status; } - attach_msi_entry(entry, vector); + attach_msi_entry(entry, irq); /* Set MSI enabled bits */ enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); @@ -801,18 +712,16 @@ static int msi_capability_init(struct pc * @nvec: number of @entries * * Setup the 
MSI-X capability structure of device function with a - * single MSI-X vector. A return of zero indicates the successful setup of - * requested MSI-X entries with allocated vectors or non-zero for otherwise. + * single MSI-X irq. A return of zero indicates the successful setup of + * requested MSI-X entries with allocated irqs or non-zero for otherwise. **/ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec) { struct msi_desc *head = NULL, *tail = NULL, *entry = NULL; - u32 address_hi; - u32 address_lo; - u32 data; + struct msi_msg msg; int status; - int vector, pos, i, j, nr_entries, temp = 0; + int irq, pos, i, j, nr_entries, temp = 0; unsigned long phys_addr; u32 table_offset; u16 control; @@ -834,65 +743,58 @@ static int msix_capability_init(struct p /* MSI-X Table Initialization */ for (i = 0; i < nvec; i++) { - entry = alloc_msi_entry(); - if (!entry) - break; - vector = get_msi_vector(dev); - if (vector < 0) { - kmem_cache_free(msi_cachep, entry); + irq = create_msi_irq(&msix_irq_type); + if (irq < 0) break; - } + entry = get_irq_data(irq); j = entries[i].entry; - entries[i].vector = vector; + entries[i].vector = irq; entry->msi_attrib.type = PCI_CAP_ID_MSIX; entry->msi_attrib.state = 0; /* Mark it not active */ + entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; entry->msi_attrib.maskbit = 1; - entry->msi_attrib.default_vector = dev->irq; + entry->msi_attrib.default_irq = dev->irq; + entry->msi_attrib.pos = pos; entry->dev = dev; entry->mask_base = base; if (!head) { - entry->link.head = vector; - entry->link.tail = vector; + entry->link.head = irq; + entry->link.tail = irq; head = entry; } else { entry->link.head = temp; entry->link.tail = tail->link.tail; - tail->link.tail = vector; - head->link.head = vector; + tail->link.tail = irq; + head->link.head = irq; } - temp = vector; + temp = irq; tail = entry; - /* Replace with MSI-X handler */ - irq_handler_init(PCI_CAP_ID_MSIX, vector, 1); /* Configure MSI-X capability structure */ - status = msi_ops->setup(dev, vector, - &address_hi, - &address_lo, - &data); - if (status < 0) + status = msi_ops->setup(dev, irq, &msg); + if (status < 0) { + destroy_msi_irq(irq); break; + } - writel(address_lo, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(address_hi, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(data, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); - attach_msi_entry(entry, vector); + write_msi_msg(entry, &msg); + attach_msi_entry(entry, irq); } if (i != nvec) { + int avail = i - 1; i--; for (; i >= 0; i--) { - vector = (entries + i)->vector; - msi_free_vector(dev, vector, 0); + irq = (entries + i)->vector; + msi_free_irq(dev, irq); (entries + i)->vector = 0; } - return -EBUSY; + /* If we had some success report the number of irqs + * we succeeded in setting up. + */ + if (avail <= 0) + avail = -EBUSY; + return avail; } /* Set MSI-X enabled bits */ enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); @@ -905,10 +807,10 @@ static int msix_capability_init(struct p * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with - * a single MSI vector upon its software driver call to request for + * a single MSI irq upon its software driver call to request for * MSI mode enabled on its hardware device function. A return of zero * indicates the successful setup of an entry zero with the new MSI - * vector or non-zero for otherwise. 
+ * irq or non-zero for otherwise. **/ int pci_enable_msi(struct pci_dev* dev) { @@ -936,52 +838,29 @@ int pci_enable_msi(struct pci_dev* dev) if (!pos) return -EINVAL; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { - /* Lookup Sucess */ - unsigned long flags; + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (!is_64bit_address(control) && msi_ops->needs_64bit_address) + return -EINVAL; - pci_read_config_word(dev, msi_control_reg(pos), &control); - if (control & PCI_MSI_FLAGS_ENABLE) - return 0; /* Already in MSI mode */ - spin_lock_irqsave(&msi_lock, flags); - if (!vector_irq[dev->irq]) { - msi_desc[dev->irq]->msi_attrib.state = 0; - vector_irq[dev->irq] = -1; - nr_released_vectors--; - spin_unlock_irqrestore(&msi_lock, flags); - status = msi_register_init(dev, msi_desc[dev->irq]); - if (status == 0) - enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); - return status; - } - spin_unlock_irqrestore(&msi_lock, flags); - dev->irq = temp; - } - /* Check whether driver already requested for MSI-X vectors */ + WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSI)); + + /* Check whether driver already requested for MSI-X irqs */ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { printk(KERN_INFO "PCI: %s: Can't enable MSI. " - "Device already has MSI-X vectors assigned\n", + "Device already has MSI-X irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; } status = msi_capability_init(dev); - if (!status) { - if (!pos) - nr_reserved_vectors--; /* Only MSI capable */ - else if (nr_msix_devices > 0) - nr_msix_devices--; /* Both MSI and MSI-X capable, - but choose enabling MSI */ - } - return status; } void pci_disable_msi(struct pci_dev* dev) { struct msi_desc *entry; - int pos, default_vector; + int pos, default_irq; u16 control; unsigned long flags; @@ -998,6 +877,8 @@ void pci_disable_msi(struct pci_dev* dev if (!(control & PCI_MSI_FLAGS_ENABLE)) return; + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); + spin_lock_irqsave(&msi_lock, flags); entry = msi_desc[dev->irq]; if (!entry || !entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) { @@ -1007,32 +888,30 @@ void pci_disable_msi(struct pci_dev* dev if (entry->msi_attrib.state) { spin_unlock_irqrestore(&msi_lock, flags); printk(KERN_WARNING "PCI: %s: pci_disable_msi() called without " - "free_irq() on MSI vector %d\n", + "free_irq() on MSI irq %d\n", pci_name(dev), dev->irq); BUG_ON(entry->msi_attrib.state > 0); } else { - vector_irq[dev->irq] = 0; /* free it */ - nr_released_vectors++; - default_vector = entry->msi_attrib.default_vector; + default_irq = entry->msi_attrib.default_irq; spin_unlock_irqrestore(&msi_lock, flags); - /* Restore dev->irq to its default pin-assertion vector */ - dev->irq = default_vector; - disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), - PCI_CAP_ID_MSI); + msi_free_irq(dev, dev->irq); + + /* Restore dev->irq to its default pin-assertion irq */ + dev->irq = default_irq; } } -static int msi_free_vector(struct pci_dev* dev, int vector, int reassign) +static int msi_free_irq(struct pci_dev* dev, int irq) { struct msi_desc *entry; int head, entry_nr, type; void __iomem *base; unsigned long flags; - msi_ops->teardown(vector); + msi_ops->teardown(irq); spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || entry->dev != dev) { spin_unlock_irqrestore(&msi_lock, flags); return -EINVAL; @@ -1044,101 +923,43 @@ static int 
msi_free_vector(struct pci_de msi_desc[entry->link.head]->link.tail = entry->link.tail; msi_desc[entry->link.tail]->link.head = entry->link.head; entry->dev = NULL; - if (!reassign) { - vector_irq[vector] = 0; - nr_released_vectors++; - } - msi_desc[vector] = NULL; + msi_desc[irq] = NULL; spin_unlock_irqrestore(&msi_lock, flags); - kmem_cache_free(msi_cachep, entry); + destroy_msi_irq(irq); if (type == PCI_CAP_ID_MSIX) { - if (!reassign) - writel(1, base + - entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + writel(1, base + entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); - if (head == vector) + if (head == irq) iounmap(base); } return 0; } -static int reroute_msix_table(int head, struct msix_entry *entries, int *nvec) -{ - int vector = head, tail = 0; - int i, j = 0, nr_entries = 0; - void __iomem *base; - unsigned long flags; - - spin_lock_irqsave(&msi_lock, flags); - while (head != tail) { - nr_entries++; - tail = msi_desc[vector]->link.tail; - if (entries[0].entry == msi_desc[vector]->msi_attrib.entry_nr) - j = vector; - vector = tail; - } - if (*nvec > nr_entries) { - spin_unlock_irqrestore(&msi_lock, flags); - *nvec = nr_entries; - return -EINVAL; - } - vector = ((j > 0) ? j : head); - for (i = 0; i < *nvec; i++) { - j = msi_desc[vector]->msi_attrib.entry_nr; - msi_desc[vector]->msi_attrib.state = 0; /* Mark it not active */ - vector_irq[vector] = -1; /* Mark it busy */ - nr_released_vectors--; - entries[i].vector = vector; - if (j != (entries + i)->entry) { - base = msi_desc[vector]->mask_base; - msi_desc[vector]->msi_attrib.entry_nr = - (entries + i)->entry; - writel( readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET), base + - (entries + i)->entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel( readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET), base + - (entries + i)->entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel( (readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET) & 0xff00) | vector, - base + (entries+i)->entry*PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); - } - vector = msi_desc[vector]->link.tail; - } - spin_unlock_irqrestore(&msi_lock, flags); - - return 0; -} - /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of MSI-X entries - * @nvec: number of MSI-X vectors requested for allocation by device driver + * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number - * of requested vectors upon its software driver call to request for + * of requested irqs upon its software driver call to request for * MSI-X mode enabled on its hardware device function. A return of zero * indicates the successful configuration of MSI-X capability structure - * with new allocated MSI-X vectors. A return of < 0 indicates a failure. + * with new allocated MSI-X irqs. A return of < 0 indicates a failure. * Or a return of > 0 indicates that driver request is exceeding the number - * of vectors available. Driver should use the returned value to re-send + * of irqs available. Driver should use the returned value to re-send * its request. 
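The return convention spelled out in this kernel-doc implies the usual caller-side retry loop. A hypothetical driver fragment for illustration; pdev and the entry numbering are placeholders:

	struct msix_entry entries[4] = {
		{ .entry = 0 }, { .entry = 1 }, { .entry = 2 }, { .entry = 3 },
	};
	int nvec = 4, rc;

	for (;;) {
		rc = pci_enable_msix(pdev, entries, nvec);
		if (rc == 0)
			break;		/* success: entries[i].vector now hold the irqs */
		if (rc < 0)
			return rc;	/* hard failure: fall back to MSI or INTx       */
		nvec = rc;		/* only rc irqs available: retry with that many */
	}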
**/ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { struct pci_bus *bus; - int status, pos, nr_entries, free_vectors; + int status, pos, nr_entries; int i, j, temp; u16 control; - unsigned long flags; if (!pci_msi_enable || !dev || !entries) return -EINVAL; @@ -1159,9 +980,6 @@ int pci_enable_msix(struct pci_dev* dev, return -EINVAL; pci_read_config_word(dev, msi_control_reg(pos), &control); - if (control & PCI_MSIX_FLAGS_ENABLE) - return -EINVAL; /* Already in MSI-X mode */ - nr_entries = multi_msix_capable(control); if (nvec > nr_entries) return -EINVAL; @@ -1176,56 +994,18 @@ int pci_enable_msix(struct pci_dev* dev, } } temp = dev->irq; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - /* Lookup Sucess */ - nr_entries = nvec; - /* Reroute MSI-X table */ - if (reroute_msix_table(dev->irq, entries, &nr_entries)) { - /* #requested > #previous-assigned */ - dev->irq = temp; - return nr_entries; - } - dev->irq = temp; - enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); - return 0; - } - /* Check whether driver already requested for MSI vector */ + WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSIX)); + + /* Check whether driver already requested for MSI irq */ if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0 && - !msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { + !msi_lookup_irq(dev, PCI_CAP_ID_MSI)) { printk(KERN_INFO "PCI: %s: Can't enable MSI-X. " - "Device already has an MSI vector assigned\n", + "Device already has an MSI irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; } - - spin_lock_irqsave(&msi_lock, flags); - /* - * msi_lock is provided to ensure that enough vectors resources are - * available before granting. - */ - free_vectors = pci_vector_resources(last_alloc_vector, - nr_released_vectors); - /* Ensure that each MSI/MSI-X device has one vector reserved by - default to avoid any MSI-X driver to take all available - resources */ - free_vectors -= nr_reserved_vectors; - /* Find the average of free vectors among MSI-X devices */ - if (nr_msix_devices > 0) - free_vectors /= nr_msix_devices; - spin_unlock_irqrestore(&msi_lock, flags); - - if (nvec > free_vectors) { - if (free_vectors > 0) - return free_vectors; - else - return -EBUSY; - } - status = msix_capability_init(dev, entries, nvec); - if (!status && nr_msix_devices > 0) - nr_msix_devices--; - return status; } @@ -1247,47 +1027,42 @@ void pci_disable_msix(struct pci_dev* de if (!(control & PCI_MSIX_FLAGS_ENABLE)) return; + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); + temp = dev->irq; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - int state, vector, head, tail = 0, warning = 0; + if (!msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { + int state, irq, head, tail = 0, warning = 0; unsigned long flags; - vector = head = dev->irq; - spin_lock_irqsave(&msi_lock, flags); + irq = head = dev->irq; + dev->irq = temp; /* Restore pin IRQ */ while (head != tail) { - state = msi_desc[vector]->msi_attrib.state; + spin_lock_irqsave(&msi_lock, flags); + state = msi_desc[irq]->msi_attrib.state; + tail = msi_desc[irq]->link.tail; + spin_unlock_irqrestore(&msi_lock, flags); if (state) warning = 1; - else { - vector_irq[vector] = 0; /* free it */ - nr_released_vectors++; - } - tail = msi_desc[vector]->link.tail; - vector = tail; + else if (irq != head) /* Release MSI-X irq */ + msi_free_irq(dev, irq); + irq = tail; } - spin_unlock_irqrestore(&msi_lock, flags); + msi_free_irq(dev, irq); if (warning) { - dev->irq = temp; printk(KERN_WARNING "PCI: %s: pci_disable_msix() called without " - "free_irq() on all MSI-X 
vectors\n", + "free_irq() on all MSI-X irqs\n", pci_name(dev)); BUG_ON(warning > 0); - } else { - dev->irq = temp; - disable_msi_mode(dev, - pci_find_capability(dev, PCI_CAP_ID_MSIX), - PCI_CAP_ID_MSIX); - } } } /** - * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state + * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state * @dev: pointer to the pci_dev data structure of MSI(X) device function * * Being called during hotplug remove, from which the device function - * is hot-removed. All previous assigned MSI/MSI-X vectors, if + * is hot-removed. All previous assigned MSI/MSI-X irqs, if * allocated for this device function, are reclaimed to unused state, * which may be used later on. **/ @@ -1301,42 +1076,42 @@ void msi_remove_pci_irq_vectors(struct p temp = dev->irq; /* Save IOAPIC IRQ */ pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSI)) { spin_lock_irqsave(&msi_lock, flags); state = msi_desc[dev->irq]->msi_attrib.state; spin_unlock_irqrestore(&msi_lock, flags); if (state) { printk(KERN_WARNING "PCI: %s: msi_remove_pci_irq_vectors() " - "called without free_irq() on MSI vector %d\n", + "called without free_irq() on MSI irq %d\n", pci_name(dev), dev->irq); BUG_ON(state > 0); - } else /* Release MSI vector assigned to this device */ - msi_free_vector(dev, dev->irq, 0); + } else /* Release MSI irq assigned to this device */ + msi_free_irq(dev, dev->irq); dev->irq = temp; /* Restore IOAPIC IRQ */ } pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - int vector, head, tail = 0, warning = 0; + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { + int irq, head, tail = 0, warning = 0; void __iomem *base = NULL; - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { spin_lock_irqsave(&msi_lock, flags); - state = msi_desc[vector]->msi_attrib.state; - tail = msi_desc[vector]->link.tail; - base = msi_desc[vector]->mask_base; + state = msi_desc[irq]->msi_attrib.state; + tail = msi_desc[irq]->link.tail; + base = msi_desc[irq]->mask_base; spin_unlock_irqrestore(&msi_lock, flags); if (state) warning = 1; - else if (vector != head) /* Release MSI-X vector */ - msi_free_vector(dev, vector, 0); - vector = tail; + else if (irq != head) /* Release MSI-X irq */ + msi_free_irq(dev, irq); + irq = tail; } - msi_free_vector(dev, vector, 0); + msi_free_irq(dev, irq); if (warning) { iounmap(base); printk(KERN_WARNING "PCI: %s: msi_remove_pci_irq_vectors() " - "called without free_irq() on all MSI-X vectors\n", + "called without free_irq() on all MSI-X irqs\n", pci_name(dev)); BUG_ON(warning > 0); } Index: linux/drivers/pci/msi.h =================================================================== --- linux.orig/drivers/pci/msi.h +++ linux/drivers/pci/msi.h @@ -6,85 +6,9 @@ #ifndef MSI_H #define MSI_H -/* - * MSI operation vector. Used by the msi core code (drivers/pci/msi.c) - * to abstract platform-specific tasks relating to MSI address generation - * and resource management. 
Index: linux/drivers/pci/msi.h
===================================================================
--- linux.orig/drivers/pci/msi.h
+++ linux/drivers/pci/msi.h
@@ -6,85 +6,9 @@
 #ifndef MSI_H
 #define MSI_H
 
-/*
- * MSI operation vector. Used by the msi core code (drivers/pci/msi.c)
- * to abstract platform-specific tasks relating to MSI address generation
- * and resource management.
- */
-struct msi_ops {
-	/**
-	 * setup - generate an MSI bus address and data for a given vector
-	 * @pdev: PCI device context (in)
-	 * @vector: vector allocated by the msi core (in)
-	 * @addr_hi: upper 32 bits of PCI bus MSI address (out)
-	 * @addr_lo: lower 32 bits of PCI bus MSI address (out)
-	 * @data: MSI data payload (out)
-	 *
-	 * Description: The setup op is used to generate a PCI bus addres and
-	 * data which the msi core will program into the card MSI capability
-	 * registers. The setup routine is responsible for picking an initial
-	 * cpu to target the MSI at. The setup routine is responsible for
-	 * examining pdev to determine the MSI capabilities of the card and
-	 * generating a suitable address/data. The setup routine is
-	 * responsible for allocating and tracking any system resources it
-	 * needs to route the MSI to the cpu it picks, and for associating
-	 * those resources with the passed in vector.
-	 *
-	 * Returns 0 if the MSI address/data was successfully setup.
-	 **/
-
-	int (*setup) (struct pci_dev *pdev, unsigned int vector,
-		      u32 *addr_hi, u32 *addr_lo, u32 *data);
-
-	/**
-	 * teardown - release resources allocated by setup
-	 * @vector: vector context for resources (in)
-	 *
-	 * Description: The teardown op is used to release any resources
-	 * that were allocated in the setup routine associated with the passed
-	 * in vector.
-	 **/
-
-	void (*teardown) (unsigned int vector);
-
-	/**
-	 * target - retarget an MSI at a different cpu
-	 * @vector: vector context for resources (in)
-	 * @cpu: new cpu to direct vector at (in)
-	 * @addr_hi: new value of PCI bus upper 32 bits (in/out)
-	 * @addr_lo: new value of PCI bus lower 32 bits (in/out)
-	 *
-	 * Description: The target op is used to redirect an MSI vector
-	 * at a different cpu. addr_hi/addr_lo coming in are the existing
-	 * values that the MSI core has programmed into the card. The
-	 * target code is responsible for freeing any resources (if any)
-	 * associated with the old address, and generating a new PCI bus
-	 * addr_hi/addr_lo that will redirect the vector at the indicated cpu.
-	 **/
-
-	void (*target) (unsigned int vector, unsigned int cpu,
-			u32 *addr_hi, u32 *addr_lo);
-};
-
-extern int msi_register(struct msi_ops *ops);
-
 #include <asm/msi.h>
 
 /*
- * Assume the maximum number of hot plug slots supported by the system is about
- * ten. The worstcase is that each of these slots is hot-added with a device,
- * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which
- * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined
- * as below to ensure at least one message is assigned to each detected MSI/
- * MSI-X device function.
- */
-#define NR_HP_RESERVED_VECTORS	20
-
-extern int vector_irq[NR_VECTORS];
-extern void (*interrupt[NR_IRQS])(void);
-extern int pci_vector_resources(int last, int nr_released);
-
-/*
  * MSI-X Address Register
  */
#define PCI_MSIX_FLAGS_QSIZE		0x7FF
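
The ops deleted above were supplied per platform; for orientation, an implementation had roughly the shape sketched below. The my_* names are illustrative only, reconstructed from the removed kernel-doc, not from any real platform.

	static int my_msi_setup(struct pci_dev *pdev, unsigned int vector,
				u32 *addr_hi, u32 *addr_lo, u32 *data)
	{
		/* pick a target cpu, allocate routing resources for @vector,
		 * and return the bus address/data to program into the device */
		return 0;
	}

	static void my_msi_teardown(unsigned int vector)
	{
		/* release whatever my_msi_setup() allocated for @vector */
	}

	static void my_msi_target(unsigned int vector, unsigned int cpu,
				  u32 *addr_hi, u32 *addr_lo)
	{
		/* rewrite addr_hi/addr_lo so @vector is delivered to @cpu */
	}

	static struct msi_ops my_msi_ops = {
		.setup		= my_msi_setup,
		.teardown	= my_msi_teardown,
		.target		= my_msi_target,
	};

A platform registered this at init time via msi_register(&my_msi_ops); this patch removes that indirection.
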
@@ -110,8 +34,8 @@ extern int pci_vector_resources(int last
 	(1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1))
 #define multi_msi_enable(control, num) \
 	control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE);
-#define is_64bit_address(control)	(control & PCI_MSI_FLAGS_64BIT)
-#define is_mask_bit_support(control)	(control & PCI_MSI_FLAGS_MASKBIT)
+#define is_64bit_address(control)	(!!(control & PCI_MSI_FLAGS_64BIT))
+#define is_mask_bit_support(control)	(!!(control & PCI_MSI_FLAGS_MASKBIT))
 #define msi_enable(control, num) multi_msi_enable(control, num); \
 	control |= PCI_MSI_FLAGS_ENABLE
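
The added !! is not cosmetic: these macros now feed the 1-bit is_64 bitfield added to struct msi_desc in the next hunk, and an un-normalized result would be silently truncated. A standalone illustration (userspace demo, not patch code):

	#include <stdio.h>

	#define PCI_MSI_FLAGS_64BIT	0x80	/* value as in pci_regs.h */

	struct demo {
		unsigned char is_64 : 1;	/* 1-bit field, as in msi_desc */
	};

	int main(void)
	{
		unsigned short control = 0x80;	/* 64-bit capable device */
		struct demo d;

		d.is_64 = control & PCI_MSI_FLAGS_64BIT;	/* bit 7 truncated: 0 */
		printf("%d\n", d.is_64);
		d.is_64 = !!(control & PCI_MSI_FLAGS_64BIT);	/* normalized: 1 */
		printf("%d\n", d.is_64);
		return 0;
	}
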
@@ -130,10 +54,10 @@ struct msi_desc {
 		__u8	type	: 5;	/* {0: unused, 5h:MSI, 11h:MSI-X} */
 		__u8	maskbit	: 1;	/* mask-pending bit supported ? */
 		__u8	state	: 1;	/* {0: free, 1: busy} */
-		__u8	reserved: 1;	/* reserved */
-		__u8	entry_nr;	/* specific enabled entry */
-		__u8	default_vector;	/* default pre-assigned vector */
-		__u8	unused;		/* formerly unused destination cpu*/
+		__u8	is_64	: 1;	/* Address size: 0=32bit 1=64bit */
+		__u8	pos;		/* Location of the msi capability */
+		__u16	entry_nr;	/* specific enabled entry */
+		unsigned default_irq;	/* default pre-assigned irq */
 	}msi_attrib;
 
 	struct {
@@ -146,10 +70,7 @@ struct msi_desc {
 
 #ifdef CONFIG_PM
 	/* PM save area for MSIX address/data */
-
-	u32	address_hi_save;
-	u32	address_lo_save;
-	u32	data_save;
+	struct msi_msg msg_save;
 #endif
 };
Index: linux/drivers/scsi/aacraid/aacraid.h
===================================================================
--- linux.orig/drivers/scsi/aacraid/aacraid.h
+++ linux/drivers/scsi/aacraid/aacraid.h
@@ -744,7 +744,7 @@ struct aac_fib_context {
 	u32			unique;		// unique value representing this context
 	ulong			jiffies;	// used for cleanup - dmb changed to ulong
 	struct list_head	next;		// used to link context's into a linked list
-	struct semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
+	struct compat_semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
 	int			wait;		// Set to true when thread is in WaitForSingleObject
 	unsigned long		count;		// total number of FIBs on FibList
 	struct list_head	fib_list;	// this holds fibs and their attachd hw_fibs
@@ -814,7 +814,7 @@ struct fib {
 	 * This is the event the sendfib routine will wait on if the
 	 * caller did not pass one and this is synch io.
 	 */
-	struct semaphore	event_wait;
+	struct compat_semaphore	event_wait;
 	spinlock_t		event_lock;
 
 	u32			done;	/* gets set to 1 when fib is complete */
Index: linux/drivers/scsi/qla2xxx/qla_def.h
===================================================================
--- linux.orig/drivers/scsi/qla2xxx/qla_def.h
+++ linux/drivers/scsi/qla2xxx/qla_def.h
@@ -2284,7 +2284,7 @@ typedef struct scsi_qla_host {
 	spinlock_t		mbx_reg_lock;	/* Mbx Cmd Register Lock */
 
 	struct semaphore	mbx_cmd_sem;	/* Serialialize mbx access */
-	struct semaphore	mbx_intr_sem;	/* Used for completion notification */
+	struct compat_semaphore	mbx_intr_sem;	/* Used for completion notification */
 
 	uint32_t	mbx_flags;
 #define	MBX_IN_PROGRESS	BIT_0
Index: linux/drivers/serial/8250.c
===================================================================
--- linux.orig/drivers/serial/8250.c
+++ linux/drivers/serial/8250.c
@@ -2252,14 +2252,10 @@ serial8250_console_write(struct console
 
 	touch_nmi_watchdog();
 
-	local_irq_save(flags);
-	if (up->port.sysrq) {
-		/* serial8250_handle_port() already took the lock */
-		locked = 0;
-	} else if (oops_in_progress) {
-		locked = spin_trylock(&up->port.lock);
-	} else
-		spin_lock(&up->port.lock);
+	if (up->port.sysrq || oops_in_progress)
+		locked = spin_trylock_irqsave(&up->port.lock, flags);
+	else
+		spin_lock_irqsave(&up->port.lock, flags);
 
 	/*
 	 * First save the IER then disable the interrupts
@@ -2281,8 +2277,7 @@ serial8250_console_write(struct console
 	serial_out(up, UART_IER, ier);
 
 	if (locked)
-		spin_unlock(&up->port.lock);
-	local_irq_restore(flags);
+		spin_unlock_irqrestore(&up->port.lock, flags);
 }
 
 static int serial8250_console_setup(struct console *co, char *options)
Index: linux/drivers/usb/core/devio.c
===================================================================
--- linux.orig/drivers/usb/core/devio.c
+++ linux/drivers/usb/core/devio.c
@@ -304,10 +304,11 @@ static void async_completed(struct urb *
 	struct async *as = (struct async *)urb->context;
 	struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
+	unsigned long flags;
 
-	spin_lock(&ps->lock);
-	list_move_tail(&as->asynclist, &ps->async_completed);
-	spin_unlock(&ps->lock);
+	spin_lock_irqsave(&ps->lock, flags);
+	list_move_tail(&as->asynclist, &ps->async_completed);
+	spin_unlock_irqrestore(&ps->lock, flags);
 	if (as->signr) {
 		sinfo.si_signo = as->signr;
 		sinfo.si_errno = as->urb->status;
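
The serial and USB hunks above apply one mechanical rule used throughout this patch: never wrap a spinlock in a bare local_irq_save()/local_irq_restore() pair. On PREEMPT_RT a spin_lock() may be a sleeping rt-mutex, and sleeping with hard interrupts disabled would deadlock; the combined *_irqsave() forms let the -rt lock substitution manage interrupt state itself. A self-contained before/after sketch (the demo_* names are ours, not kernel code):

	static DEFINE_SPINLOCK(demo_lock);
	static int demo_counter;

	static void demo_broken_on_rt(void)
	{
		unsigned long flags;

		local_irq_save(flags);		/* hard irqs off ... */
		spin_lock(&demo_lock);		/* ... but this may sleep on -rt */
		demo_counter++;
		spin_unlock(&demo_lock);
		local_irq_restore(flags);
	}

	static void demo_correct(void)
	{
		unsigned long flags;

		/* one operation: mainline disables irqs, -rt takes the mutex */
		spin_lock_irqsave(&demo_lock, flags);
		demo_counter++;
		spin_unlock_irqrestore(&demo_lock, flags);
	}
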
Index: linux/drivers/usb/core/hcd.c
===================================================================
--- linux.orig/drivers/usb/core/hcd.c
+++ linux/drivers/usb/core/hcd.c
@@ -515,13 +515,11 @@ error:
 	}
 
 	/* any errors get returned through the urb completion */
-	local_irq_save (flags);
-	spin_lock (&urb->lock);
+	spin_lock_irqsave(&urb->lock, flags);
 	if (urb->status == -EINPROGRESS)
 		urb->status = status;
-	spin_unlock (&urb->lock);
+	spin_unlock_irqrestore(&urb->lock, flags);
 	usb_hcd_giveback_urb (hcd, urb, NULL);
-	local_irq_restore (flags);
 	return 0;
 }
 
@@ -549,8 +547,7 @@ void usb_hcd_poll_rh_status(struct usb_h
 	if (length > 0) {
 
 		/* try to complete the status urb */
-		local_irq_save (flags);
-		spin_lock(&hcd_root_hub_lock);
+		spin_lock_irqsave(&hcd_root_hub_lock, flags);
 		urb = hcd->status_urb;
 		if (urb) {
 			spin_lock(&urb->lock);
@@ -566,14 +563,13 @@ void usb_hcd_poll_rh_status(struct usb_h
 			spin_unlock(&urb->lock);
 		} else
 			length = 0;
-		spin_unlock(&hcd_root_hub_lock);
+		spin_unlock_irqrestore(&hcd_root_hub_lock, flags);
 
 		/* local irqs are always blocked in completions */
 		if (length > 0)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
 		else
 			hcd->poll_pending = 1;
-		local_irq_restore (flags);
 	}
 
 /* The USB 2.0 spec says 256 ms.  This is close enough and won't
@@ -656,17 +652,15 @@ static int usb_rh_urb_dequeue (struct us
 	} else {				/* Status URB */
 		if (!hcd->uses_new_polling)
 			del_timer_sync (&hcd->rh_timer);
-		local_irq_disable ();
-		spin_lock (&hcd_root_hub_lock);
+		spin_lock_irq(&hcd_root_hub_lock);
 		if (urb == hcd->status_urb) {
 			hcd->status_urb = NULL;
 			urb->hcpriv = NULL;
 		} else
 			urb = NULL;		/* wasn't fully queued */
-		spin_unlock (&hcd_root_hub_lock);
+		spin_unlock_irq(&hcd_root_hub_lock);
 		if (urb)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
-		local_irq_enable ();
 	}
 
 	return 0;
@@ -1371,15 +1365,13 @@ hcd_endpoint_disable (struct usb_device
 	WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT &&
 			udev->state != USB_STATE_NOTATTACHED);
 
-	local_irq_disable ();
-
 	/* FIXME move most of this into message.c as part of its
 	 * endpoint disable logic
 	 */
 
 	/* ep is already gone from udev->ep_{in,out}[]; no more submits */
rescan:
-	spin_lock (&hcd_data_lock);
+	spin_lock_irq(&hcd_data_lock);
 	list_for_each_entry (urb, &ep->urb_list, urb_list) {
 		int	tmp;
 
@@ -1392,13 +1384,13 @@ rescan:
 		if (urb->status != -EINPROGRESS)
 			continue;
 		usb_get_urb (urb);
-		spin_unlock (&hcd_data_lock);
+		spin_unlock_irq(&hcd_data_lock);
 
-		spin_lock (&urb->lock);
+		spin_lock_irq(&urb->lock);
 		tmp = urb->status;
 		if (tmp == -EINPROGRESS)
 			urb->status = -ESHUTDOWN;
-		spin_unlock (&urb->lock);
+		spin_unlock_irq(&urb->lock);
 
 		/* kick hcd unless it's already returning this */
 		if (tmp == -EINPROGRESS) {
@@ -1421,8 +1413,7 @@ rescan:
 		/* list contents may have changed */
 		goto rescan;
 	}
-	spin_unlock (&hcd_data_lock);
-	local_irq_enable ();
+	spin_unlock_irq(&hcd_data_lock);
 
 	/* synchronize with the hardware, so old configuration state
 	 * clears out immediately (and will be freed).
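
A recurring theme in the usbcore hunks: mainline runs URB completion handlers with local interrupts disabled, while on -rt they may run from a thread, so nothing may rely on implicitly-disabled irqs. A handler written to be correct in both worlds might look like this sketch (my_dev and its fields are hypothetical, not from the patch):

	static void my_urb_complete(struct urb *urb, struct pt_regs *regs)
	{
		struct my_dev *md = urb->context;	/* hypothetical driver state */
		unsigned long flags;

		/* disable irqs explicitly instead of assuming hardirq context */
		spin_lock_irqsave(&md->lock, flags);
		md->completed++;
		spin_unlock_irqrestore(&md->lock, flags);
	}
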
Index: linux/drivers/usb/core/message.c
===================================================================
--- linux.orig/drivers/usb/core/message.c
+++ linux/drivers/usb/core/message.c
@@ -264,8 +264,9 @@ static void sg_clean (struct usb_sg_requ
 static void sg_complete (struct urb *urb, struct pt_regs *regs)
 {
 	struct usb_sg_request	*io = (struct usb_sg_request *) urb->context;
+	unsigned long flags;
 
-	spin_lock (&io->lock);
+	spin_lock_irqsave (&io->lock, flags);
 
 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -299,7 +300,7 @@ static void sg_complete (struct urb *urb
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock (&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -314,7 +315,7 @@ static void sg_complete (struct urb *urb
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	urb->dev = NULL;
 
@@ -324,7 +325,7 @@ static void sg_complete (struct urb *urb
 	if (!io->count)
 		complete (&io->complete);
 
-	spin_unlock (&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
 
 
@@ -586,7 +587,7 @@ void usb_sg_cancel (struct usb_sg_reques
 				dev_warn (&io->dev->dev, "%s, unlink --> %d\n",
 					__FUNCTION__, retval);
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	spin_unlock_irqrestore (&io->lock, flags);
 }
Index: linux/drivers/usb/net/usbnet.c
===================================================================
--- linux.orig/drivers/usb/net/usbnet.c
+++ linux/drivers/usb/net/usbnet.c
@@ -818,6 +818,8 @@ static void tx_complete (struct urb *urb
 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
Index: linux/drivers/usb/storage/usb.h
===================================================================
--- linux.orig/drivers/usb/storage/usb.h
+++ linux/drivers/usb/storage/usb.h
@@ -147,7 +147,7 @@ struct us_data {
 	dma_addr_t		iobuf_dma;
 
 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		/* to sleep thread on	    */
+	struct compat_semaphore	sema;		/* to sleep thread on	    */
 	struct completion	notify;		/* thread begin/end	    */
 	wait_queue_head_t	delay_wait;	/* wait during scan, reset  */
Index: linux/drivers/video/console/fbcon.c
===================================================================
--- linux.orig/drivers/video/console/fbcon.c
+++ linux/drivers/video/console/fbcon.c
@@ -1247,7 +1247,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1276,10 +1275,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -3079,6 +3079,7 @@ static const struct consw fb_con = {
 	.con_screen_pos		= fbcon_screen_pos,
 	.con_getxy		= fbcon_getxy,
 	.con_resize		= fbcon_resize,
+	.con_preemptible	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
Index: linux/drivers/video/console/vgacon.c
===================================================================
--- linux.orig/drivers/video/console/vgacon.c
+++ linux/drivers/video/console/vgacon.c
@@ -52,7 +52,7 @@
 #include