Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -1637,6 +1637,12 @@ running once the system is up.
 	time		Show timing data prefixed to each printk message line
 
+	timeout_granularity=
+			[KNL]
+			Timeout granularity: process timer wheel timers every
+			timeout_granularity jiffies. Defaults to 1 (process
+			timers HZ times per second - most fine-grained).
+
 	clocksource=	[GENERIC_TIME] Override the default clocksource
 			Override the default clocksource and use the clocksource
 			with the name specified.
Index: linux/Documentation/rt-mutex-design.txt
===================================================================
--- linux.orig/Documentation/rt-mutex-design.txt
+++ linux/Documentation/rt-mutex-design.txt
@@ -333,11 +333,11 @@ cmpxchg is basically the following funct
 
 unsigned long _cmpxchg(unsigned long *A, unsigned long *B, unsigned long *C)
 {
-	unsigned long T = *A;
-	if (*A == *B) {
-		*A = *C;
-	}
-	return T;
+	unsigned long T = *A;
+	if (*A == *B) {
+		*A = *C;
+	}
+	return T;
 }
 #define cmpxchg(a,b,c) _cmpxchg(&a,&b,&c)
 
@@ -582,7 +582,7 @@ contention).
 try_to_take_rt_mutex is used every time the task tries to grab a mutex in the
 slow path.  The first thing that is done here is an atomic setting of the
 "Has Waiters" flag of the mutex's owner field.  Yes, this could really
-be false, because if the the mutex has no owner, there are no waiters and
+be false, because if the mutex has no owner, there are no waiters and
 the current task also won't have any waiters.  But we don't have the lock
 yet, so we assume we are going to be a waiter.  The reason for this is to
 play nice for those architectures that do have CMPXCHG.  By setting this flag
@@ -735,7 +735,7 @@ do have CMPXCHG, that check is done in t
 in the slow path too.  If a waiter of a mutex woke up because of a signal
 or timeout between the time the owner failed the fast path CMPXCHG check
 and the grabbing of the wait_lock, the mutex may not have any waiters, thus the
-owner still needs to make this check. If there are no waiters than the mutex
+owner still needs to make this check. If there are no waiters then the mutex
 owner field is set to NULL, the wait_lock is released and nothing more is
 needed.
Index: linux/Makefile
===================================================================
--- linux.orig/Makefile
+++ linux/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 18
-EXTRAVERSION =
+EXTRAVERSION = -rt6
 NAME=Avast! A bilge rat!
 
 # *DOCUMENTATION*
@@ -485,10 +485,14 @@ endif
 
 include $(srctree)/arch/$(ARCH)/Makefile
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
 else
-CFLAGS += -fomit-frame-pointer
+  ifdef CONFIG_FRAME_POINTER
+  CFLAGS += -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+  else
+  CFLAGS += -fomit-frame-pointer
+  endif
 endif
 
 ifdef CONFIG_UNWIND_INFO
Index: linux/arch/arm/Kconfig
===================================================================
--- linux.orig/arch/arm/Kconfig
+++ linux/arch/arm/Kconfig
@@ -17,6 +17,10 @@ config ARM
 	  Europe.  There is an ARM Linux project with a web page at
 	  .
 
+config GENERIC_TIME
+	bool
+	default y
+
 config MMU
 	bool
 	default y
@@ -51,6 +55,18 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
+config STACKTRACE_SUPPORT
+	bool
+	default y
+
+config LOCKDEP_SUPPORT
+	bool
+	default y
+
+config TRACE_IRQFLAGS_SUPPORT
+	bool
+	default y
+
 config HARDIRQS_SW_RESEND
 	bool
 	default y
@@ -344,6 +360,15 @@ source "arch/arm/mach-at91rm9200/Kconfig
 
 source "arch/arm/mach-netx/Kconfig"
 
+config IS_TICK_BASED
+	bool
+	depends on GENERIC_TIME
+	default y
+	help
+	  This is used on platforms that have not added a clocksource to
+	  support GENERIC_TIME. Platforms which have a clocksource
+	  should set this to 'n' in their mach-*/Kconfig.
+
 # Definitions to make life easier
 config ARCH_ACORN
 	bool
@@ -419,6 +444,8 @@ endmenu
 
 menu "Kernel Features"
 
+source "kernel/time/Kconfig"
+
 config SMP
 	bool "Symmetric Multi-Processing (EXPERIMENTAL)"
 	depends on EXPERIMENTAL && REALVIEW_MPCORE
@@ -463,38 +490,7 @@ config LOCAL_TIMERS
 	  accounting to be spread across the timer interval, preventing
 	  a "thundering herd" at every timer tick.
 
-config PREEMPT
-	bool "Preemptible Kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
-
-config NO_IDLE_HZ
-	bool "Dynamic tick timer"
-	help
-	  Select this option if you want to disable continuous timer ticks
-	  and have them programmed to occur as required. This option saves
-	  power as the system can remain in idle state for longer.
-
-	  By default dynamic tick is disabled during the boot, and can be
-	  manually enabled with:
-
-	    echo 1 > /sys/devices/system/timer/timer0/dyn_tick
-
-	  Alternatively, if you want dynamic tick automatically enabled
-	  during boot, pass "dyntick=enable" via the kernel command string.
-
-	  Please note that dynamic tick may affect the accuracy of
-	  timekeeping on some platforms depending on the implementation.
-	  Currently at least OMAP, PXA2xx and SA11x0 platforms are known
-	  to have accurate timekeeping with dynamic tick.
+source kernel/Kconfig.preempt
 
 config HZ
 	int
Index: linux/arch/arm/boot/compressed/head.S
===================================================================
--- linux.orig/arch/arm/boot/compressed/head.S
+++ linux/arch/arm/boot/compressed/head.S
@@ -231,7 +231,8 @@ not_relocated:	mov	r0, #0
 		 */
 		cmp	r4, r2
 		bhs	wont_overwrite
-		add	r0, r4, #4096*1024	@ 4MB largest kernel size
+		sub	r3, sp, r5		@ > compressed kernel image
+		add	r0, r4, r3, lsl #2	@ allow for 4x expansion
 		cmp	r0, r5
 		bls	wont_overwrite
 
@@ -822,6 +823,19 @@ memdump:	mov	r12, r0
 		mov	pc, r10
 #endif
 
+#ifdef CONFIG_MCOUNT
+/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this
+ * trampoline
+ */
+		.text
+		.align 0
+		.type mcount %function
+		.global mcount
+mcount:
+		mov pc, lr	@ just return
+#endif
+
+
 reloc_end:
 
 		.align
Index: linux/arch/arm/common/time-acorn.c
===================================================================
--- linux.orig/arch/arm/common/time-acorn.c
+++ linux/arch/arm/common/time-acorn.c
@@ -77,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i
 
 static struct irqaction ioc_timer_irq = {
 	.name		= "timer",
-	.flags		= IRQF_DISABLED,
+	.flags		= IRQF_DISABLED | IRQF_NODELAY,
 	.handler	= ioc_timer_interrupt
 };
 
Index: linux/arch/arm/kernel/dma.c
===================================================================
--- linux.orig/arch/arm/kernel/dma.c
+++ linux/arch/arm/kernel/dma.c
@@ -20,7 +20,7 @@
 
 #include 
 
-DEFINE_SPINLOCK(dma_spin_lock);
+DEFINE_RAW_SPINLOCK(dma_spin_lock);
 EXPORT_SYMBOL(dma_spin_lock);
 
 static dma_t dma_chan[MAX_DMA_CHANNELS];
Index: linux/arch/arm/kernel/entry-armv.S
===================================================================
--- linux.orig/arch/arm/kernel/entry-armv.S
+++ linux/arch/arm/kernel/entry-armv.S
@@ -191,6 +191,9 @@ __dabt_svc:
 __irq_svc:
 	svc_entry
 
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bl	trace_hardirqs_off
+#endif
 #ifdef CONFIG_PREEMPT
 	get_thread_info tsk
 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
@@ -201,7 +204,7 @@ __irq_svc:
 	irq_handler
 #ifdef CONFIG_PREEMPT
 	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	blne	svc_preempt
 preempt_return:
 	ldr	r0, [tsk, #TI_PREEMPT]		@ read preempt value
@@ -211,6 +214,10 @@ preempt_return:
 #endif
 	ldr	r0, [sp, #S_PSR]		@ irqs are already disabled
 	msr	spsr_cxsf, r0
+#ifdef CONFIG_TRACE_IRQFLAGS
+	tst	r0, #PSR_I_BIT
+	bleq	trace_hardirqs_on
+#endif
 	ldmia	sp, {r0 - pc}^			@ load r0 - pc, cpsr
 
 	.ltorg
@@ -228,7 +235,7 @@ svc_preempt:
 	str	r7, [tsk, #TI_PREEMPT]		@ expects preempt_count == 0
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	beq	preempt_return			@ go again
 	b	1b
 #endif
@@ -398,6 +405,9 @@ __dabt_usr:
 __irq_usr:
 	usr_entry
 
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bl	trace_hardirqs_off
+#endif
 	get_thread_info tsk
 #ifdef CONFIG_PREEMPT
 	ldr	r8, [tsk, #TI_PREEMPT]		@ get preempt count
@@ -412,6 +422,9 @@ __irq_usr:
 	teq	r0, r7
 	strne	r0, [r0, -r0]
 #endif
+#ifdef CONFIG_TRACE_IRQFLAGS
+	bl	trace_hardirqs_on
+#endif
 	mov	why, #0
 	b	ret_to_user
 
Index: linux/arch/arm/kernel/entry-common.S
===================================================================
--- linux.orig/arch/arm/kernel/entry-common.S
+++ linux/arch/arm/kernel/entry-common.S
@@ -3,6 +3,8 @@
 *
 *  Copyright (C) 2000 Russell King
 *
+ * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com
+ *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
@@ -40,7 +42,7 @@ ret_fast_syscall:
 fast_work_pending:
 	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0
 work_pending:
-	tst	r1, #_TIF_NEED_RESCHED
+	tst	r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	bne	work_resched
 	tst	r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING
 	beq	no_work_pending
@@ -50,7 +52,8 @@ work_pending:
 	b	ret_slow_syscall		@ Check work again
 
 work_resched:
-	bl	schedule
+	bl	__schedule
+
 /*
 * "slow" syscall return path.  "why" tells us if this was a real syscall.
 */
@@ -387,6 +390,112 @@ ENTRY(sys_oabi_call_table)
 #include "calls.S"
 #undef ABI
 #undef OBSOLETE
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+
+#ifdef CONFIG_MCOUNT
+/*
+ * At the point where we are in mcount() we maintain the
+ * frame of the prologue code and keep the call to mcount()
+ * out of the stack frame list:
+
+	    saved pc <---\	caller of instrumented routine
+	    saved lr      |
+	    ip/prev_sp    |
+	    fp -----^     |
+	    :             |
+	                  |
+	 -> saved pc      |	instrumented routine
+	|   saved lr      |
+	|   ip/prev_sp    |
+	|   fp ---------/
+	|   :
+	|
+	|		mcount
+	|   saved pc
+	|   saved lr
+	|   ip/prev sp
+	 -- fp
+	    r3
+	    r2
+	    r1
+   sp->	    r0
+	    :
+ */
+
+	.text
+	.align 0
+	.type mcount %function
+	.global mcount
+
+/* gcc -pg generated FUNCTION_PROLOGUE references mcount()
+ * and has already created the stack frame invocation for
+ * the routine we have been called to instrument. We create
+ * a complete frame nevertheless, as we want to use the same
+ * call to mcount() from c code.
+ */
+mcount:
+
+	ldr	ip, =mcount_enabled	@ leave early, if disabled
+	ldr	ip, [ip]
+	cmp	ip, #0
+	moveq	pc, lr
+
+	mov	ip, sp
+	stmdb	sp!, {r0 - r3, fp, ip, lr, pc}	@ create stack frame
+
+	ldr	r1, [fp, #-4]		@ get lr (the return address
+					@ of the caller of the
+					@ instrumented function)
+	mov	r0, lr			@ get lr - (the return address
+					@ of the instrumented function)
+
+	sub	fp, ip, #4		@ point fp at this frame
+
+	bl	__trace
+1:
+	ldmdb	fp, {r0 - r3, fp, sp, pc}	@ pop entry frame and return
+
+#endif
+
+/* ARM replacement for unsupported gcc __builtin_return_address(n)
+ * where 0 < n.  n == 0 is supported here as well.
+ *
+ * Walk up the stack frame until the desired frame is found or a NULL
+ * fp is encountered, return NULL in the latter case.
+ *
+ * Note: it is possible under code optimization for the stack invocation
+ * of an ancestor function (level N) to be removed before calling a
+ * descendant function (level N+1).  No easy means is available to deduce
+ * this scenario with the result being [for example] caller_addr(0) when
+ * called from level N+1 returning level N-1 rather than the expected
+ * level N.  This optimization issue appears isolated to the case of
+ * a call to a level N+1 routine made at the tail end of a level N
+ * routine -- the level N frame is deleted and a simple branch is made
+ * to the level N+1 routine.
+ */
+
+	.text
+	.align 0
+	.type arm_return_addr %function
+	.global arm_return_addr
+
+arm_return_addr:
+	mov	ip, r0
+	mov	r0, fp
+3:
+	cmp	r0, #0
+	beq	1f		@ frame list hit end, bail
+	cmp	ip, #0
+	beq	2f		@ reached desired frame
+	ldr	r0, [r0, #-12]	@ else continue, get next fp
+	sub	ip, ip, #1
+	b	3b
+2:
+	ldr	r0, [r0, #-4]	@ get target return address
+1:
+	mov	pc, lr
 #endif
Index: linux/arch/arm/kernel/fiq.c
===================================================================
--- linux.orig/arch/arm/kernel/fiq.c
+++ linux/arch/arm/kernel/fiq.c
@@ -89,7 +89,7 @@ void set_fiq_handler(void *start, unsign
 * disable irqs for the duration.  Note - these functions are almost
 * entirely coded in assembly.
 */
-void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
 {
	register unsigned long tmp;
	asm volatile (
@@ -107,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs
	: "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE));
 }
 
-void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
 {
	register unsigned long tmp;
	asm volatile (
Index: linux/arch/arm/kernel/head.S
===================================================================
--- linux.orig/arch/arm/kernel/head.S
+++ linux/arch/arm/kernel/head.S
@@ -234,18 +234,19 @@ __create_page_tables:
 
	/*
	 * Now setup the pagetables for our kernel direct
-	 * mapped region.  We round TEXTADDR down to the
-	 * nearest megabyte boundary.  It is assumed that
-	 * the kernel fits within 4 contigous 1MB sections.
+	 * mapped region.
	 */
	add	r0, r4, #(TEXTADDR & 0xff000000) >> 18	@ start of kernel
	str	r3, [r0, #(TEXTADDR & 0x00f00000) >> 18]!
-	add	r3, r3, #1 << 20
-	str	r3, [r0, #4]!			@ KERNEL + 1MB
-	add	r3, r3, #1 << 20
-	str	r3, [r0, #4]!			@ KERNEL + 2MB
-	add	r3, r3, #1 << 20
-	str	r3, [r0, #4]			@ KERNEL + 3MB
+
+	ldr	r6, =(_end - PAGE_OFFSET)
+	sub	r6, r6, #1			@ r6 = number of sections
+	mov	r6, r6, lsr #20			@ needed for kernel minus 1
+
+1:	add	r3, r3, #1 << 20
+	str	r3, [r0, #4]!
+	subs	r6, r6, #1
+	bgt	1b
 
	/*
	 * Then map first 1MB of ram in case it contains our boot params.
Index: linux/arch/arm/kernel/irq.c
===================================================================
--- linux.orig/arch/arm/kernel/irq.c
+++ linux/arch/arm/kernel/irq.c
@@ -101,7 +101,7 @@ unlock:
 /* Handle bad interrupts */
 static struct irq_desc bad_irq_desc = {
	.handle_irq = handle_bad_irq,
-	.lock = SPIN_LOCK_UNLOCKED
+	.lock = RAW_SPIN_LOCK_UNLOCKED(bad_irq_desc.lock)
 };
 
 /*
@@ -109,10 +109,12 @@ static struct irq_desc bad_irq_desc = {
 * come via this function.  Instead, they should provide their
 * own 'handler'
 */
-asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
+asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
	struct irqdesc *desc = irq_desc + irq;
 
+	trace_special(instruction_pointer(regs), irq, 0);
+
	/*
	 * Some hardware gives randomly wrong interrupts.  Rather
	 * than crashing, do something sensible.
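The head.S hunk above replaces the hard-wired "4 contiguous 1MB sections" kernel mapping with a loop sized from the image itself. As a sanity check, here is a minimal, standalone C model of the new loop's arithmetic; sections_for() is a hypothetical helper for illustration only (the real code works on the _end and PAGE_OFFSET link-time symbols, and bgt is a signed compare, hence the signed counter):

	#include <stdio.h>

	/* How many 1MB section descriptors the new loop writes for a
	 * kernel image of 'kernel_bytes' bytes.
	 */
	static unsigned long sections_for(unsigned long kernel_bytes)
	{
		/* ldr r6, =(_end - PAGE_OFFSET); sub r6, r6, #1; mov r6, r6, lsr #20 */
		long r6 = (long)((kernel_bytes - 1) >> 20);
		unsigned long mapped = 1;	/* first section stored before the loop */

		do {				/* 1:	add r3 / str r3, [r0, #4]! */
			mapped++;
		} while (--r6 > 0);		/*	subs r6, r6, #1; bgt 1b */

		return mapped;
	}

	int main(void)
	{
		/* a 5.5MB kernel now gets 6 sections; the old code always wrote 4 */
		printf("%lu\n", sections_for(5ul * 1024 * 1024 + 512 * 1024));
		return 0;
	}

This is why the companion change in boot/compressed/head.S can also drop the "4MB largest kernel size" assumption: the page tables now cover however many megabytes the decompressed image actually spans.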
Index: linux/arch/arm/kernel/process.c
===================================================================
--- linux.orig/arch/arm/kernel/process.c
+++ linux/arch/arm/kernel/process.c
@@ -123,7 +123,7 @@ static void default_idle(void)
		cpu_relax();
	else {
		local_irq_disable();
-		if (!need_resched()) {
+		if (!need_resched() && !need_resched_delayed()) {
			timer_dyn_reprogram();
			arch_idle();
		}
@@ -154,12 +154,20 @@ void cpu_idle(void)
		if (!idle)
			idle = default_idle;
		leds_event(led_idle_start);
-		while (!need_resched())
-			idle();
+
+		if (!need_resched() && !need_resched_delayed() &&
+		    !hrtimer_stop_sched_tick()) {
+			while (!need_resched() && !need_resched_delayed())
+				idle();
+		}
+		hrtimer_restart_sched_tick();
+
		leds_event(led_idle_end);
-		preempt_enable_no_resched();
-		schedule();
+		local_irq_disable();
+		__preempt_enable_no_resched();
+		__schedule();
		preempt_disable();
+		local_irq_enable();
	}
 }
Index: linux/arch/arm/kernel/semaphore.c
===================================================================
--- linux.orig/arch/arm/kernel/semaphore.c
+++ linux/arch/arm/kernel/semaphore.c
@@ -49,14 +49,16 @@
 * we cannot lose wakeup events.
 */
 
-void __up(struct semaphore *sem)
+fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem)
 {
	wake_up(&sem->wait);
 }
 
+EXPORT_SYMBOL(__compat_up);
+
 static DEFINE_SPINLOCK(semaphore_lock);
 
-void __sched __down(struct semaphore * sem)
+fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem)
 {
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +91,9 @@ void __sched __down(struct semaphore * s
	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+EXPORT_SYMBOL(__compat_down);
+
+fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
	int retval = 0;
	struct task_struct *tsk = current;
@@ -140,6 +144,8 @@ int __sched __down_interruptible(struct
	return retval;
 }
 
+EXPORT_SYMBOL(__compat_down_interruptible);
+
 /*
 * Trylock failed - make sure we correct for
 * having decremented the count.
@@ -148,7 +154,7 @@ int __sched __down_interruptible(struct
 * single "cmpxchg" without failure cases,
 * but then it wouldn't work on a 386.
 */
-int __down_trylock(struct semaphore * sem)
+fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem)
 {
	int sleepers;
	unsigned long flags;
@@ -168,6 +174,15 @@ int __down_trylock(struct semaphore * se
	return 1;
 }
 
+EXPORT_SYMBOL(__compat_down_trylock);
+
+fastcall int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+
+EXPORT_SYMBOL(compat_sem_is_locked);
+
 /*
 * The semaphore operations have a special calling sequence that
 * allow us to do a simpler in-line version of them.  These routines
@@ -185,7 +200,7 @@ asm("	.section .sched.text,\"ax\",%progb
 __down_failed:					\n\
	stmfd	sp!, {r0 - r4, lr}		\n\
	mov	r0, ip				\n\
-	bl	__down				\n\
+	bl	__compat_down			\n\
	ldmfd	sp!, {r0 - r4, pc}		\n\
						\n\
	.align	5				\n\
@@ -193,7 +208,7 @@ __down_failed:				\n\
 __down_interruptible_failed:			\n\
	stmfd	sp!, {r0 - r4, lr}		\n\
	mov	r0, ip				\n\
-	bl	__down_interruptible		\n\
+	bl	__compat_down_interruptible	\n\
	mov	ip, r0				\n\
	ldmfd	sp!, {r0 - r4, pc}		\n\
						\n\
@@ -202,7 +217,7 @@ __down_interruptible_failed:		\n\
 __down_trylock_failed:				\n\
	stmfd	sp!, {r0 - r4, lr}		\n\
	mov	r0, ip				\n\
-	bl	__down_trylock			\n\
+	bl	__compat_down_trylock		\n\
	mov	ip, r0				\n\
	ldmfd	sp!, {r0 - r4, pc}		\n\
						\n\
@@ -211,7 +226,7 @@ __down_trylock_failed:			\n\
 __up_wakeup:					\n\
	stmfd	sp!, {r0 - r4, lr}		\n\
	mov	r0, ip				\n\
-	bl	__up				\n\
+	bl	__compat_up			\n\
	ldmfd	sp!, {r0 - r4, pc}		\n\
 ");
Index: linux/arch/arm/kernel/signal.c
===================================================================
--- linux.orig/arch/arm/kernel/signal.c
+++ linux/arch/arm/kernel/signal.c
@@ -630,6 +630,14 @@ static int do_signal(sigset_t *oldset, s
	siginfo_t info;
	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
	/*
	 * We want the common case to go fast, which
	 * is why we may in certain cases get here from
Index: linux/arch/arm/kernel/smp.c
===================================================================
--- linux.orig/arch/arm/kernel/smp.c
+++ linux/arch/arm/kernel/smp.c
@@ -515,7 +515,7 @@ static void ipi_call_function(unsigned i
	cpu_clear(cpu, data->unfinished);
 }
 
-static DEFINE_SPINLOCK(stop_lock);
+static DEFINE_RAW_SPINLOCK(stop_lock);
 
 /*
 * ipi_cpu_stop - handle IPI from smp_send_stop()
Index: linux/arch/arm/kernel/time.c
===================================================================
--- linux.orig/arch/arm/kernel/time.c
+++ linux/arch/arm/kernel/time.c
@@ -69,10 +69,12 @@ EXPORT_SYMBOL(profile_pc);
 */
 int (*set_rtc)(void);
 
+#ifdef CONFIG_IS_TICK_BASED
 static unsigned long dummy_gettimeoffset(void)
 {
	return 0;
 }
+#endif
 
 /*
 * Scheduler clock - returns current time in nanosec units.
@@ -84,34 +86,10 @@ unsigned long long __attribute__((weak))
	return (unsigned long long)jiffies * (1000000000 / HZ);
 }
 
-static unsigned long next_rtc_update;
-
-/*
- * If we have an externally synchronized linux clock, then update
- * CMOS clock accordingly every ~11 minutes. set_rtc() has to be
- * called as close as possible to 500 ms before the new second
- * starts.
- */
-static inline void do_set_rtc(void)
+void sync_persistent_clock(struct timespec ts)
 {
-	if (!ntp_synced() || set_rtc == NULL)
-		return;
-
-	if (next_rtc_update &&
-	    time_before((unsigned long)xtime.tv_sec, next_rtc_update))
-		return;
-
-	if (xtime.tv_nsec < 500000000 - ((unsigned) tick_nsec >> 1) &&
-	    xtime.tv_nsec >= 500000000 + ((unsigned) tick_nsec >> 1))
-		return;
-
-	if (set_rtc())
-		/*
-		 * rtc update failed.  Try again in 60s
-		 */
-		next_rtc_update = xtime.tv_sec + 60;
-	else
-		next_rtc_update = xtime.tv_sec + 660;
+	if (set_rtc)
+		set_rtc();
 }
 
 #ifdef CONFIG_LEDS
@@ -230,68 +208,6 @@ static inline void do_leds(void)
 #define	do_leds()
 #endif
 
-void do_gettimeofday(struct timeval *tv)
-{
-	unsigned long flags;
-	unsigned long seq;
-	unsigned long usec, sec, lost;
-
-	do {
-		seq = read_seqbegin_irqsave(&xtime_lock, flags);
-		usec = system_timer->offset();
-
-		lost = jiffies - wall_jiffies;
-		if (lost)
-			usec += lost * USECS_PER_JIFFY;
-
-		sec = xtime.tv_sec;
-		usec += xtime.tv_nsec / 1000;
-	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
-
-	/* usec may have gone up a lot: be safe */
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * done, and then undo it!
-	 */
-	nsec -= system_timer->offset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-	return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
 /**
 * save_time_delta - Save the offset between system time and RTC time
 * @delta: pointer to timespec to store delta
@@ -332,7 +248,6 @@ void timer_tick(struct pt_regs *regs)
 {
	profile_tick(CPU_PROFILING, regs);
	do_leds();
-	do_set_rtc();
	do_timer(regs);
 #ifndef CONFIG_SMP
	update_process_times(user_mode(regs));
@@ -500,8 +415,10 @@ device_initcall(timer_init_sysfs);
 
 void __init time_init(void)
 {
+#ifdef CONFIG_IS_TICK_BASED
	if (system_timer->offset == NULL)
		system_timer->offset = dummy_gettimeoffset;
+#endif
	system_timer->init();
 
 #ifdef CONFIG_NO_IDLE_HZ
Index: linux/arch/arm/kernel/traps.c
===================================================================
--- linux.orig/arch/arm/kernel/traps.c
+++ linux/arch/arm/kernel/traps.c
@@ -176,6 +176,7 @@ void dump_stack(void)
 {
 #ifdef CONFIG_DEBUG_ERRORS
	__backtrace();
+	print_traces(current);
 #endif
 }
 
@@ -191,7 +192,7 @@ void show_stack(struct task_struct *tsk,
	if (tsk != current)
		fp = thread_saved_fp(tsk);
	else
-		asm("mov%?	%0, fp" : "=r" (fp));
+		asm("mov	%0, fp" : "=r" (fp) : : "cc");
 
	c_backtrace(fp, 0x10);
	barrier();
@@ -216,7 +217,7 @@ static void __die(const char *str, int e
	}
 }
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 /*
 * This function is protected against re-entrancy.
@@ -252,7 +253,7 @@ void notify_die(const char *str, struct
 }
 
 static LIST_HEAD(undef_hook);
-static DEFINE_SPINLOCK(undef_lock);
+static DEFINE_RAW_SPINLOCK(undef_lock);
 
 void register_undef_hook(struct undef_hook *hook)
 {
Index: linux/arch/arm/lib/Makefile
===================================================================
--- linux.orig/arch/arm/lib/Makefile
+++ linux/arch/arm/lib/Makefile
@@ -41,6 +41,7 @@ lib-$(CONFIG_ARCH_RPC)		+= ecard.o io-ac
 lib-$(CONFIG_ARCH_CLPS7500)	+= io-acorn.o
 lib-$(CONFIG_ARCH_L7200)	+= io-acorn.o
 lib-$(CONFIG_ARCH_SHARK)	+= io-shark.o
+lib-$(CONFIG_STACKTRACE)	+= stacktrace.o
 
 $(obj)/csumpartialcopy.o:	$(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:	$(obj)/csumpartialcopygeneric.S
Index: linux/arch/arm/lib/stacktrace.c
===================================================================
--- /dev/null
+++ linux/arch/arm/lib/stacktrace.c
@@ -0,0 +1,77 @@
+#include <linux/sched.h>
+#include <linux/stacktrace.h>
+
+struct stackframe {
+	unsigned long fp;
+	unsigned long sp;
+	unsigned long lr;
+	unsigned long pc;
+};
+
+int walk_stackframe(unsigned long fp, unsigned long low, unsigned long high,
+		    int (*fn)(struct stackframe *, void *), void *data)
+{
+	struct stackframe *frame;
+
+	do {
+		/*
+		 * Check current frame pointer is within bounds
+		 */
+		if ((fp - 12) < low || fp + 4 >= high)
+			break;
+
+		frame = (struct stackframe *)(fp - 12);
+
+		if (fn(frame, data))
+			break;
+
+		/*
+		 * Update the low bound - the next frame must always
+		 * be at a higher address than the current frame.
+		 */
+		low = fp + 4;
+		fp = frame->fp;
+	} while (fp);
+
+	return 0;
+}
+
+struct stack_trace_data {
+	struct stack_trace *trace;
+	unsigned int skip;
+};
+
+static int save_trace(struct stackframe *frame, void *d)
+{
+	struct stack_trace_data *data = d;
+	struct stack_trace *trace = data->trace;
+
+	if (data->skip) {
+		data->skip--;
+		return 0;
+	}
+
+	trace->entries[trace->nr_entries++] = frame->lr;
+
+	return trace->nr_entries >= trace->max_entries;
+}
+
+void save_stack_trace(struct stack_trace *trace, struct task_struct *task,
+		      int all_contexts, unsigned int skip)
+{
+	struct stack_trace_data data;
+	unsigned long fp, base;
+
+	data.trace = trace;
+	data.skip = skip;
+
+	if (task) {
+		base = (unsigned long)task_stack_page(task);
+		fp = 0;
+	} else {
+		base = (unsigned long)task_stack_page(current);
+		asm("mov %0, fp" : "=r" (fp));
+	}
+
+	walk_stackframe(fp, base, base + THREAD_SIZE, save_trace, &data);
+}
Index: linux/arch/arm/mach-footbridge/netwinder-hw.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-hw.c
+++ linux/arch/arm/mach-footbridge/netwinder-hw.c
@@ -67,7 +67,7 @@ static inline void wb977_ww(int reg, int
 /*
 * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE
 */
-DEFINE_SPINLOCK(gpio_lock);
+DEFINE_RAW_SPINLOCK(gpio_lock);
 
 static unsigned int current_gpio_op;
 static unsigned int current_gpio_io;
Index: linux/arch/arm/mach-footbridge/netwinder-leds.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-leds.c
+++ linux/arch/arm/mach-footbridge/netwinder-leds.c
@@ -32,7 +32,7 @@ static char led_state;
 static char hw_led_state;
 
 static DEFINE_SPINLOCK(leds_lock);
-extern spinlock_t gpio_lock;
+extern raw_spinlock_t gpio_lock;
 
 static void netwinder_leds_event(led_event_t evt)
 {
Index: linux/arch/arm/mach-integrator/core.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/core.c
+++ linux/arch/arm/mach-integrator/core.c
@@ -164,7 +164,7 @@ static struct amba_pl010_data integrator
 
 #define CM_CTRL	IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET
 
-static DEFINE_SPINLOCK(cm_lock);
+static DEFINE_RAW_SPINLOCK(cm_lock);
 
 /**
 * cm_control - update the CM_CTRL register.
Index: linux/arch/arm/mach-integrator/pci_v3.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/pci_v3.c
+++ linux/arch/arm/mach-integrator/pci_v3.c
@@ -162,7 +162,7 @@
 *	 7:2	register number
 *
 */
-static DEFINE_SPINLOCK(v3_lock);
+static DEFINE_RAW_SPINLOCK(v3_lock);
 
 #define PCI_BUS_NONMEM_START	0x00000000
 #define PCI_BUS_NONMEM_SIZE	SZ_256M
Index: linux/arch/arm/mach-integrator/platsmp.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/platsmp.c
+++ linux/arch/arm/mach-integrator/platsmp.c
@@ -31,7 +31,7 @@ extern void integrator_secondary_startup
 volatile int __cpuinitdata pen_release = -1;
 unsigned long __cpuinitdata phys_pen_release = 0;
 
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 
 void __cpuinit platform_secondary_init(unsigned int cpu)
 {
Index: linux/arch/arm/mach-ixp4xx/Kconfig
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/Kconfig
+++ linux/arch/arm/mach-ixp4xx/Kconfig
@@ -1,5 +1,9 @@
 if ARCH_IXP4XX
 
+config IS_TICK_BASED
+	bool
+	default n
+
 config ARCH_SUPPORTS_BIG_ENDIAN
	bool
	default y
Index: linux/arch/arm/mach-ixp4xx/common-pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/common-pci.c
+++ linux/arch/arm/mach-ixp4xx/common-pci.c
@@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0;
 * these transactions are atomic or we will end up
 * with corrupt data on the bus or in a driver.
 */
-static DEFINE_SPINLOCK(ixp4xx_pci_lock);
+static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock);
 
 /*
 * Read from PCI config space
Index: linux/arch/arm/mach-ixp4xx/common.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/common.c
+++ linux/arch/arm/mach-ixp4xx/common.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -38,6 +40,11 @@
 #include 
 #include 
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int __init ixp4xx_clockevent_init(void);
+static struct clock_event clockevent_ixp4xx;
+#endif
+
 /*************************************************************************
 * IXP4xx chipset I/O mapping
 *************************************************************************/
@@ -253,25 +260,17 @@ void __init ixp4xx_init_irq(void)
 
 static unsigned volatile last_jiffy_time;
 
-#define CLOCK_TICKS_PER_USEC	((CLOCK_TICK_RATE + USEC_PER_SEC/2) / USEC_PER_SEC)
-
-/* IRQs are disabled before entering here from do_gettimeofday() */
-static unsigned long ixp4xx_gettimeoffset(void)
-{
-	u32 elapsed;
-
-	elapsed = *IXP4XX_OSTS - last_jiffy_time;
-
-	return elapsed / CLOCK_TICKS_PER_USEC;
-}
-
 static irqreturn_t
 ixp4xx_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-	write_seqlock(&xtime_lock);
-
	/* Clear Pending Interrupt by writing '1' to it */
	*IXP4XX_OSST = IXP4XX_OSST_TIMER_1_PEND;
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+	if (clockevent_ixp4xx.event_handler)
+		clockevent_ixp4xx.event_handler(regs);
+#else
+	write_seqlock(&xtime_lock);
+
	/*
	 * Catch up with the real idea of time
	 */
@@ -281,6 +280,7 @@ static irqreturn_t ixp4xx_timer_interrup
	}
 
	write_sequnlock(&xtime_lock);
+#endif
 
	return IRQ_HANDLED;
 }
@@ -299,17 +299,18 @@ static void __init ixp4xx_timer_init(voi
	/* Setup the Timer counter value */
	*IXP4XX_OSRT1 = (LATCH & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE;
 
-	/* Reset time-stamp counter */
-	*IXP4XX_OSTS = 0;
	last_jiffy_time = 0;
 
	/* Connect the interrupt handler and enable the interrupt */
	setup_irq(IRQ_IXP4XX_TIMER1, &ixp4xx_timer_irq);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+	ixp4xx_clockevent_init();
+#endif
 }
 
 struct sys_timer ixp4xx_timer = {
	.init		= ixp4xx_timer_init,
-	.offset		= ixp4xx_gettimeoffset,
 };
 
 static struct resource ixp46x_i2c_resources[] = {
@@ -365,3 +366,70 @@ void __init ixp4xx_sys_init(void)
			ixp4xx_exp_bus_size >> 20);
 }
 
+cycle_t ixp4xx_get_cycles(void)
+{
+	return *IXP4XX_OSTS;
+}
+
+static struct clocksource clocksource_ixp4xx = {
+	.name		= "OSTS",
+	.rating		= 200,
+	.read		= ixp4xx_get_cycles,
+	.mask		= 0xFFFFFFFF,
+	.shift		= 20,
+	.is_continuous	= 1,
+};
+
+static int __init ixp4xx_clocksource_init(void)
+{
+	/* Reset time-stamp counter */
+	*IXP4XX_OSTS = 0;
+
+	clocksource_ixp4xx.mult =
+		clocksource_khz2mult(66660, clocksource_ixp4xx.shift);
+	clocksource_register(&clocksource_ixp4xx);
+
+	return 0;
+}
+device_initcall(ixp4xx_clocksource_init);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static u32 clockevent_mode = 0;
+
+static void ixp4xx_set_next_event(unsigned long evt,
+				  struct clock_event *unused)
+{
+	u32 oneshot = (clockevent_mode == CLOCK_EVT_ONESHOT) ?
+			IXP4XX_OST_ONE_SHOT : 0;
+
+	*IXP4XX_OSRT1 = (evt & ~IXP4XX_OST_RELOAD_MASK) | IXP4XX_OST_ENABLE |
+		oneshot;
+}
+
+static void ixp4xx_set_mode(int mode, struct clock_event *evt)
+{
+	clockevent_mode = mode;
+}
+
+static struct clock_event clockevent_ixp4xx = {
+	.name		= "ixp4xx timer1",
+	.capabilities	= CLOCK_CAP_NEXTEVT | CLOCK_CAP_TICK |
+			  CLOCK_CAP_UPDATE | CLOCK_CAP_PROFILE,
+	.shift		= 32,
+	.set_mode	= ixp4xx_set_mode,
+	.set_next_event	= ixp4xx_set_next_event,
+};
+
+static int __init ixp4xx_clockevent_init(void)
+{
+	clockevent_ixp4xx.mult = div_sc(FREQ, NSEC_PER_SEC,
+					clockevent_ixp4xx.shift);
+	clockevent_ixp4xx.max_delta_ns =
+		clockevent_delta2ns(0xfffffffe, &clockevent_ixp4xx);
+	clockevent_ixp4xx.min_delta_ns =
+		clockevent_delta2ns(0xf, &clockevent_ixp4xx);
+	register_local_clockevent(&clockevent_ixp4xx);
+
+	return 0;
+}
+#endif
Index: linux/arch/arm/mach-omap1/pm.c
===================================================================
--- linux.orig/arch/arm/mach-omap1/pm.c
+++ linux/arch/arm/mach-omap1/pm.c
@@ -120,7 +120,7 @@ void omap_pm_idle(void)
	local_irq_disable();
	local_fiq_disable();
-	if (need_resched()) {
+	if (need_resched() || need_resched_delayed()) {
		local_fiq_enable();
		local_irq_enable();
		return;
Index: linux/arch/arm/mach-omap2/pm.c
===================================================================
--- linux.orig/arch/arm/mach-omap2/pm.c
+++ linux/arch/arm/mach-omap2/pm.c
@@ -53,7 +53,7 @@ void omap2_pm_idle(void)
 {
	local_irq_disable();
	local_fiq_disable();
-	if (need_resched()) {
+	if (need_resched() || need_resched_delayed()) {
		local_fiq_enable();
		local_irq_enable();
		return;
Index: linux/arch/arm/mach-sa1100/badge4.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/badge4.c
+++ linux/arch/arm/mach-sa1100/badge4.c
@@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i
	/* detect on->off and off->on transitions */
	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
		/* was off, now on */
-		printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__);
		GPSR = BADGE4_GPIO_PCMEN5V;
	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
		/* was on, now off */
-		printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__);
		GPCR = BADGE4_GPIO_PCMEN5V;
	}
 
	local_irq_restore(flags);
+
+	/* detect on->off and off->on transitions */
+	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
+		/* was off, now on */
+		printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__);
+	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
+		/* was on, now off */
+		printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__);
+	}
 }
 
 EXPORT_SYMBOL(badge4_set_5V);
Index: linux/arch/arm/mach-shark/leds.c
===================================================================
--- linux.orig/arch/arm/mach-shark/leds.c
+++ linux/arch/arm/mach-shark/leds.c
@@ -32,7 +32,7 @@ static char led_state;
 static short hw_led_state;
 static short saved_state;
 
-static DEFINE_SPINLOCK(leds_lock);
+static DEFINE_RAW_SPINLOCK(leds_lock);
 
 short sequoia_read(int addr) {
	outw(addr,0x24);
Index: linux/arch/arm/mach-versatile/Kconfig
===================================================================
--- linux.orig/arch/arm/mach-versatile/Kconfig
+++ linux/arch/arm/mach-versatile/Kconfig
@@ -1,6 +1,10 @@
 menu "Versatile platform type"
	depends on ARCH_VERSATILE
 
+config IS_TICK_BASED
+	bool
+	default n
+
 config ARCH_VERSATILE_PB
	bool "Support Versatile/PB platform"
	default y
Index: linux/arch/arm/mach-versatile/core.c
===================================================================
--- linux.orig/arch/arm/mach-versatile/core.c
+++ linux/arch/arm/mach-versatile/core.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include 
 #include 
@@ -808,59 +810,50 @@ void __init versatile_init(void)
 #define TICKS2USECS(x)	((x) / TICKS_PER_uSEC)
 #endif
 
-/*
- * Returns number of ms since last clock interrupt.  Note that interrupts
- * will have been disabled by do_gettimeoffset()
- */
-static unsigned long versatile_gettimeoffset(void)
+#ifdef CONFIG_HIGH_RES_TIMERS
+static void timer_set_mode(int mode, struct clock_event *clk)
 {
-	unsigned long ticks1, ticks2, status;
-
-	/*
-	 * Get the current number of ticks.  Note that there is a race
-	 * condition between us reading the timer and checking for
-	 * an interrupt.  We get around this by ensuring that the
-	 * counter has not reloaded between our two reads.
-	 */
-	ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff;
-	do {
-		ticks1 = ticks2;
-		status = __raw_readl(VA_IC_BASE + VIC_RAW_STATUS);
-		ticks2 = readl(TIMER0_VA_BASE + TIMER_VALUE) & 0xffff;
-	} while (ticks2 > ticks1);
-
-	/*
-	 * Number of ticks since last interrupt.
-	 */
-	ticks1 = TIMER_RELOAD - ticks2;
-
-	/*
-	 * Interrupt pending?  If so, we've reloaded once already.
-	 *
-	 * FIXME: Need to check this is effectively timer 0 that expires
-	 */
-	if (status & IRQMASK_TIMERINT0_1)
-		ticks1 += TIMER_RELOAD;
+	if (mode == CLOCK_EVT_PERIODIC) {
+		writel(TIMER_CTRL_PERIODIC | TIMER_CTRL_32BIT | TIMER_CTRL_IE |
+		       TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL);
+	} else {
+		writel(TIMER_CTRL_ONESHOT | TIMER_CTRL_32BIT | TIMER_CTRL_IE |
+		       TIMER_CTRL_ENABLE, TIMER0_VA_BASE + TIMER_CTRL);
+	}
+}
 
-	/*
-	 * Convert the ticks to usecs
-	 */
-	return TICKS2USECS(ticks1);
+static void timer_set_next_event(unsigned long evt, struct clock_event *unused)
+{
+	BUG_ON(!evt);
+	writel(evt, TIMER0_VA_BASE + TIMER_LOAD);
 }
 
+static struct clock_event timer0_clock = {
+	.name		= "timer0",
+	.shift		= 32,
+	.capabilities	= CLOCK_CAP_TICK | CLOCK_CAP_UPDATE |
+			  CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE,
+	.set_mode	= timer_set_mode,
+	.set_next_event	= timer_set_next_event,
+};
+#endif
+
 /*
 * IRQ handler for the timer
 */
 static irqreturn_t versatile_timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-	write_seqlock(&xtime_lock);
-
	// ...clear the interrupt
	writel(1, TIMER0_VA_BASE + TIMER_INTCLR);
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+	if (timer0_clock.event_handler)
+		timer0_clock.event_handler(regs);
+#else
+	write_seqlock(&xtime_lock);
+
	timer_tick(regs);
-
	write_sequnlock(&xtime_lock);
+#endif
 
	return IRQ_HANDLED;
 }
@@ -893,11 +886,20 @@ static void __init versatile_timer_init(
	/*
	 * Initialise to a known state (all timers off)
	 */
-	writel(0, TIMER0_VA_BASE + TIMER_CTRL);
+	writel(0, TIMER0_VA_BASE + TIMER_CTRL);
	writel(0, TIMER1_VA_BASE + TIMER_CTRL);
	writel(0, TIMER2_VA_BASE + TIMER_CTRL);
	writel(0, TIMER3_VA_BASE + TIMER_CTRL);
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+	timer0_clock.mult = div_sc(1000000, NSEC_PER_SEC, timer0_clock.shift);
+	timer0_clock.max_delta_ns =
+		clockevent_delta2ns(0xffffffff, &timer0_clock);
+	timer0_clock.min_delta_ns =
+		clockevent_delta2ns(0xf, &timer0_clock);
+	register_global_clockevent(&timer0_clock);
+#endif
+
	writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_LOAD);
	writel(TIMER_RELOAD, TIMER0_VA_BASE + TIMER_VALUE);
	writel(TIMER_DIVISOR | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC |
@@ -911,5 +913,36 @@ static void __init versatile_timer_init(
 
 struct sys_timer versatile_timer = {
	.init		= versatile_timer_init,
-	.offset		= versatile_gettimeoffset,
 };
 
+cycle_t versatile_get_cycles(void)
+{
+	return ~readl(TIMER3_VA_BASE + TIMER_VALUE);
+}
+
+static struct clocksource clocksource_versatile = {
+	.name		= "timer3",
+	.rating		= 200,
+	.read		= versatile_get_cycles,
+	.mask		= 0xFFFFFFFF,
+	.shift		= 20,
+	.is_continuous	= 1,
+};
+
+static int __init versatile_clocksource_init(void)
+{
+	writel(0, TIMER3_VA_BASE + TIMER_CTRL);
+	writel(0xffffffff, TIMER3_VA_BASE + TIMER_LOAD);
+	writel(0xffffffff, TIMER3_VA_BASE + TIMER_VALUE);
+	writel(TIMER_CTRL_32BIT | TIMER_CTRL_ENABLE | TIMER_CTRL_PERIODIC,
+	       TIMER3_VA_BASE + TIMER_CTRL);
+
+	clocksource_versatile.mult =
+		clocksource_khz2mult(1000, clocksource_versatile.shift);
+	clocksource_register(&clocksource_versatile);
+
+	return 0;
+}
+
+device_initcall(versatile_clocksource_init);
+
Index: linux/arch/arm/mm/consistent.c
===================================================================
--- linux.orig/arch/arm/mm/consistent.c
+++ linux/arch/arm/mm/consistent.c
@@ -40,7 +40,7 @@
 * These are the page tables (2MB each) covering uncached, DMA consistent allocations
 */
 static pte_t *consistent_pte[NUM_CONSISTENT_PTES];
-static DEFINE_SPINLOCK(consistent_lock);
+static DEFINE_RAW_SPINLOCK(consistent_lock);
 
 /*
 * VM region handling support.
Index: linux/arch/arm/mm/copypage-v4mc.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-v4mc.c
+++ linux/arch/arm/mm/copypage-v4mc.c
@@ -29,7 +29,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
 * ARMv4 mini-dcache optimised copy_user_page
@@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock);
 * instruction.  If your processor does not supply this, you have to write your
 * own copy_user_page that does the right thing.
 */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
	asm volatile(
@@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con
 /*
 * ARMv4 optimised clear_user_page
 */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 v4_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
	asm volatile(
Index: linux/arch/arm/mm/copypage-v6.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-v6.c
+++ linux/arch/arm/mm/copypage-v6.c
@@ -26,7 +26,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(v6_lock);
+static DEFINE_RAW_SPINLOCK(v6_lock);
 
 /*
 * Copy the user page.  No aliasing to deal with so we can just
Index: linux/arch/arm/mm/copypage-xscale.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-xscale.c
+++ linux/arch/arm/mm/copypage-xscale.c
@@ -31,7 +31,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
 * XScale mini-dcache optimised copy_user_page
@@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock);
 * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
 * and merged as appropriate.
 */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
	/*
@@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto,
 /*
 * XScale optimised clear_user_page
 */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
	asm volatile(
Index: linux/arch/arm/mm/fault.c
===================================================================
--- linux.orig/arch/arm/mm/fault.c
+++ linux/arch/arm/mm/fault.c
@@ -215,7 +215,7 @@ out:
	return fault;
 }
 
-static int
+static notrace int
 do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
	struct task_struct *tsk;
@@ -315,7 +315,7 @@ no_context:
 * interrupt or a critical region, and should only copy the information
 * from the master page table, nothing more.
 */
-static int
+static notrace int
 do_translation_fault(unsigned long addr, unsigned int fsr,
		     struct pt_regs *regs)
 {
@@ -361,7 +361,7 @@ bad_area:
 * Some section permission faults need to be handled gracefully.
 * They can happen due to a __{get,put}_user during an oops.
 */
-static int
+static notrace int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
	struct task_struct *tsk = current;
@@ -372,7 +372,7 @@ do_sect_fault(unsigned long addr, unsign
 /*
 * This abort handler always returns "fault".
 */
-static int
+static notrace int
 do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
	return 1;
@@ -427,7 +427,7 @@ static struct fsr_info {
	{ do_bad,		SIGBUS,  0,		"unknown 31"			   }
 };
 
-void __init
+void __init notrace
 hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *),
		int sig, const char *name)
 {
@@ -441,7 +441,7 @@ hook_fault_code(int nr, int (*fn)(unsign
 /*
 * Dispatch a data abort to the relevant handler.
 */
-asmlinkage void
+asmlinkage notrace void
 do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
	const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6);
@@ -460,7 +460,7 @@ do_DataAbort(unsigned long addr, unsigne
	notify_die("", regs, &info, fsr, 0);
 }
 
-asmlinkage void
+asmlinkage notrace void
 do_PrefetchAbort(unsigned long addr, struct pt_regs *regs)
 {
	do_translation_fault(addr, 0, regs);
Index: linux/arch/arm/mm/init.c
===================================================================
--- linux.orig/arch/arm/mm/init.c
+++ linux/arch/arm/mm/init.c
@@ -25,7 +25,7 @@
 #include 
 #include 
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end;
Index: linux/arch/arm/oprofile/op_model_xscale.c
===================================================================
--- linux.orig/arch/arm/oprofile/op_model_xscale.c
+++ linux/arch/arm/oprofile/op_model_xscale.c
@@ -383,8 +383,9 @@ static int xscale_pmu_start(void)
 {
	int ret;
	u32 pmnc = read_pmnc();
+	int irq_flags = IRQF_DISABLED | IRQF_NODELAY;
 
-	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, IRQF_DISABLED,
+	ret = request_irq(XSCALE_PMU_IRQ, xscale_pmu_interrupt, irq_flags,
			"XScale PMU", (void *)results);
 
	if (ret < 0) {
Index: linux/arch/arm/plat-omap/clock.c
===================================================================
--- linux.orig/arch/arm/plat-omap/clock.c
+++ linux/arch/arm/plat-omap/clock.c
@@ -29,7 +29,7 @@
 static LIST_HEAD(clocks);
 static DEFINE_MUTEX(clocks_mutex);
-static DEFINE_SPINLOCK(clockfw_lock);
+static DEFINE_RAW_SPINLOCK(clockfw_lock);
 
 static struct clk_functions *arch_clock;
 
Index: linux/arch/arm/plat-omap/dma.c
===================================================================
--- linux.orig/arch/arm/plat-omap/dma.c
+++ linux/arch/arm/plat-omap/dma.c
@@ -949,7 +949,7 @@ static struct irqaction omap24xx_dma_irq
 /*----------------------------------------------------------------------------*/
 
 static struct lcd_dma_info {
-	spinlock_t lock;
+	raw_spinlock_t lock;
	int reserved;
	void (* callback)(u16 status, void *data);
	void *cb_data;
Index: linux/arch/arm/plat-omap/gpio.c
===================================================================
--- linux.orig/arch/arm/plat-omap/gpio.c
+++ linux/arch/arm/plat-omap/gpio.c
@@ -120,7 +120,7 @@ struct gpio_bank {
	u32 reserved_map;
	u32 suspend_wakeup;
	u32 saved_wakeup;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 #define METHOD_MPUIO		0
Index: linux/arch/arm/plat-omap/mux.c
===================================================================
--- linux.orig/arch/arm/plat-omap/mux.c
+++ linux/arch/arm/plat-omap/mux.c
@@ -56,7 +56,7 @@ int __init omap_mux_register(struct pin_
 */
 int __init_or_module omap_cfg_reg(const unsigned long index)
 {
-	static DEFINE_SPINLOCK(mux_spin_lock);
+	static DEFINE_RAW_SPINLOCK(mux_spin_lock);
 
	unsigned long flags;
	struct pin_config *cfg;
Index: linux/arch/arm/plat-omap/pm.c
===================================================================
--- linux.orig/arch/arm/plat-omap/pm.c
+++ linux/arch/arm/plat-omap/pm.c
@@ -84,7 +84,7 @@ void omap_pm_idle(void)
	local_irq_disable();
	local_fiq_disable();
-	if (need_resched()) {
+	if (need_resched() || need_resched_delayed()) {
		local_fiq_enable();
		local_irq_enable();
		return;
Index: linux/arch/h8300/Kconfig
===================================================================
--- linux.orig/arch/h8300/Kconfig
+++ linux/arch/h8300/Kconfig
@@ -41,6 +41,10 @@ config GENERIC_CALIBRATE_DELAY
	bool
	default y
 
+config GENERIC_TIME
+	bool
+	default y
+
 config TIME_LOW_RES
	bool
	default y
Index: linux/arch/h8300/kernel/time.c
===================================================================
--- linux.orig/arch/h8300/kernel/time.c
+++ linux/arch/h8300/kernel/time.c
@@ -68,58 +68,6 @@ void time_init(void)
	platform_timer_setup(timer_interrupt);
 }
 
-/*
- * This version of gettimeofday has near microsecond resolution.
- */
-void do_gettimeofday(struct timeval *tv)
-{
-	unsigned long flags;
-	unsigned long usec, sec;
-
-	read_lock_irqsave(&xtime_lock, flags);
-	usec = 0;
-	sec = xtime.tv_sec;
-	usec += (xtime.tv_nsec / 1000);
-	read_unlock_irqrestore(&xtime_lock, flags);
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_lock_irq(&xtime_lock);
-	/* This is revolting. We need to set the xtime.tv_usec
-	 * correctly. However, the value in this location is
-	 * is value at the last tick.
-	 * Discover what correction gettimeofday
-	 * would have done, and then undo it!
-	 */
-	while (tv->tv_nsec < 0) {
-		tv->tv_nsec += NSEC_PER_SEC;
-		tv->tv_sec--;
-	}
-
-	xtime.tv_sec = tv->tv_sec;
-	xtime.tv_nsec = tv->tv_nsec;
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-	return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
 unsigned long long sched_clock(void)
 {
	return (unsigned long long)jiffies * (1000000000 / HZ);
Index: linux/arch/i386/Kconfig
===================================================================
--- linux.orig/arch/i386/Kconfig
+++ linux/arch/i386/Kconfig
@@ -65,6 +65,8 @@ source "init/Kconfig"
 
 menu "Processor type and features"
 
+source "kernel/time/Kconfig"
+
 config SMP
	bool "Symmetric multi-processing support"
	---help---
@@ -261,6 +263,19 @@ config SCHED_MC
 
 source "kernel/Kconfig.preempt"
 
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	depends on M386 || PREEMPT_RT
+	default y
+
+config ASM_SEMAPHORES
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+	default y if !RWSEM_GENERIC_SPINLOCK
+
 config X86_UP_APIC
	bool "Local APIC support on uniprocessors"
	depends on !SMP && !(X86_VISWS || X86_VOYAGER)
@@ -708,6 +723,7 @@ config BOOT_IOREMAP
 
 config REGPARM
	bool "Use register arguments"
+	depends on !MCOUNT
	default y
	help
	  Compile the kernel with -mregparm=3. This instructs gcc to use
@@ -791,6 +807,10 @@ config HOTPLUG_CPU
	  enable suspend on SMP systems. CPUs can be controlled through
	  /sys/devices/system/cpu.
 
+config GENERIC_TIME_VSYSCALL
+	depends on EXPERIMENTAL
+	bool "VSYSCALL gettimeofday() interface"
+
 config COMPAT_VDSO
	bool "Compat VDSO support"
	default y
Index: linux/arch/i386/Kconfig.cpu
===================================================================
--- linux.orig/arch/i386/Kconfig.cpu
+++ linux/arch/i386/Kconfig.cpu
@@ -235,11 +235,6 @@ config RWSEM_GENERIC_SPINLOCK
	depends on M386
	default y
 
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	depends on !M386
-	default y
-
 config GENERIC_CALIBRATE_DELAY
	bool
	default y
Index: linux/arch/i386/Kconfig.debug
===================================================================
--- linux.orig/arch/i386/Kconfig.debug
+++ linux/arch/i386/Kconfig.debug
@@ -22,6 +22,7 @@ config EARLY_PRINTK
 config DEBUG_STACKOVERFLOW
	bool "Check for stack overflows"
	depends on DEBUG_KERNEL
+	default y
	help
	  This option will cause messages to be printed if free stack space
	  drops below a certain limit.
@@ -29,6 +30,7 @@ config DEBUG_STACKOVERFLOW
 config DEBUG_STACK_USAGE
	bool "Stack utilization instrumentation"
	depends on DEBUG_KERNEL
+	default y
	help
	  Enables the display of the minimum amount of free stack which each
	  task has ever had available in the sysrq-T and sysrq-P debug output.
@@ -49,6 +51,7 @@ config DEBUG_PAGEALLOC
 config DEBUG_RODATA
	bool "Write protect kernel read-only data structures"
	depends on DEBUG_KERNEL
+	default y
	help
	  Mark the kernel read-only data as write-protected in the pagetables,
	  in order to catch accidental (and incorrect) writes to such const
@@ -59,6 +62,7 @@ config DEBUG_RODATA
 config 4KSTACKS
	bool "Use 4Kb for kernel stacks instead of 8Kb"
	depends on DEBUG_KERNEL
+	default y
	help
	  If you say Y here the kernel will use a 4Kb stacksize for the
	  kernel stack attached to each process/thread. This facilitates
Index: linux/arch/i386/boot/compressed/misc.c
===================================================================
--- linux.orig/arch/i386/boot/compressed/misc.c
+++ linux/arch/i386/boot/compressed/misc.c
@@ -15,6 +15,12 @@
 #include 
 #include 
 
+#ifdef CONFIG_MCOUNT
+void notrace mcount(void)
+{
+}
+#endif
+
 /*
 * gzip declarations
 */
@@ -107,7 +113,7 @@ static long free_mem_end_ptr;
 #define INPLACE_MOVE_ROUTINE  0x1000
 #define LOW_BUFFER_START      0x2000
 #define LOW_BUFFER_MAX       0x90000
-#define HEAP_SIZE             0x3000
+#define HEAP_SIZE             0x4000
 static unsigned int low_buffer_end, low_buffer_size;
 static int high_loaded =0;
 static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -4,7 +4,7 @@
 
 extra-y := head.o init_task.o vmlinux.lds
 
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o \
		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
		pci-dma.o i386_ksyms.o i387.o bootflag.o \
		quirks.o i8237.o topology.o alternative.o i8253.o tsc.o
@@ -12,6 +12,7 @@ obj-y	:= process.o semaphore.o signal.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
+obj-$(CONFIG_GENERIC_TIME_VSYSCALL)	+= vsyscall-gtod.o
 obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca.o
 obj-$(CONFIG_X86_MSR)		+= msr.o
@@ -20,6 +21,7 @@ obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
+obj-$(CONFIG_MCOUNT)		+= mcount-wrapper.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
@@ -30,6 +32,7 @@ obj-$(CONFIG_X86_NUMAQ)		+= numaq.o
 obj-$(CONFIG_X86_SUMMIT_NUMA)	+= summit.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_MODULES)		+= module.o
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
 obj-y				+= sysenter.o vsyscall.o
 obj-$(CONFIG_ACPI_SRAT)	+= srat.o
 obj-$(CONFIG_HPET_TIMER)	+= time_hpet.o
Index: linux/arch/i386/kernel/acpi/boot.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/boot.c
+++ linux/arch/i386/kernel/acpi/boot.c
@@ -53,8 +53,6 @@ static inline int acpi_madt_oem_check(ch
 #include 
 #endif	/* CONFIG_X86_LOCAL_APIC */
 
-static inline int gsi_irq_sharing(int gsi) { return gsi; }
-
 #endif	/* X86 */
 
 #define BAD_MADT_ENTRY(entry, end) (					    \
@@ -459,12 +457,7 @@ void __init acpi_pic_sci_set_trigger(uns
 
 int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
 {
-#ifdef CONFIG_X86_IO_APIC
-	if (use_pci_vector() && !platform_legacy_irq(gsi))
-		*irq = IO_APIC_VECTOR(gsi);
-	else
-#endif
-		*irq = gsi_irq_sharing(gsi);
+	*irq = gsi;
	return 0;
 }
 
@@ -575,6 +568,7 @@ static int __init acpi_parse_sbf(unsigne
 }
 
 #ifdef CONFIG_HPET_TIMER
+#include 
 
 static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 {
@@ -595,21 +589,13 @@ static int __init acpi_parse_hpet(unsign
		return -1;
	}
 #ifdef CONFIG_X86_64
-	vxtime.hpet_address = hpet_tbl->addr.addrl |
+	hpet_address = hpet_tbl->addr.addrl |
		((long)hpet_tbl->addr.addrh << 32);
-
+#else
+	hpet_address = hpet_tbl->addr.addrl;
+#endif
	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-		hpet_tbl->id, vxtime.hpet_address);
-#else	/* X86 */
-	{
-		extern unsigned long hpet_address;
-
-		hpet_address = hpet_tbl->addr.addrl;
-		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-			hpet_tbl->id, hpet_address);
-	}
-#endif	/* X86 */
-
+		hpet_tbl->id, hpet_address);
	return 0;
 }
 #else
Index: linux/arch/i386/kernel/apic.c
===================================================================
--- linux.orig/arch/i386/kernel/apic.c
+++ linux/arch/i386/kernel/apic.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -59,6 +60,23 @@ int enable_local_apic __initdata = 0; /*
 */
 int apic_verbosity;
 
+static unsigned int calibration_result;
+
+static void lapic_next_event(unsigned long delta, struct clock_event *evt);
+static void lapic_timer_setup(int mode, struct clock_event *evt);
+
+static struct clock_event lapic_clockevent = {
+	.name		= "lapic",
+	.capabilities	= CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE
+#ifdef CONFIG_SMP
+			  | CLOCK_CAP_UPDATE
+#endif
+	,
+	.shift		= 32,
+	.set_mode	= lapic_timer_setup,
+	.set_next_event	= lapic_next_event,
+};
+static DEFINE_PER_CPU(struct clock_event, lapic_events);
 
 static void apic_pm_activate(void);
 
@@ -909,6 +927,11 @@ fake_ioapic_page:
 */
 
 /*
+ * FIXME: Move this to i8253.h. There is no need to keep the access to
+ * the PIT scattered all around the place -tglx
+ */
+
+/*
 * The timer chip is already set up at HZ interrupts per second here,
 * but we do not accept timer interrupts yet. We only allow the BP
 * to calibrate.
@@ -966,13 +989,15 @@ void (*wait_timer_tick)(void) __devinitd
 
 #define APIC_DIVISOR 16
 
-static void __setup_APIC_LVTT(unsigned int clocks)
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot)
 {
	unsigned int lvtt_value, tmp_value, ver;
	int cpu = smp_processor_id();
 
	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+	lvtt_value = LOCAL_TIMER_VECTOR;
+	if (!oneshot)
+		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
	if (!APIC_INTEGRATED(ver))
		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
@@ -989,23 +1014,31 @@ static void __setup_APIC_LVTT(unsigned i
		& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
		| APIC_TDR_DIV_16);
 
-	apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+	if (!oneshot)
+		apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+}
+
+static void lapic_next_event(unsigned long delta, struct clock_event *evt)
+{
+	apic_write_around(APIC_TMICT, delta);
 }
 
-static void __devinit setup_APIC_timer(unsigned int clocks)
+static void lapic_timer_setup(int mode, struct clock_event *evt)
 {
	unsigned long flags;
 
	local_irq_save(flags);
+	__setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC);
+	local_irq_restore(flags);
+}
 
-	/*
-	 * Wait for IRQ0's slice:
-	 */
-	wait_timer_tick();
+static void __devinit setup_APIC_timer(void)
+{
+	struct clock_event *levt = &__get_cpu_var(lapic_events);
 
-	__setup_APIC_LVTT(clocks);
+	memcpy(levt, &lapic_clockevent, sizeof(*levt));
 
-	local_irq_restore(flags);
+	register_local_clockevent(levt);
 }
 
 /*
@@ -1014,6 +1047,8 @@ static void __devinit setup_APIC_timer(u
 * to calibrate, since some later bootup code depends on getting
 * the first irq? Ugh.
 *
+ * TODO: Fix this rather than saying "Ugh" -tglx
+ *
 * We want to do the calibration only once since we
 * want to have local timer irqs syncron. CPUs connected
 * by the same APIC bus have the very same bus frequency.
@@ -1036,7 +1071,7 @@ static int __init calibrate_APIC_clock(v
	 * value into the APIC clock, we just want to get the
	 * counter running for calibration.
	 */
-	__setup_APIC_LVTT(1000000000);
+	__setup_APIC_LVTT(1000000000, 0);
 
	/*
	 * The timer chip counts down to zero. Let's wait
@@ -1073,6 +1108,14 @@ static int __init calibrate_APIC_clock(v
 
	result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
 
+	/* Calculate the scaled math multiplication factor */
+	lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32);
+	lapic_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+	printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns);
+	lapic_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &lapic_clockevent);
+
	if (cpu_has_tsc)
		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
			"%ld.%04ld MHz.\n",
@@ -1087,8 +1130,6 @@ static int __init calibrate_APIC_clock(v
	return result;
 }
 
-static unsigned int calibration_result;
-
 void __init setup_boot_APIC_clock(void)
 {
	unsigned long flags;
@@ -1101,14 +1142,14 @@ void __init setup_boot_APIC_clock(void)
	/*
	 * Now set up the timer for real.
	 */
-	setup_APIC_timer(calibration_result);
+	setup_APIC_timer();
 
	local_irq_restore(flags);
 }
 
 void __devinit setup_secondary_APIC_clock(void)
 {
-	setup_APIC_timer(calibration_result);
+	setup_APIC_timer();
 }
 
 void disable_APIC_timer(void)
@@ -1154,6 +1195,13 @@ void switch_APIC_timer_to_ipi(void *cpum
	    !cpu_isset(cpu, timer_bcast_ipi)) {
		disable_APIC_timer();
		cpu_set(cpu, timer_bcast_ipi);
+#ifdef CONFIG_HIGH_RES_TIMERS
+		printk("Disabling NO_HZ and high resolution timers "
+		       "due to timer broadcasting\n");
+		for_each_possible_cpu(cpu)
+			per_cpu(lapic_events, cpu).capabilities &=
+				~CLOCK_CAP_NEXTEVT;
+#endif
	}
 }
 EXPORT_SYMBOL(switch_APIC_timer_to_ipi);
@@ -1190,6 +1238,8 @@ inline void smp_local_timer_interrupt(st
	update_process_times(user_mode_vm(regs));
 #endif
 
+	trace_special(regs->eip, 0, 0);
+
	/*
	 * We take the 'long' return path, and there every subsystem
	 * grabs the apropriate locks (kernel lock/ irq lock).
@@ -1211,15 +1261,18 @@ inline void smp_local_timer_interrupt(st
 * interrupt as well. Thus we cannot inline the local irq ... ]
 */
 
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
+fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
	int cpu = smp_processor_id();
+	struct clock_event *evt = &per_cpu(lapic_events, cpu);
 
	/*
	 * the NMI deadlock-detector uses this.
	 */
	per_cpu(irq_stat, cpu).apic_timer_irqs++;
 
+	trace_special(regs->eip, 0, 0);
+
	/*
	 * NOTE! We'd better ACK the irq immediately,
	 * because timer handling can be slow.
@@ -1231,7 +1284,15 @@ fastcall void smp_apic_timer_interrupt(s
	 * interrupt lock, which is the WrongThing (tm) to do.
	 */
	irq_enter();
-	smp_local_timer_interrupt(regs);
+	/*
+	 * If the task is currently running in user mode, don't
+	 * detect soft lockups. If CONFIG_DETECT_SOFTLOCKUP is not
+	 * configured, this should be optimized out.
+	 */
+	if (user_mode(regs))
+		touch_softlockup_watchdog();
+
+	evt->event_handler(regs);
	irq_exit();
 }
 
@@ -1240,6 +1301,8 @@ static void up_apic_timer_interrupt_call
 {
	int cpu = smp_processor_id();
 
+	trace_special(regs->eip, 1, 0);
+
	/*
	 * the NMI deadlock-detector uses this.
*/ @@ -1323,6 +1386,7 @@ fastcall void smp_error_interrupt(struct */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); + dump_stack(); irq_exit(); } Index: linux/arch/i386/kernel/apm.c =================================================================== --- linux.orig/arch/i386/kernel/apm.c +++ linux/arch/i386/kernel/apm.c @@ -233,7 +233,6 @@ #include "io_ports.h" -extern unsigned long get_cmos_time(void); extern void machine_real_restart(unsigned char *, int); #if defined(CONFIG_APM_DISPLAY_BLANK) && defined(CONFIG_VT) @@ -1152,26 +1151,6 @@ out: spin_unlock(&user_list_lock); } -static void set_time(void) -{ - if (got_clock_diff) { /* Must know time zone in order to set clock */ - xtime.tv_sec = get_cmos_time() + clock_cmos_diff; - xtime.tv_nsec = 0; - } -} - -static void get_time_diff(void) -{ -#ifndef CONFIG_APM_RTC_IS_GMT - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - got_clock_diff = 1; -#endif -} - static void reinit_timer(void) { #ifdef INIT_TIMER_AFTER_SUSPEND @@ -1211,19 +1190,6 @@ static int suspend(int vetoable) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - - /* protect against access to timer chip registers */ - spin_lock(&i8253_lock); - - get_time_diff(); - /* - * Irq spinlock must be dropped around set_system_power_state. - * We'll undo any timer changes due to interrupts below. - */ - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); local_irq_enable(); save_processor_state(); @@ -1232,13 +1198,7 @@ static int suspend(int vetoable) restore_processor_state(); local_irq_disable(); - write_seqlock(&xtime_lock); - spin_lock(&i8253_lock); reinit_timer(); - set_time(); - - spin_unlock(&i8253_lock); - write_sequnlock(&xtime_lock); if (err == APM_NO_ERROR) err = APM_SUCCESS; @@ -1267,11 +1227,6 @@ static void standby(void) local_irq_disable(); device_power_down(PMSG_SUSPEND); - /* serialize with the timer interrupt */ - write_seqlock(&xtime_lock); - /* If needed, notify drivers here */ - get_time_diff(); - write_sequnlock(&xtime_lock); local_irq_enable(); err = set_system_power_state(APM_STATE_STANDBY); @@ -1365,9 +1320,6 @@ static void check_events(void) ignore_bounce = 1; if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) { - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); device_resume(); pm_send_all(PM_RESUME, (void *)0); queue_event(event, NULL); @@ -1383,9 +1335,6 @@ static void check_events(void) break; case APM_UPDATE_TIME: - write_seqlock_irq(&xtime_lock); - set_time(); - write_sequnlock_irq(&xtime_lock); break; case APM_CRITICAL_SUSPEND: Index: linux/arch/i386/kernel/cpu/mtrr/generic.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c +++ linux/arch/i386/kernel/cpu/mtrr/generic.c @@ -234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 static unsigned long cr4 = 0; static u32 deftype_lo, deftype_hi; -static DEFINE_SPINLOCK(set_atomicity_lock); +static DEFINE_RAW_SPINLOCK(set_atomicity_lock); /* * Since we are disabling the cache don't allow any interrupts - they Index: linux/arch/i386/kernel/cpu/mtrr/main.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/mtrr/main.c +++ linux/arch/i386/kernel/cpu/mtrr/main.c @@ -135,8 +135,6 @@ struct set_mtrr_data { mtrr_type smp_type; }; 
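(Aside, illustration only.) The DEFINE_RAW_SPINLOCK conversion just above, and the many like it below (microcode, i8253, i8259A, IO-APIC and vector locks), all follow one PREEMPT_RT rule: a plain spinlock_t becomes a sleeping lock on -rt, so any lock taken where sleeping is impossible, e.g. with hard interrupts disabled around a hardware register sequence, must be declared raw. A minimal sketch of the pattern (demo_* names are invented; the type-switching lock API is the one this patch relies on):

	#include <linux/spinlock.h>

	/* raw_spinlock_t: stays a true busy-wait lock even on PREEMPT_RT */
	static DEFINE_RAW_SPINLOCK(demo_hw_lock);

	static void demo_program_hw(void)
	{
		unsigned long flags;

		/* same spin_lock_irqsave() spelling as before; it compiles
		 * to a real spinning lock because demo_hw_lock is raw, so
		 * it is safe with hard interrupts disabled: */
		spin_lock_irqsave(&demo_hw_lock, flags);
		/* ... poke hardware registers atomically here ... */
		spin_unlock_irqrestore(&demo_hw_lock, flags);
	}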
-#ifdef CONFIG_SMP - static void ipi_handler(void *info) /* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. [RETURNS] Nothing. @@ -166,8 +164,6 @@ static void ipi_handler(void *info) local_irq_restore(flags); } -#endif - /** * set_mtrr - update mtrrs on all processors * @reg: mtrr in question Index: linux/arch/i386/kernel/cpu/transmeta.c =================================================================== --- linux.orig/arch/i386/kernel/cpu/transmeta.c +++ linux/arch/i386/kernel/cpu/transmeta.c @@ -9,7 +9,8 @@ static void __init init_transmeta(struct { unsigned int cap_mask, uk, max, dummy; unsigned int cms_rev1, cms_rev2; - unsigned int cpu_rev, cpu_freq, cpu_flags, new_cpu_rev; + unsigned int cpu_rev, cpu_freq = 0 /* shut up gcc warning */, + cpu_flags, new_cpu_rev; char cpu_info[65]; get_model_name(c); /* Same as AMD/Cyrix */ Index: linux/arch/i386/kernel/entry.S =================================================================== --- linux.orig/arch/i386/kernel/entry.S +++ linux/arch/i386/kernel/entry.S @@ -248,14 +248,18 @@ ENTRY(resume_userspace) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) cli + cmpl $0, kernel_preemption + jz restore_nocheck cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ? jnz restore_nocheck need_resched: movl TI_flags(%ebp), %ecx # need_resched set ? testb $_TIF_NEED_RESCHED, %cl - jz restore_all + jz restore_nocheck testl $IF_MASK,EFLAGS(%esp) # interrupts off (exception path) ? - jz restore_all + jz restore_nocheck + cli + TRACE_IRQS_OFF call preempt_schedule_irq jmp need_resched #endif @@ -311,6 +315,11 @@ sysenter_past_esp: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ @@ -325,6 +334,11 @@ sysenter_past_esp: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work +#ifdef CONFIG_LATENCY_TRACE + pushl %eax + call sys_ret + popl %eax +#endif /* if something modifies registers it must also disable sysexit */ movl EIP(%esp), %edx movl OLDESP(%esp), %ecx @@ -341,6 +355,11 @@ ENTRY(system_call) pushl %eax # save orig_eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL +#ifdef CONFIG_LATENCY_TRACE + pushl %edx; pushl %ecx; pushl %ebx; pushl %eax + call sys_call + popl %eax; popl %ebx; popl %ecx; popl %edx +#endif GET_THREAD_INFO(%ebp) testl $TF_MASK,EFLAGS(%esp) jz no_singlestep @@ -430,19 +449,20 @@ ldt_ss: ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jz work_notifysig work_resched: - call schedule - cli # make sure we don't miss an interrupt + cli + TRACE_IRQS_OFF + call __schedule + # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret - TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx andl $_TIF_WORK_MASK, %ecx # is there any work to be done other # than syscall tracing? 
jz restore_all - testb $_TIF_NEED_RESCHED, %cl + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx jnz work_resched work_notifysig: # deal with pending signals and Index: linux/arch/i386/kernel/head.S =================================================================== --- linux.orig/arch/i386/kernel/head.S +++ linux/arch/i386/kernel/head.S @@ -397,6 +397,7 @@ ignore_int: call printk #endif addl $(5*4),%esp + call dump_stack popl %ds popl %es popl %edx Index: linux/arch/i386/kernel/i386_ksyms.c =================================================================== --- linux.orig/arch/i386/kernel/i386_ksyms.c +++ linux/arch/i386/kernel/i386_ksyms.c @@ -2,10 +2,12 @@ #include #include -EXPORT_SYMBOL(__down_failed); -EXPORT_SYMBOL(__down_failed_interruptible); -EXPORT_SYMBOL(__down_failed_trylock); -EXPORT_SYMBOL(__up_wakeup); +#ifdef CONFIG_ASM_SEMAPHORES +EXPORT_SYMBOL(__compat_down_failed); +EXPORT_SYMBOL(__compat_down_failed_interruptible); +EXPORT_SYMBOL(__compat_down_failed_trylock); +EXPORT_SYMBOL(__compat_up_wakeup); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); @@ -20,7 +22,7 @@ EXPORT_SYMBOL(__put_user_8); EXPORT_SYMBOL(strstr); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES) extern void FASTCALL( __write_lock_failed(rwlock_t *rw)); extern void FASTCALL( __read_lock_failed(rwlock_t *rw)); EXPORT_SYMBOL(__write_lock_failed); Index: linux/arch/i386/kernel/i8253.c =================================================================== --- linux.orig/arch/i386/kernel/i8253.c +++ linux/arch/i386/kernel/i8253.c @@ -2,7 +2,7 @@ * i8253.c 8253/PIT functions * */ -#include +#include #include #include #include @@ -16,22 +16,66 @@ #include "io_ports.h" -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -void setup_pit_timer(void) +static void init_pit_timer(int mode, struct clock_event *evt) +{ + unsigned long flags; + + spin_lock_irqsave(&i8253_lock, flags); + + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + case CLOCK_EVT_SHUTDOWN: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + } + spin_unlock_irqrestore(&i8253_lock, flags); +} + +static void pit_next_event(unsigned long delta, struct clock_event *evt) { unsigned long flags; spin_lock_irqsave(&i8253_lock, flags); - outb_p(0x34,PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ - udelay(10); - outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ - udelay(10); - outb(LATCH >> 8 , PIT_CH0); /* MSB */ + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ spin_unlock_irqrestore(&i8253_lock, flags); } +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) +{ + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); +} + /* * Since the PIT overflows every tick, its not very useful * to just read by itself. 
So use jiffies to emulate a free @@ -46,7 +90,7 @@ static cycle_t pit_read(void) static u32 old_jifs; spin_lock_irqsave(&i8253_lock, flags); - /* + /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine * by having side effects on state that we cannot undo if Index: linux/arch/i386/kernel/i8259.c =================================================================== --- linux.orig/arch/i386/kernel/i8259.c +++ linux/arch/i386/kernel/i8259.c @@ -34,39 +34,21 @@ * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ +DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This contains the irq mask for both 8259A irq controllers, */ @@ -131,7 +113,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1< #include #include +#include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include @@ -49,8 +51,8 @@ atomic_t irq_mis_count; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); int timer_over_8254 __initdata = 1; @@ -85,14 +87,6 @@ static struct irq_pin_list { int apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. 
We are super
@@ -136,6 +130,105 @@ static void __init replace_pin_at_irq(un
 	}
 }
 
+//#define IOAPIC_CACHE
+
+#ifdef IOAPIC_CACHE
+# define MAX_IOAPIC_CACHE 512
+
+/*
+ * Cache register values:
+ */
+static unsigned int io_apic_cache[MAX_IO_APICS][MAX_IOAPIC_CACHE]
+			____cacheline_aligned_in_smp;
+#endif
+
+inline unsigned int __raw_io_apic_read(unsigned int apic, unsigned int reg)
+{
+	*IO_APIC_BASE(apic) = reg;
+	return *(IO_APIC_BASE(apic)+4);
+}
+
+unsigned int raw_io_apic_read(unsigned int apic, unsigned int reg)
+{
+	unsigned int val = __raw_io_apic_read(apic, reg);
+
+#ifdef IOAPIC_CACHE
+	io_apic_cache[apic][reg] = val;
+#endif
+	return val;
+}
+
+unsigned int io_apic_read(unsigned int apic, unsigned int reg)
+{
+#ifdef IOAPIC_CACHE
+	if (unlikely(reg >= MAX_IOAPIC_CACHE)) {
+		static int once = 1;
+
+		if (once) {
+			once = 0;
+			printk("WARNING: ioapic register cache overflow: %d.\n",
+			       reg);
+			dump_stack();
+		}
+		return __raw_io_apic_read(apic, reg);
+	}
+	if (io_apic_cache[apic][reg] && !sis_apic_bug)
+		return io_apic_cache[apic][reg];
+#endif
+	return raw_io_apic_read(apic, reg);
+}
+
+void io_apic_write(unsigned int apic, unsigned int reg, unsigned int val)
+{
+#ifdef IOAPIC_CACHE
+	if (unlikely(reg >= MAX_IOAPIC_CACHE)) {
+		static int once = 1;
+
+		if (once) {
+			once = 0;
+			printk("WARNING: ioapic register cache overflow: %d.\n",
+			       reg);
+			dump_stack();
+		}
+	} else
+		io_apic_cache[apic][reg] = val;
+#endif
+	*IO_APIC_BASE(apic) = reg;
+	*(IO_APIC_BASE(apic)+4) = val;
+}
+
+/*
+ * Some systems need a POST flush or else level-triggered interrupts
+ * generate lots of spurious interrupts due to the POST-ed write not
+ * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC.
+ */
+#ifdef CONFIG_SMP
+# define IOAPIC_POSTFLUSH
+#endif
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APICs require that we rewrite the index register
+ */
+void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int val)
+{
+#ifdef IOAPIC_CACHE
+	io_apic_cache[apic][reg] = val;
+#endif
+	if (unlikely(sis_apic_bug))
+		*IO_APIC_BASE(apic) = reg;
+	*(IO_APIC_BASE(apic)+4) = val;
+#ifndef IOAPIC_POSTFLUSH
+	if (unlikely(sis_apic_bug))
+#endif
+		/*
+		 * Force POST flush by reading:
+		 */
+		val = *(IO_APIC_BASE(apic)+4);
+}
+
 static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
@@ -167,18 +260,6 @@ static void __unmask_IO_APIC_irq (unsign
 	__modify_IO_APIC_irq(irq, 0, 0x00010000);
 }
 
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
-}
-
 static void mask_IO_APIC_irq (unsigned int irq)
 {
 	unsigned long flags;
@@ -258,7 +339,7 @@ static void set_ioapic_affinity_irq(unsi
 			break;
 		entry = irq_2_pin + entry->next;
 	}
-	set_irq_info(irq, cpumask);
+	set_native_irq_info(irq, cpumask);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -1159,46 +1240,45 @@ static inline int IO_APIC_irq_trigger(in
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs.
  */
 u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
 
-int assign_irq_vector(int irq)
+static int __assign_irq_vector(int irq)
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
-	unsigned long flags;
 	int vector;
 
-	BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS);
+	BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
 
-	spin_lock_irqsave(&vector_lock, flags);
-
-	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) {
-		spin_unlock_irqrestore(&vector_lock, flags);
+	if (IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
-	}
-next:
+
 	current_vector += 8;
 	if (current_vector == SYSCALL_VECTOR)
-		goto next;
+		current_vector += 8;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
 		offset++;
-		if (!(offset%8)) {
-			spin_unlock_irqrestore(&vector_lock, flags);
+		if (!(offset % 8))
 			return -ENOSPC;
-		}
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
 	vector = current_vector;
-	vector_irq[vector] = irq;
-	if (irq != AUTO_ASSIGN)
-		IO_APIC_VECTOR(irq) = vector;
+	IO_APIC_VECTOR(irq) = vector;
+	return vector;
+}
+
+static int assign_irq_vector(int irq)
+{
+	unsigned long flags;
+	int vector;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	vector = __assign_irq_vector(irq);
 	spin_unlock_irqrestore(&vector_lock, flags);
 
 	return vector;
 }
-
-static struct hw_interrupt_type ioapic_level_type;
-static struct hw_interrupt_type ioapic_edge_type;
+static struct irq_chip ioapic_chip;
 
 #define IOAPIC_AUTO	-1
 #define IOAPIC_EDGE	0
@@ -1206,16 +1286,17 @@ static struct hw_interrupt_type ioapic_e
 
 static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
 {
-	unsigned idx;
-
-	idx = use_pci_vector() && !platform_legacy_irq(irq) ? vector : irq;
-
-	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
-			trigger == IOAPIC_LEVEL)
-		irq_desc[idx].chip = &ioapic_level_type;
-	else
-		irq_desc[idx].chip = &ioapic_edge_type;
-	set_intr_gate(vector, interrupt[idx]);
+	if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
+			trigger == IOAPIC_LEVEL) {
+#ifdef CONFIG_PREEMPT_HARDIRQS
+		set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq);
+#else
+		set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq);
+#endif
+	} else {
+		set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq);
+	}
+	set_intr_gate(vector, interrupt[irq]);
 }
 
 static void __init setup_IO_APIC_irqs(void)
@@ -1326,7 +1407,8 @@ static void __init setup_ExtINT_IRQ0_pin
 	 * The timer IRQ doesn't have to know that behind the
 	 * scene we have a 8259A-master in AEOI mode ...
*/ - irq_desc[0].chip = &ioapic_edge_type; + irq_desc[0].chip = &ioapic_chip; + set_irq_handler(0, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1445,8 +1527,8 @@ void __init print_IO_APIC(void) struct IO_APIC_route_entry entry; spin_lock_irqsave(&ioapic_lock, flags); - *(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2); - *(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2); + *(((int *)&entry)+0) = raw_io_apic_read(apic, 0x10+i*2); + *(((int *)&entry)+1) = raw_io_apic_read(apic, 0x11+i*2); spin_unlock_irqrestore(&ioapic_lock, flags); printk(KERN_DEBUG " %02x %03X %02X ", @@ -1467,17 +1549,12 @@ void __init print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1492,7 +1569,7 @@ void __init print_IO_APIC(void) return; } -#if 0 +#if 1 static void print_APIC_bitfield (int base) { @@ -1893,7 +1970,7 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ - if (jiffies - t1 > 4) + if (jiffies - t1 > 4 && jiffies - t1 < 16) return 1; return 0; @@ -1913,6 +1990,8 @@ static int __init timer_irq_works(void) */ /* + * Startup quirk: + * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -1920,8 +1999,10 @@ static int __init timer_irq_works(void) * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... + * + * (We do this for level-triggered IRQs too - it cannot hurt.) */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1938,47 +2019,18 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_irq(unsigned int irq) { - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ + move_native_irq(irq); + ack_APIC_irq(); } -static void end_level_ioapic_irq (unsigned int irq) +static void ack_ioapic_quirk_irq(unsigned int irq) { unsigned long v; int i; - move_irq(irq); + move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 * of I/O APIC (that's the 82093AA and cores integrated into various @@ -2007,111 +2059,34 @@ static void end_level_ioapic_irq (unsign if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); + /* mask = 1, trigger = 0 */ + __modify_IO_APIC_irq(irq, 0x00010000, 0x00008000); + /* mask = 0, trigger = 1 */ + __modify_IO_APIC_irq(irq, 0x00008000, 0x00010000); spin_unlock(&ioapic_lock); } } -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_edge_ioapic_irq(irq); -} - -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif -#endif - -static int ioapic_retrigger(unsigned int irq) +static int ioapic_retrigger_irq(unsigned int irq) { send_IPI_self(IO_APIC_VECTOR(irq)); return 1; } -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. 
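(Aside, illustration only.) The hunks around this point replace the two per-trigger hw_interrupt_type descriptors with a single struct irq_chip plus a generic flow handler chosen at registration time: the chip supplies only the low-level mask/ack/eoi operations, while handle_edge_irq(), handle_fasteoi_irq() and handle_level_irq() carry the policy that the old ->ack/->end methods hard-coded per controller. A condensed sketch of the new shape (demo_* names are invented; the registration call is the same one this patch uses):

	#include <linux/irq.h>

	static void demo_mask(unsigned int irq)   { /* set controller mask bit */ }
	static void demo_unmask(unsigned int irq) { /* clear controller mask bit */ }
	static void demo_ack(unsigned int irq)    { /* acknowledge pending IRQ */ }

	static struct irq_chip demo_chip = {
		.name	= "demo",
		.mask	= demo_mask,
		.unmask	= demo_unmask,
		.ack	= demo_ack,
	};

	static void __init demo_register(unsigned int irq, int level)
	{
		/* one chip, two behaviours; the flow handler picks the policy: */
		set_irq_chip_and_handler(irq, &demo_chip,
					 level ? handle_level_irq : handle_edge_irq);
	}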
- */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_ioapic_irq, + .eoi = ack_ioapic_quirk_irq, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, -#endif - .retrigger = ioapic_retrigger, -}; static inline void init_IO_APIC_traps(void) { @@ -2130,11 +2105,6 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -2145,20 +2115,21 @@ static inline void init_IO_APIC_traps(vo make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } -static void enable_lapic_irq (unsigned int irq) -{ - unsigned long v; +/* + * The local APIC irq-chip implementation: + */ - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +static void ack_apic(unsigned int irq) +{ + ack_APIC_irq(); } -static void disable_lapic_irq (unsigned int irq) +static void mask_lapic_irq (unsigned int irq) { unsigned long v; @@ -2166,21 +2137,19 @@ static void disable_lapic_irq (unsigned apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); } -static void ack_lapic_irq (unsigned int irq) +static void unmask_lapic_irq (unsigned int irq) { - ack_APIC_irq(); -} + unsigned long v; -static void end_lapic_irq (unsigned int i) { /* nothing */ } + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); +} -static struct hw_interrupt_type lapic_irq_type __read_mostly = { - .typename = "local-APIC-edge", - .startup = NULL, /* startup_irq() not used for IRQ0 */ - .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ - .enable = enable_lapic_irq, - .disable = disable_lapic_irq, - .ack = ack_lapic_irq, - .end = end_lapic_irq +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC-edge", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .eoi = ack_apic, }; static void setup_nmi (void) @@ -2361,7 +2330,7 @@ static inline void check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); disable_8259A_irq(0); - irq_desc[0].chip = &lapic_irq_type; + set_irq_chip_and_handler(0, &lapic_chip, handle_fasteoi_irq); apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); @@ -2543,6 +2512,117 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +#ifdef CONFIG_PCI_MSI +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + /* Allocate an unused irq */ 
+	int irq, new, vector;
+	unsigned long flags;
+
+	irq = -ENOSPC;
+	spin_lock_irqsave(&vector_lock, flags);
+	for (new = (NR_IRQS - 1); new >= 0; new--) {
+		if (platform_legacy_irq(new))
+			continue;
+		if (irq_vector[new] != 0)
+			continue;
+		vector = __assign_irq_vector(new);
+		if (likely(vector > 0))
+			irq = new;
+		break;
+	}
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	if (irq >= 0) {
+		set_intr_gate(vector, interrupt[irq]);
+		dynamic_irq_init(irq);
+	}
+	return irq;
+}
+
+void destroy_irq(unsigned int irq)
+{
+	unsigned long flags;
+
+	dynamic_irq_cleanup(irq);
+
+	spin_lock_irqsave(&vector_lock, flags);
+	irq_vector[irq] = 0;
+	spin_unlock_irqrestore(&vector_lock, flags);
+}
+#endif /* CONFIG_PCI_MSI */
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_PCI_MSI
+static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+{
+	/* For now this code always uses physical delivery
+	 * mode.
+	 */
+	int vector;
+	unsigned dest;
+
+	vector = assign_irq_vector(irq);
+	if (vector >= 0) {
+		dest = cpu_mask_to_apicid(TARGET_CPUS);
+
+		msg->address_hi = MSI_ADDR_BASE_HI;
+		msg->address_lo =
+			MSI_ADDR_BASE_LO |
+			((INT_DEST_MODE == 0) ?
+				MSI_ADDR_DEST_MODE_PHYSICAL:
+				MSI_ADDR_DEST_MODE_LOGICAL) |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_ADDR_REDIRECTION_CPU:
+				MSI_ADDR_REDIRECTION_LOWPRI) |
+			MSI_ADDR_DEST_ID(dest);
+
+		msg->data =
+			MSI_DATA_TRIGGER_EDGE |
+			MSI_DATA_LEVEL_ASSERT |
+			((INT_DELIVERY_MODE != dest_LowestPrio) ?
+				MSI_DATA_DELIVERY_FIXED:
+				MSI_DATA_DELIVERY_LOWPRI) |
+			MSI_DATA_VECTOR(vector);
+	}
+	return vector;
+}
+
+static void msi_msg_teardown(unsigned int irq)
+{
+	return;
+}
+
+static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg)
+{
+	int vector;
+	unsigned dest;
+
+	vector = assign_irq_vector(irq);
+	if (vector > 0) {
+		dest = cpu_mask_to_apicid(mask);
+
+		msg->data &= ~MSI_DATA_VECTOR_MASK;
+		msg->data |= MSI_DATA_VECTOR(vector);
+		msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+		msg->address_lo |= MSI_ADDR_DEST_ID(dest);
+	}
+}
+
+struct msi_ops arch_msi_ops = {
+	.needs_64bit_address = 0,
+	.setup = msi_msg_setup,
+	.teardown = msi_msg_teardown,
+	.target = msi_msg_set_affinity,
+};
+
+#endif /* CONFIG_PCI_MSI */
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -2697,7 +2777,7 @@ int io_apic_set_pci_routing (int ioapic,
 
 	spin_lock_irqsave(&ioapic_lock, flags);
 	io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
 	io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
-	set_native_irq_info(use_pci_vector() ? entry.vector : irq, TARGET_CPUS);
+	set_native_irq_info(irq, TARGET_CPUS);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return 0;
Index: linux/arch/i386/kernel/irq.c
===================================================================
--- linux.orig/arch/i386/kernel/irq.c
+++ linux/arch/i386/kernel/irq.c
@@ -51,10 +51,11 @@ static union irq_ctx *softirq_ctx[NR_CPU
  * SMP cross-CPU interrupts have their own specific
  * handlers).
*/ -fastcall unsigned int do_IRQ(struct pt_regs *regs) +fastcall notrace unsigned int do_IRQ(struct pt_regs *regs) { /* high bit used in ret_from_ code */ int irq = ~regs->orig_eax; + struct irq_desc *desc = irq_desc + irq; #ifdef CONFIG_4KSTACKS union irq_ctx *curctx, *irqctx; u32 *isp; @@ -67,6 +68,11 @@ fastcall unsigned int do_IRQ(struct pt_r } irq_enter(); +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->eip, irq, 0); #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { @@ -75,12 +81,25 @@ fastcall unsigned int do_IRQ(struct pt_r __asm__ __volatile__("andl %%esp,%0" : "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) { - printk("do_IRQ: stack overflow: %ld\n", + printk("BUG: do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); dump_stack(); } } #endif +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif #ifdef CONFIG_4KSTACKS @@ -94,7 +113,7 @@ fastcall unsigned int do_IRQ(struct pt_r * current stack (which is the irq stack already after all) */ if (curctx != irqctx) { - int arg1, arg2, ebx; + int arg1, arg2, arg3, ebx; /* build the stack frame on the IRQ stack */ isp = (u32*) ((char*)irqctx + sizeof(*irqctx)); @@ -110,16 +129,17 @@ fastcall unsigned int do_IRQ(struct pt_r (curctx->tinfo.preempt_count & SOFTIRQ_MASK); asm volatile( - " xchgl %%ebx,%%esp \n" - " call __do_IRQ \n" + " xchgl %%ebx,%%esp \n" + " call *%%edi \n" " movl %%ebx,%%esp \n" - : "=a" (arg1), "=d" (arg2), "=b" (ebx) - : "0" (irq), "1" (regs), "2" (isp) - : "memory", "cc", "ecx" + : "=a" (arg1), "=d" (arg2), "=c" (arg3), "=b" (ebx) + : "0" (irq), "1" (desc), "2" (regs), "3" (isp), + "D" (desc->handle_irq) + : "memory", "cc" ); } else #endif - __do_IRQ(irq, regs); + desc->handle_irq(irq, desc, regs); irq_exit(); @@ -242,8 +262,10 @@ int show_interrupts(struct seq_file *p, } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + irq_desc_t *desc = irq_desc + i; + + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action) goto skip; seq_printf(p, "%3d: ",i); @@ -253,7 +275,22 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %-14s", irq_desc[i].chip->name); +#define F(x,c) ((desc->status & x) ? 
c : '.') + seq_printf(p, " [%c%c%c%c%c%c%c%c%c/", + F(IRQ_INPROGRESS, 'I'), + F(IRQ_DISABLED, 'D'), + F(IRQ_PENDING, 'P'), + F(IRQ_REPLAY, 'R'), + F(IRQ_AUTODETECT, 'A'), + F(IRQ_WAITING, 'W'), + F(IRQ_LEVEL, 'L'), + F(IRQ_MASKED, 'M'), + F(IRQ_NODELAY, 'N')); +#undef F + seq_printf(p, "%3d]", desc->irqs_unhandled); + + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) Index: linux/arch/i386/kernel/kprobes.c =================================================================== --- linux.orig/arch/i386/kernel/kprobes.c +++ linux/arch/i386/kernel/kprobes.c @@ -338,7 +338,7 @@ ss_probe: /* Boost up -- we can execute copied instructions directly */ reset_current_kprobe(); regs->eip = (unsigned long)p->ainsn.insn; - preempt_enable_no_resched(); + preempt_enable(); return 1; } #endif @@ -347,7 +347,7 @@ ss_probe: return 1; no_kprobe: - preempt_enable_no_resched(); + preempt_enable(); return ret; } @@ -566,7 +566,7 @@ static int __kprobes post_kprobe_handler } reset_current_kprobe(); out: - preempt_enable_no_resched(); + preempt_enable(); /* * if somebody else is singlestepping across a probe point, eflags @@ -600,7 +600,7 @@ static int __kprobes kprobe_fault_handle restore_previous_kprobe(kcb); else reset_current_kprobe(); - preempt_enable_no_resched(); + preempt_enable(); break; case KPROBE_HIT_ACTIVE: case KPROBE_HIT_SSDONE: @@ -734,7 +734,7 @@ int __kprobes longjmp_break_handler(stru *regs = kcb->jprobe_saved_regs; memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack, MIN_STACK_SIZE(stack_addr)); - preempt_enable_no_resched(); + preempt_enable(); return 1; } return 0; Index: linux/arch/i386/kernel/mcount-wrapper.S =================================================================== --- /dev/null +++ linux/arch/i386/kernel/mcount-wrapper.S @@ -0,0 +1,27 @@ +/* + * linux/arch/i386/mcount-wrapper.S + * + * Copyright (C) 2004 Ingo Molnar + */ + +.globl mcount +mcount: + + cmpl $0, mcount_enabled + jz out + + push %ebp + mov %esp, %ebp + pushl %eax + pushl %ecx + pushl %edx + + call __mcount + + popl %edx + popl %ecx + popl %eax + popl %ebp +out: + ret + Index: linux/arch/i386/kernel/microcode.c =================================================================== --- linux.orig/arch/i386/kernel/microcode.c +++ linux/arch/i386/kernel/microcode.c @@ -115,7 +115,7 @@ module_param(verbose, int, 0644); #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) /* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); +static DEFINE_RAW_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); Index: linux/arch/i386/kernel/mpparse.c =================================================================== --- linux.orig/arch/i386/kernel/mpparse.c +++ linux/arch/i386/kernel/mpparse.c @@ -228,12 +228,17 @@ static void __init MP_bus_info (struct m mpc_oem_bus_info(m, str, translation_table[mpc_record]); + /* + * mpc_busid is char: + */ +#if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { printk(KERN_WARNING "MP table busid value (%d) for bustype %s " " is too large, max. 
supported is %d\n",
 			m->mpc_busid, str, MAX_MP_BUSSES - 1);
 		return;
 	}
+#endif
 
 	if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA)-1) == 0) {
 		mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
Index: linux/arch/i386/kernel/nmi.c
===================================================================
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -30,7 +31,7 @@
 unsigned int nmi_watchdog = NMI_NONE;
 extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);
@@ -99,7 +100,6 @@ int nmi_active;
 #define ARCH_PERFMON_NMI_EVENT_SEL	ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
 #define ARCH_PERFMON_NMI_EVENT_UMASK	ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
 
-#ifdef CONFIG_SMP
 /* The performance counters used by NMI_LOCAL_APIC don't trigger when
  * the CPU is idle. To make sure the NMI watchdog really ticks on all
  * CPUs during the test make them busy.
@@ -107,7 +107,12 @@ int nmi_active;
 static __init void nmi_cpu_busy(void *data)
 {
 	volatile int *endflag = data;
+	/*
+	 * avoid a warning, on PREEMPT_RT this won't run in hardirq context:
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	local_irq_enable_in_hardirq();
+#endif
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
@@ -117,7 +122,6 @@ static __init void nmi_cpu_busy(void *da
 	while (*endflag == 0)
 		barrier();
 }
-#endif
 
 static int __init check_nmi_watchdog(void)
 {
@@ -140,7 +144,7 @@ static int __init check_nmi_watchdog(voi
 	for_each_possible_cpu(cpu)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
 	local_irq_enable();
-	mdelay((10*1000)/nmi_hz); // wait 10 ticks
+	mdelay((100*1000)/nmi_hz); // wait 100 ticks
 
 	for_each_possible_cpu(cpu) {
 #ifdef CONFIG_SMP
@@ -167,7 +171,7 @@ static int __init check_nmi_watchdog(voi
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
 	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = 1;
+		nmi_hz = 10000;
 
 	kfree(prev_nmi_count);
 	return 0;
@@ -579,9 +583,34 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
 
 extern void die_nmi(struct pt_regs *, const char *msg);
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+int nmi_show_regs[NR_CPUS];
+
+void nmi_show_all_regs(void)
 {
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+	printk("nmi_show_all_regs(): start on CPU#%d.\n",
+		raw_smp_processor_id());
+	dump_stack();
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
+void notrace nmi_watchdog_tick (struct pt_regs * regs)
+{
 	/*
 	 * Since current_thread_info()-> is always on the stack, and we
 	 * always switch the stack NMI-atomically, it's safe to use
@@ -590,7 +619,16 @@ void nmi_watchdog_tick (struct pt_regs *
 	unsigned int sum;
 	int cpu = smp_processor_id();
 
-	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+	sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
+
+	profile_tick(CPU_PROFILING, regs);
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		printk("NMI show regs on CPU#%d:\n", cpu);
+
show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (last_irq_sums[cpu] == sum) { /* @@ -598,11 +636,26 @@ void nmi_watchdog_tick (struct pt_regs * * wait a few IRQs (5 seconds) before doing the oops ... */ alert_counter[cpu]++; - if (alert_counter[cpu] == 5*nmi_hz) - /* - * die_nmi will return ONLY if NOTIFY_STOP happens.. - */ - die_nmi(regs, "BUG: NMI Watchdog detected LOCKUP"); + if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) { + int i; + + bust_spinlocks(1); + spin_lock(&nmi_print_lock); + printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", + cpu, alert_counter[cpu], 5*nmi_hz); + show_regs(regs); + spin_unlock(&nmi_print_lock); + + for_each_online_cpu(i) + if (i != cpu) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); + + die_nmi(regs, "NMI Watchdog detected LOCKUP"); + } + } else { last_irq_sums[cpu] = sum; alert_counter[cpu] = 0; Index: linux/arch/i386/kernel/process.c =================================================================== --- linux.orig/arch/i386/kernel/process.c +++ linux/arch/i386/kernel/process.c @@ -103,16 +103,20 @@ void default_idle(void) if (!hlt_counter && boot_cpu_data.hlt_works_ok) { current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; } else { - while (!need_resched()) + while (!need_resched() && !need_resched_delayed()) cpu_relax(); } } @@ -125,16 +129,18 @@ EXPORT_SYMBOL(default_idle); * to poll the ->work.need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. 
*/ -static void poll_idle (void) +static void poll_idle(void) { local_irq_enable(); - asm volatile( - "2:" - "testl %0, %1;" - "rep; nop;" - "je 2b;" - : : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags)); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } } #ifdef CONFIG_HOTPLUG_CPU @@ -177,7 +183,9 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -195,9 +203,11 @@ void cpu_idle(void) __get_cpu_var(irq_stat).idle_timestamp = jiffies; idle(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +250,16 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void *)¤t_thread_info()->flags, 0, 0); smp_mb(); - if (need_resched()) + if (need_resched() || need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __devinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -363,15 +376,23 @@ void exit_thread(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) { struct task_struct *tsk = current; struct thread_struct *t = &tsk->thread; - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + void *io_bitmap_ptr = t->io_bitmap_ptr; + int cpu; + struct tss_struct *tss; - kfree(t->io_bitmap_ptr); + /* + * On PREEMPT_RT we must not call kfree() with + * preemption disabled, so we first zap the pointer: + */ t->io_bitmap_ptr = NULL; + kfree(io_bitmap_ptr); + clear_thread_flag(TIF_IO_BITMAP); /* * Careful, clear this in the TSS too: */ + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); memset(tss->io_bitmap, 0xff, tss->io_bitmap_max); t->io_bitmap_max = 0; tss->io_bitmap_owner = NULL; Index: linux/arch/i386/kernel/semaphore.c =================================================================== --- linux.orig/arch/i386/kernel/semaphore.c +++ linux/arch/i386/kernel/semaphore.c @@ -12,6 +12,7 @@ * * rw semaphores implemented November 1999 by Benjamin LaHaise */ +#include #include /* @@ -27,15 +28,15 @@ asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed\n" -"__down_failed:\n\t" +".globl __compat_down_failed\n" +"__compat_down_failed:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down\n\t" + "call __compat_down\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -48,15 +49,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_interruptible\n" -"__down_failed_interruptible:\n\t" +".globl __compat_down_failed_interruptible\n" +"__compat_down_failed_interruptible:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_interruptible\n\t" + "call __compat_down_interruptible\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -69,15 +70,15 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __down_failed_trylock\n" 
-"__down_failed_trylock:\n\t" +".globl __compat_down_failed_trylock\n" +"__compat_down_failed_trylock:\n\t" #if defined(CONFIG_FRAME_POINTER) "pushl %ebp\n\t" "movl %esp,%ebp\n\t" #endif "pushl %edx\n\t" "pushl %ecx\n\t" - "call __down_trylock\n\t" + "call __compat_down_trylock\n\t" "popl %ecx\n\t" "popl %edx\n\t" #if defined(CONFIG_FRAME_POINTER) @@ -90,45 +91,13 @@ asm( asm( ".section .sched.text\n" ".align 4\n" -".globl __up_wakeup\n" -"__up_wakeup:\n\t" +".globl __compat_up_wakeup\n" +"__compat_up_wakeup:\n\t" "pushl %edx\n\t" "pushl %ecx\n\t" - "call __up\n\t" + "call __compat_up\n\t" "popl %ecx\n\t" "popl %edx\n\t" "ret" ); -/* - * rw spinlock fallbacks - */ -#if defined(CONFIG_SMP) -asm( -".section .sched.text\n" -".align 4\n" -".globl __write_lock_failed\n" -"__write_lock_failed:\n\t" - LOCK_PREFIX "addl $" RW_LOCK_BIAS_STR ",(%eax)\n" -"1: rep; nop\n\t" - "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jne 1b\n\t" - LOCK_PREFIX "subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" - "jnz __write_lock_failed\n\t" - "ret" -); - -asm( -".section .sched.text\n" -".align 4\n" -".globl __read_lock_failed\n" -"__read_lock_failed:\n\t" - LOCK_PREFIX "incl (%eax)\n" -"1: rep; nop\n\t" - "cmpl $1,(%eax)\n\t" - "js 1b\n\t" - LOCK_PREFIX "decl (%eax)\n\t" - "js __read_lock_failed\n\t" - "ret" -); -#endif Index: linux/arch/i386/kernel/setup.c =================================================================== --- linux.orig/arch/i386/kernel/setup.c +++ linux/arch/i386/kernel/setup.c @@ -61,7 +61,7 @@ #include #include #include - +#include /* Forward Declaration. */ void __init find_max_pfn(void); @@ -1580,6 +1580,7 @@ void __init setup_arch(char **cmdline_p) #endif #endif tsc_init(); + vsyscall_init(); } static __init int add_pcspkr(void) Index: linux/arch/i386/kernel/signal.c =================================================================== --- linux.orig/arch/i386/kernel/signal.c +++ linux/arch/i386/kernel/signal.c @@ -532,6 +532,13 @@ handle_signal(unsigned long sig, siginfo } } +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * If TF is set due to a debugger (PT_DTRACE), clear the TF flag so * that register information in the sigcontext is correct. @@ -572,6 +579,13 @@ static void fastcall do_signal(struct pt struct k_sigaction ka; sigset_t *oldset; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/i386/kernel/smp.c =================================================================== --- linux.orig/arch/i386/kernel/smp.c +++ linux/arch/i386/kernel/smp.c @@ -255,7 +255,7 @@ void send_IPI_mask_sequence(cpumask_t ma static cpumask_t flush_cpumask; static struct mm_struct * flush_mm; static unsigned long flush_va; -static DEFINE_SPINLOCK(tlbstate_lock); +static DEFINE_RAW_SPINLOCK(tlbstate_lock); #define FLUSH_ALL 0xffffffff /* @@ -400,7 +400,7 @@ static void flush_tlb_others(cpumask_t c while (!cpus_empty(flush_cpumask)) /* nothing. lockup detection does not belong here */ - mb(); + cpu_relax(); flush_mm = NULL; flush_va = 0; @@ -491,10 +491,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. 
+ * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); @@ -599,13 +609,14 @@ void smp_send_stop(void) } /* - * Reschedule call back. Nothing to do, - * all the work is done automatically when - * we return from the interrupt. + * Reschedule call back. Trigger a reschedule pass so that + * RT-overload balancing can pass tasks around. */ -fastcall void smp_reschedule_interrupt(struct pt_regs *regs) +fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs) { + trace_special(regs->eip, 0, 0); ack_APIC_irq(); + set_tsk_need_resched(current); } fastcall void smp_call_function_interrupt(struct pt_regs *regs) Index: linux/arch/i386/kernel/time.c =================================================================== --- linux.orig/arch/i386/kernel/time.c +++ linux/arch/i386/kernel/time.c @@ -131,7 +131,7 @@ static int set_rtc_mmss(unsigned long no int timer_ack; #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER) -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -150,15 +150,6 @@ EXPORT_SYMBOL(profile_pc); */ irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - #ifdef CONFIG_X86_IO_APIC if (timer_ack) { /* @@ -177,7 +168,6 @@ irqreturn_t timer_interrupt(int irq, voi do_timer_interrupt_hook(regs); - if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. 
You can't turn them off, nor would you want to (any attempt to @@ -192,8 +182,6 @@ irqreturn_t timer_interrupt(int irq, voi outb_p( irq|0x80, 0x61 ); /* reset the IRQ */ } - write_sequnlock(&xtime_lock); - #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) smp_send_timer_broadcast_ipi(regs); @@ -203,7 +191,7 @@ irqreturn_t timer_interrupt(int irq, voi } /* not static: needed by APM */ -unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned long retval; unsigned long flags; @@ -219,7 +207,7 @@ unsigned long get_cmos_time(void) return retval; } -EXPORT_SYMBOL(get_cmos_time); +EXPORT_SYMBOL(read_persistent_clock); static void sync_cmos_clock(unsigned long dummy); @@ -270,75 +258,11 @@ void notify_arch_cmos_timer(void) mod_timer(&sync_cmos_timer, jiffies + 1); } -static long clock_cmos_diff, sleep_start; - -static int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - /* - * Estimate time zone so that set_time can update the clock - */ - clock_cmos_diff = -get_cmos_time(); - clock_cmos_diff += get_seconds(); - sleep_start = get_cmos_time(); - return 0; -} - -static int timer_resume(struct sys_device *dev) -{ - unsigned long flags; - unsigned long sec; - unsigned long sleep_length; - -#ifdef CONFIG_HPET_TIMER - if (is_hpet_enabled()) - hpet_reenable(); -#endif - setup_pit_timer(); - sec = get_cmos_time() + clock_cmos_diff; - sleep_length = (get_cmos_time() - sleep_start) * HZ; - write_seqlock_irqsave(&xtime_lock, flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies_64 += sleep_length; - wall_jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock, flags); - touch_softlockup_watchdog(); - return 0; -} - -static struct sysdev_class timer_sysclass = { - .resume = timer_resume, - .suspend = timer_suspend, - set_kset_name("timer"), -}; - - -/* XXX this driverfs stuff should probably go elsewhere later -john */ -static struct sys_device device_timer = { - .id = 0, - .cls = &timer_sysclass, -}; - -static int time_init_device(void) -{ - int error = sysdev_class_register(&timer_sysclass); - if (!error) - error = sysdev_register(&device_timer); - return error; -} - -device_initcall(time_init_device); - #ifdef CONFIG_HPET_TIMER extern void (*late_time_init)(void); /* Duplicate of time_init() below, with hpet_enable part added */ static void __init hpet_time_init(void) { - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - if ((hpet_enable() >= 0) && hpet_use_timer) { printk("Using HPET for base-timer\n"); } @@ -359,10 +283,6 @@ void __init time_init(void) return; } #endif - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); time_init_hook(); } Index: linux/arch/i386/kernel/traps.c =================================================================== --- linux.orig/arch/i386/kernel/traps.c +++ linux/arch/i386/kernel/traps.c @@ -226,6 +226,7 @@ static void show_trace_log_lvl(struct ta break; printk("%s =======================\n", log_lvl); } + print_traces(task); } void show_trace(struct task_struct *task, struct pt_regs *regs, unsigned long * stack) @@ -276,6 +277,12 @@ void dump_stack(void) EXPORT_SYMBOL(dump_stack); +#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE) +extern unsigned long worst_stack_left; +#else +# define worst_stack_left -1L +#endif + void 
show_registers(struct pt_regs *regs) { int i; @@ -302,8 +309,8 @@ void show_registers(struct pt_regs *regs regs->eax, regs->ebx, regs->ecx, regs->edx); printk(KERN_EMERG "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->esi, regs->edi, regs->ebp, esp); - printk(KERN_EMERG "ds: %04x es: %04x ss: %04x\n", - regs->xds & 0xffff, regs->xes & 0xffff, ss); + printk(KERN_EMERG "ds: %04x es: %04x ss: %04x preempt: %08x\n", + regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count()); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, current->pid, current_thread_info(), current, current->thread_info); @@ -375,11 +382,11 @@ static void handle_BUG(struct pt_regs *r void die(const char * str, struct pt_regs * regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED(die.lock), .lock_owner = -1, .lock_owner_depth = 0 }; @@ -486,6 +493,11 @@ static void __kprobes do_trap(int trapnr if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + trap_signal: { if (info) force_sig_info(signr, info, tsk); @@ -505,6 +517,7 @@ static void __kprobes do_trap(int trapnr if (ret) goto trap_signal; return; } + print_traces(tsk); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -703,10 +716,11 @@ void die_nmi (struct pt_regs *regs, cons crash_kexec(regs); } + nmi_exit(); do_exit(SIGSEGV); } -static void default_do_nmi(struct pt_regs * regs) +static void notrace default_do_nmi(struct pt_regs * regs) { unsigned char reason = 0; @@ -715,9 +729,6 @@ static void default_do_nmi(struct pt_reg reason = get_nmi_reason(); if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; #ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, @@ -725,9 +736,13 @@ static void default_do_nmi(struct pt_reg */ if (nmi_watchdog) { nmi_watchdog_tick(regs); +// trace_special(6, 1, 0); return; } #endif + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; unknown_nmi_error(reason, regs); return; } @@ -744,18 +759,19 @@ static void default_do_nmi(struct pt_reg reassert_nmi(); } -static int dummy_nmi_callback(struct pt_regs * regs, int cpu) +static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu) { return 0; } static nmi_callback_t nmi_callback = dummy_nmi_callback; -fastcall void do_nmi(struct pt_regs * regs, long error_code) +fastcall notrace void do_nmi(struct pt_regs * regs, long error_code) { int cpu; nmi_enter(); + nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags); cpu = smp_processor_id(); Index: linux/arch/i386/kernel/tsc.c =================================================================== --- linux.orig/arch/i386/kernel/tsc.c +++ linux/arch/i386/kernel/tsc.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -333,6 +334,16 @@ static cycle_t read_tsc(void) return ret; } + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + + rdtscll(ret); + + return ret; +} + static struct clocksource clocksource_tsc = { .name = "tsc", .rating = 300, @@ -342,6 +353,7 @@ static struct clocksource clocksource_ts .shift = 22, .update_callback = tsc_update_callback, .is_continuous = 1, + .vread = vread_tsc, }; static int tsc_update_callback(void) Index: linux/arch/i386/kernel/vm86.c 
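The new .vread hook gives the vsyscall page a way to sample the clocksource without entering the kernel; for the TSC it is a bare rdtsc. A small user-space sketch of the same read (x86 only; __rdtsc() is the GCC/Clang intrinsic from x86intrin.h, not a kernel symbol):

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <x86intrin.h>

    int main(void)
    {
            uint64_t t0 = __rdtsc();        /* same instruction vread_tsc() issues */
            usleep(100 * 1000);
            uint64_t t1 = __rdtsc();

            printf("~100ms = %llu TSC cycles\n", (unsigned long long)(t1 - t0));
            return 0;
    }

Raw cycle deltas like this are what the clocksource's cycle_last, mask, mult and shift fields later turn into nanoseconds.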
=================================================================== --- linux.orig/arch/i386/kernel/vm86.c +++ linux/arch/i386/kernel/vm86.c @@ -109,6 +109,7 @@ struct pt_regs * fastcall save_v86_state local_irq_enable(); if (!current->thread.vm86_info) { + local_irq_disable(); printk("no vm86_info: BAD\n"); do_exit(SIGSEGV); } Index: linux/arch/i386/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/i386/kernel/vmlinux.lds.S +++ linux/arch/i386/kernel/vmlinux.lds.S @@ -8,6 +8,8 @@ #include #include #include +#include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -71,6 +73,51 @@ SECTIONS .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ +/* VSYSCALL_GTOD data */ +#ifdef CONFIG_GENERIC_TIME_VSYSCALL +#undef VSYSCALL_ADDR +#define VSYSCALL_ADDR VSYSCALL_GTOD_START +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } + .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) } + + . = ALIGN(32); + .vsyscall_gtod_data : AT (VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + + . = ALIGN(32); + .vsyscall_gtod_lock : AT (VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) } + vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock); + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } + + . = VSYSCALL_VIRT_ADDR + 4096; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif +/* END of VSYSCALL_GTOD data*/ + #ifdef CONFIG_STACK_UNWIND . 
= ALIGN(4); .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { Index: linux/arch/i386/kernel/vsyscall-gtod.c =================================================================== --- /dev/null +++ linux/arch/i386/kernel/vsyscall-gtod.c @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct vsyscall_gtod_data_t { + struct timeval wall_time_tv; + struct timezone sys_tz; + struct clocksource clock; +}; + +struct vsyscall_gtod_data_t vsyscall_gtod_data; +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data; + +seqlock_t vsyscall_gtod_lock = SEQLOCK_UNLOCKED; +seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = SEQLOCK_UNLOCKED; + +int errno; +static inline _syscall2(int,gettimeofday,struct timeval *,tv,struct timezone *,tz); + +static int vsyscall_mapped = 0; /* flag variable for remap_vsyscall() */ +extern struct timezone sys_tz; + +static inline void do_vgettimeofday(struct timeval* tv) +{ + cycle_t now, cycle_delta; + s64 nsec_delta; + + if (!__vsyscall_gtod_data.clock.vread) { + gettimeofday(tv, NULL); + return; + } + + /* read the clocksource and calc cycle_delta */ + now = __vsyscall_gtod_data.clock.vread(); + cycle_delta = (now - __vsyscall_gtod_data.clock.cycle_last) + & __vsyscall_gtod_data.clock.mask; + + /* convert cycles to nsecs */ + nsec_delta = cycle_delta * __vsyscall_gtod_data.clock.mult; + nsec_delta = nsec_delta >> __vsyscall_gtod_data.clock.shift; + + /* add nsec offset to wall_time_tv */ + *tv = __vsyscall_gtod_data.wall_time_tv; + do_div(nsec_delta, NSEC_PER_USEC); + while (nsec_delta > NSEC_PER_SEC) { + tv->tv_sec += 1; + nsec_delta -= NSEC_PER_SEC; + } + tv->tv_usec += ((unsigned long)nsec_delta)/1000; +} + +static inline void do_get_tz(struct timezone *tz) +{ + *tz = __vsyscall_gtod_data.sys_tz; +} + +static int __vsyscall(0) asmlinkage vgettimeofday(struct timeval *tv, struct timezone *tz) +{ + unsigned long seq; + do { + seq = read_seqbegin(&__vsyscall_gtod_lock); + + if (tv) + do_vgettimeofday(tv); + if (tz) + do_get_tz(tz); + + } while (read_seqretry(&__vsyscall_gtod_lock, seq)); + + return 0; +} + +static time_t __vsyscall(1) asmlinkage vtime(time_t * t) +{ + struct timeval tv; + vgettimeofday(&tv,NULL); + if (t) + *t = tv.tv_sec; + return tv.tv_sec; +} + +struct clocksource* curr_clock; + +void update_vsyscall(struct timespec *wall_time, + struct clocksource* clock) +{ + unsigned long flags; + + write_seqlock_irqsave(&vsyscall_gtod_lock, flags); + + /* XXX - hackitty hack hack. this is terrible! 
*/ + if (curr_clock != clock) { + curr_clock = clock; + } + + /* save off wall time as timeval */ + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + + /* copy current clocksource */ + vsyscall_gtod_data.clock = *clock; + + /* save off current timezone */ + vsyscall_gtod_data.sys_tz = sys_tz; + + write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags); + +} +extern char __vsyscall_0; + +static void __init map_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + /* Initially we map the VSYSCALL page w/ PAGE_KERNEL permissions to + * keep the alternate_instruction code from bombing out when it + * changes the seq_lock memory barriers in vgettimeofday() + */ + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL); +} + +static int __init remap_vsyscall(void) +{ + unsigned long physaddr_page0 = (unsigned long) &__vsyscall_0 - PAGE_OFFSET; + + if (!vsyscall_mapped) + return 0; + + /* Remap the VSYSCALL page w/ PAGE_KERNEL_VSYSCALL permissions + * after the alternate_instruction code has run + */ + clear_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE); + __set_fixmap(FIX_VSYSCALL_GTOD_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); + + return 0; +} + +int __init vsyscall_init(void) +{ + printk("VSYSCALL: consistency checks..."); + if ((unsigned long) &vgettimeofday != VSYSCALL_ADDR(__NR_vgettimeofday)) { + printk("vgettimeofday link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if ((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime)) { + printk("vtime link addr broken\n"); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + if (VSYSCALL_ADDR(0) != __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)) { + printk("fixmap first vsyscall 0x%lx should be 0x%x\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE), + VSYSCALL_ADDR(0)); + printk("VSYSCALL: vsyscall_init failed!\n"); + return -EFAULT; + } + + + printk("passed...mapping..."); + map_vsyscall(); + printk("done.\n"); + vsyscall_mapped = 1; + printk("VSYSCALL: fixmap virt addr: 0x%lx\n", + __fix_to_virt(FIX_VSYSCALL_GTOD_FIRST_PAGE)); + + return 0; +} +__initcall(remap_vsyscall); Index: linux/arch/i386/lib/bitops.c =================================================================== --- linux.orig/arch/i386/lib/bitops.c +++ linux/arch/i386/lib/bitops.c @@ -1,5 +1,11 @@ -#include + +#include +#include +#include #include +#include +#include +#include /** * find_next_bit - find the first set bit in a memory region @@ -68,3 +74,39 @@ int find_next_zero_bit(const unsigned lo return (offset + set + res); } EXPORT_SYMBOL(find_next_zero_bit); + + +/* + * rw spinlock fallbacks + */ +#ifdef CONFIG_SMP +asm( +".section .sched.text, \"ax\"\n" +".align 4\n" +".globl __write_lock_failed\n" +"__write_lock_failed:\n\t" + "lock; addl $" RW_LOCK_BIAS_STR ",(%eax)\n" +"1: rep; nop\n\t" + "cmpl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jne 1b\n\t" + "lock; subl $" RW_LOCK_BIAS_STR ",(%eax)\n\t" + "jnz __write_lock_failed\n\t" + "ret\n" +".previous\n" +); + +asm( +".section .sched.text, \"ax\"\n" +".align 4\n" +".globl __read_lock_failed\n" +"__read_lock_failed:\n\t" + "lock; incl (%eax)\n" +"1: rep; nop\n\t" + "cmpl $1,(%eax)\n\t" + "js 1b\n\t" + "lock; decl (%eax)\n\t" + "js __read_lock_failed\n\t" + "ret\n" +".previous\n" +); +#endif Index: linux/arch/i386/mach-default/setup.c =================================================================== --- 
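vgettimeofday() in the new vsyscall-gtod.c leans entirely on the seqlock read protocol: sample the sequence, copy the time data, and retry if a writer was active or slipped in between. A self-contained user-space sketch of just that protocol, using C11 atomics instead of the kernel's seqlock_t:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_uint seq;                 /* even: idle, odd: write in progress */
    static _Atomic long wall_sec, wall_usec;

    static void writer_update(long sec, long usec)
    {
            atomic_fetch_add(&seq, 1);      /* becomes odd */
            atomic_store(&wall_sec, sec);
            atomic_store(&wall_usec, usec);
            atomic_fetch_add(&seq, 1);      /* even again */
    }

    static void reader_get(long *sec, long *usec)
    {
            unsigned s;

            do {
                    while ((s = atomic_load(&seq)) & 1)
                            ;               /* writer active, wait */
                    *sec = atomic_load(&wall_sec);
                    *usec = atomic_load(&wall_usec);
            } while (atomic_load(&seq) != s); /* raced with a writer, retry */
    }

    int main(void)
    {
            long s, u;

            writer_update(1000, 500);
            reader_get(&s, &u);
            printf("%ld.%06ld\n", s, u);
            return 0;
    }

Readers never block the writer, which matters here because the writer is update_vsyscall() running under the timer interrupt.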
linux.orig/arch/i386/mach-default/setup.c +++ linux/arch/i386/mach-default/setup.c @@ -35,7 +35,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; /** * intr_init_hook - post gate setup interrupt initialisation @@ -79,7 +79,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; /** * time_init_hook - do any specific initialisations for the system timer. Index: linux/arch/i386/mach-visws/setup.c =================================================================== --- linux.orig/arch/i386/mach-visws/setup.c +++ linux/arch/i386/mach-visws/setup.c @@ -115,7 +115,7 @@ void __init pre_setup_arch_hook() static struct irqaction irq0 = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer", }; Index: linux/arch/i386/mach-visws/visws_apic.c =================================================================== --- linux.orig/arch/i386/mach-visws/visws_apic.c +++ linux/arch/i386/mach-visws/visws_apic.c @@ -259,11 +259,13 @@ out_unlock: static struct irqaction master_action = { .handler = piix4_master_intr, .name = "PIIX4-8259", + .flags = IRQF_NODELAY, }; static struct irqaction cascade_action = { .handler = no_action, .name = "cascade", + .flags = IRQF_NODELAY, }; Index: linux/arch/i386/mach-voyager/setup.c =================================================================== --- linux.orig/arch/i386/mach-voyager/setup.c +++ linux/arch/i386/mach-voyager/setup.c @@ -18,7 +18,7 @@ void __init pre_intr_init_hook(void) /* * IRQ2 is cascade interrupt to second interrupt controller */ -static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL}; +static struct irqaction irq2 = { no_action, IRQF_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL}; void __init intr_init_hook(void) { @@ -40,7 +40,7 @@ void __init trap_init_hook(void) { } -static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL}; +static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL}; void __init time_init_hook(void) { Index: linux/arch/i386/mm/fault.c =================================================================== --- linux.orig/arch/i386/mm/fault.c +++ linux/arch/i386/mm/fault.c @@ -73,6 +73,9 @@ void bust_spinlocks(int yes) int loglevel_save = console_loglevel; if (yes) { + stop_trace(); + user_trace_stop(); + zap_rt_locks(); oops_in_progress = 1; return; } @@ -325,8 +328,8 @@ static inline int vmalloc_fault(unsigned * bit 3 == 1 means use of reserved bit detected * bit 4 == 1 means fault was an instruction fetch */ -fastcall void __kprobes do_page_fault(struct pt_regs *regs, - unsigned long error_code) +fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs, + unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; @@ -337,6 +340,7 @@ fastcall void __kprobes do_page_fault(st /* get the address */ address = read_cr2(); + trace_special(regs->eip, error_code, address); tsk = current; Index: linux/arch/i386/mm/highmem.c =================================================================== --- 
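IRQF_NODELAY, added to the timer and cascade actions in these hunks, marks handlers that must keep running in hard-interrupt context even when -rt redirects ordinary handlers into IRQ threads. A rough user-space analogy of that dispatch decision (struct action, the nodelay field and the thread hop are inventions of this sketch, not kernel API):

    #include <pthread.h>
    #include <stdio.h>

    struct action {
            void (*handler)(void);
            int nodelay;                    /* ~ IRQF_NODELAY */
    };

    static void timer_fn(void) { puts("ran inline, \"hardirq\" context"); }
    static void disk_fn(void)  { puts("ran from the IRQ thread"); }

    static void *irq_thread(void *arg)
    {
            ((struct action *)arg)->handler();
            return NULL;
    }

    static void dispatch(struct action *a)
    {
            if (a->nodelay) {
                    a->handler();           /* no thread hop allowed */
            } else {
                    pthread_t t;

                    pthread_create(&t, NULL, irq_thread, a);
                    pthread_join(t, NULL);
            }
    }

    int main(void)
    {
            struct action timer = { timer_fn, 1 }, disk = { disk_fn, 0 };

            dispatch(&timer);
            dispatch(&disk);
            return 0;
    }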
linux.orig/arch/i386/mm/highmem.c +++ linux/arch/i386/mm/highmem.c @@ -18,6 +18,26 @@ void kunmap(struct page *page) kunmap_high(page); } +void kunmap_virt(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return; + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + kunmap(page); +} + +struct page *kmap_to_page(void *ptr) +{ + struct page *page; + + if ((unsigned long)ptr < PKMAP_ADDR(0)) + return virt_to_page(ptr); + page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]); + return page; +} + /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -26,7 +46,7 @@ void kunmap(struct page *page) * However when holding an atomic kmap is is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,7 +68,7 @@ void *kmap_atomic(struct page *page, enu return (void*) vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) +void __kunmap_atomic(void *kvaddr, enum km_type type) { #ifdef CONFIG_DEBUG_HIGHMEM unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -78,7 +98,7 @@ void kunmap_atomic(void *kvaddr, enum km /* This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. */ -void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) +void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type) { enum fixed_addresses idx; unsigned long vaddr; @@ -93,7 +113,7 @@ void *kmap_atomic_pfn(unsigned long pfn, return (void*) vaddr; } -struct page *kmap_atomic_to_page(void *ptr) +struct page *__kmap_atomic_to_page(void *ptr) { unsigned long idx, vaddr = (unsigned long)ptr; pte_t *pte; @@ -108,6 +128,7 @@ struct page *kmap_atomic_to_page(void *p EXPORT_SYMBOL(kmap); EXPORT_SYMBOL(kunmap); -EXPORT_SYMBOL(kmap_atomic); -EXPORT_SYMBOL(kunmap_atomic); -EXPORT_SYMBOL(kmap_atomic_to_page); +EXPORT_SYMBOL(kunmap_virt); +EXPORT_SYMBOL(__kmap_atomic); +EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(__kmap_atomic_to_page); Index: linux/arch/i386/mm/init.c =================================================================== --- linux.orig/arch/i386/mm/init.c +++ linux/arch/i386/mm/init.c @@ -45,7 +45,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int noinline do_test_wp_bit(void); @@ -194,7 +194,7 @@ static inline int page_kills_ppro(unsign extern int is_available_memory(efi_memory_desc_t *); -int page_is_ram(unsigned long pagenr) +int notrace page_is_ram(unsigned long pagenr) { int i; unsigned long addr, end; Index: linux/arch/i386/mm/pgtable.c =================================================================== --- linux.orig/arch/i386/mm/pgtable.c +++ linux/arch/i386/mm/pgtable.c @@ -182,7 +182,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c * recommendations and having no core impact whatsoever. 
* -- wli */ -DEFINE_SPINLOCK(pgd_lock); +DEFINE_RAW_SPINLOCK(pgd_lock); struct page *pgd_list; static inline void pgd_list_add(pgd_t *pgd) Index: linux/arch/i386/oprofile/Kconfig =================================================================== --- linux.orig/arch/i386/oprofile/Kconfig +++ linux/arch/i386/oprofile/Kconfig @@ -15,3 +15,6 @@ config OPROFILE If unsure, say N. +config PROFILE_NMI + bool + default y Index: linux/arch/i386/pci/Makefile =================================================================== --- linux.orig/arch/i386/pci/Makefile +++ linux/arch/i386/pci/Makefile @@ -4,8 +4,9 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o obj-$(CONFIG_PCI_MMCONFIG) += mmconfig.o direct.o obj-$(CONFIG_PCI_DIRECT) += direct.o +obj-$(CONFIG_ACPI) += acpi.o + pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o pci-y += legacy.o irq.o pci-$(CONFIG_X86_VISWS) := visws.o fixup.o Index: linux/arch/i386/pci/direct.c =================================================================== --- linux.orig/arch/i386/pci/direct.c +++ linux/arch/i386/pci/direct.c @@ -220,16 +220,23 @@ static int __init pci_check_type1(void) unsigned int tmp; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x01, 0xCFB); tmp = inl(0xCF8); outl(0x80000000, 0xCF8); - if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) { - works = 1; + + if (inl(0xCF8) == 0x80000000) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf1)) + works = 1; + + spin_lock_irqsave(&pci_config_lock, flags); } outl(tmp, 0xCF8); - local_irq_restore(flags); + + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } @@ -239,17 +246,19 @@ static int __init pci_check_type2(void) unsigned long flags; int works = 0; - local_irq_save(flags); + spin_lock_irqsave(&pci_config_lock, flags); outb(0x00, 0xCFB); outb(0x00, 0xCF8); outb(0x00, 0xCFA); - if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 && - pci_sanity_check(&pci_direct_conf2)) { - works = 1; - } - local_irq_restore(flags); + if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) { + spin_unlock_irqrestore(&pci_config_lock, flags); + + if (pci_sanity_check(&pci_direct_conf2)) + works = 1; + } else + spin_unlock_irqrestore(&pci_config_lock, flags); return works; } Index: linux/arch/i386/pci/irq.c =================================================================== --- linux.orig/arch/i386/pci/irq.c +++ linux/arch/i386/pci/irq.c @@ -981,10 +981,6 @@ static void __init pcibios_fixup_irqs(vo pci_name(bridge), 'A' + pin, irq); } if (irq >= 0) { - if (use_pci_vector() && - !platform_legacy_irq(irq)) - irq = IO_APIC_VECTOR(irq); - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; @@ -1169,33 +1165,3 @@ static int pirq_enable_irq(struct pci_de } return 0; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - int next = last; - int offset = (last % 8); - - while (next < FIRST_SYSTEM_VECTOR) { - next += 8; -#ifdef CONFIG_X86_64 - if (next == IA32_SYSCALL_VECTOR) - continue; -#else - if (next == SYSCALL_VECTOR) - continue; -#endif - count++; - if (next >= FIRST_SYSTEM_VECTOR) { - if (offset%8) { - next = FIRST_DEVICE_VECTOR + offset; - offset++; - continue; - } - count--; - } - } - - return count; -} Index: linux/arch/ia64/Kconfig =================================================================== --- linux.orig/arch/ia64/Kconfig +++ linux/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config SWIOTLB config RWSEM_XCHGADD_ALGORITHM bool + depends 
on !PREEMPT_RT default y config GENERIC_FIND_NEXT_BIT @@ -42,7 +43,11 @@ config GENERIC_CALIBRATE_DELAY bool default y -config TIME_INTERPOLATION +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL bool default y @@ -258,6 +263,69 @@ config SMP If you don't know what to do here, say N. + +config GENERIC_TIME + bool + default y + +config HIGH_RES_TIMERS + bool "High-Resolution Timers" + help + + POSIX timers are available by default. This option enables + high-resolution POSIX timers. With this option the resolution + is at least 1 microsecond. High resolution is not free. If + enabled this option will add a small overhead each time a + timer expires that is not on a 1/HZ tick boundary. If no such + timers are used the overhead is nil. + + This option enables two additional POSIX CLOCKS, + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Note that this + option does not change the resolution of CLOCK_REALTIME or + CLOCK_MONOTONIC which remain at 1/HZ resolution. + +config HIGH_RES_RESOLUTION + int "High-Resolution-Timer resolution (nanoseconds)" + depends on HIGH_RES_TIMERS + default 1000 + help + + This sets the resolution of timers accessed with + CLOCK_REALTIME_HR and CLOCK_MONOTONIC_HR. Too + fine a resolution (small a number) will usually not + be observable due to normal system latencies. For an + 800 MHZ processor about 10,000 is the recommended maximum + (smallest number). If you don't need that sort of resolution, + higher numbers may generate less overhead. + +choice + prompt "Clock source" + depends on HIGH_RES_TIMERS + default HIGH_RES_TIMER_ITC + help + This option allows you to choose the hardware source in charge + of generating high precision interruptions on your system. + On IA-64 these are: + + + ITC Interval Time Counter 1/CPU clock + HPET High Precision Event Timer ~ (XXX:have to check the spec) + + The ITC timer is available on all the ia64 computers because + it is integrated directly into the processor. However it may not + give correct results on MP machines with processors running + at different clock rates. In this case you may want to use + the HPET if available on your machine. + + +config HIGH_RES_TIMER_ITC + bool "Interval Time Counter/ITC" + +config HIGH_RES_TIMER_HPET + bool "High Precision Event Timer/HPET" + +endchoice + config NR_CPUS int "Maximum number of CPUs (2-1024)" range 2 1024 @@ -310,17 +378,15 @@ config FORCE_CPEI_RETARGET This option it useful to enable this feature on older BIOS's as well. You can also enable this by using boot command line option force_cpei=1. -config PREEMPT - bool "Preemptible Kernel" - help - This option reduces the latency of the kernel when reacting to - real-time or interactive events by allowing a low priority process to - be preempted even if it is in kernel mode executing a system call. - This allows applications to run more reliably even when the system is - under load. +source "kernel/Kconfig.preempt" - Say Y here if you are building a kernel for a desktop, embedded - or real-time system. Say N if you are unsure. 
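The clocksource conversions in this patch (the i386 TSC earlier, cyclone, the ITC and the SN2 RTC below) all leave .mult zero and compute it via clocksource_hz2mult() at registration, so that (cycles * mult) >> shift comes out in nanoseconds. A user-space sketch of that arithmetic, with an assumed 1 GHz counter and an illustrative shift:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_SEC 1000000000ULL

    /* mirrors the rounding clocksource_hz2mult() performs */
    static uint32_t hz2mult(uint32_t hz, uint32_t shift)
    {
            uint64_t tmp = (NSEC_PER_SEC << shift) + hz / 2;

            return (uint32_t)(tmp / hz);
    }

    /* the hot-path conversion; the 64-bit multiply can overflow for
     * very large cycle deltas, so deltas are kept short */
    static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
    {
            return (cycles * mult) >> shift;
    }

    int main(void)
    {
            uint32_t shift = 22;                            /* like clocksource_tsc */
            uint32_t mult = hz2mult(1000000000u, shift);    /* assumed 1 GHz */

            printf("mult=%u, 1234 cycles = %llu ns\n", mult,
                   (unsigned long long)cyc2ns(1234, mult, shift));
            return 0;
    }

A bigger shift buys precision in mult at the cost of overflow headroom in the multiply, which is why the slower memory-mapped counters in this patch use smaller shifts than the TSC.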
+config RWSEM_GENERIC_SPINLOCK + bool + depends on PREEMPT_RT + default y + +config PREEMPT + def_bool y if (PREEMPT_RT || PREEMPT_SOFTIRQS || PREEMPT_HARDIRQS || PREEMPT_VOLUNTARY || PREEMPT_DESKTOP) source "mm/Kconfig" Index: linux/arch/ia64/configs/bigsur_defconfig =================================================================== --- linux.orig/arch/ia64/configs/bigsur_defconfig +++ linux/arch/ia64/configs/bigsur_defconfig @@ -85,7 +85,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/gensparse_defconfig =================================================================== --- linux.orig/arch/ia64/configs/gensparse_defconfig +++ linux/arch/ia64/configs/gensparse_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/sim_defconfig =================================================================== --- linux.orig/arch/ia64/configs/sim_defconfig +++ linux/arch/ia64/configs/sim_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/sn2_defconfig =================================================================== --- linux.orig/arch/ia64/configs/sn2_defconfig +++ linux/arch/ia64/configs/sn2_defconfig @@ -83,7 +83,7 @@ CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_DMI=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y Index: linux/arch/ia64/configs/tiger_defconfig =================================================================== --- linux.orig/arch/ia64/configs/tiger_defconfig +++ linux/arch/ia64/configs/tiger_defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/configs/zx1_defconfig =================================================================== --- linux.orig/arch/ia64/configs/zx1_defconfig +++ linux/arch/ia64/configs/zx1_defconfig @@ -84,7 +84,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/defconfig =================================================================== --- linux.orig/arch/ia64/defconfig +++ linux/arch/ia64/defconfig @@ -86,7 +86,7 @@ CONFIG_MMU=y CONFIG_SWIOTLB=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_EFI=y CONFIG_GENERIC_IOMAP=y CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y Index: linux/arch/ia64/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/ia64/kernel/asm-offsets.c +++ linux/arch/ia64/kernel/asm-offsets.c @@ -7,6 +7,7 @@ #define ASM_OFFSETS_C 
1 #include +#include #include #include @@ -254,18 +255,13 @@ void foo(void) offsetof (struct pal_min_state_area_s, pmsa_xip)); BLANK(); +#ifdef CONFIG_TIME_INTERPOLATION /* used by fsys_gettimeofday in arch/ia64/kernel/fsys.S */ - DEFINE(IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET, offsetof (struct time_interpolator, addr)); - DEFINE(IA64_TIME_INTERPOLATOR_SOURCE_OFFSET, offsetof (struct time_interpolator, source)); - DEFINE(IA64_TIME_INTERPOLATOR_SHIFT_OFFSET, offsetof (struct time_interpolator, shift)); - DEFINE(IA64_TIME_INTERPOLATOR_NSEC_OFFSET, offsetof (struct time_interpolator, nsec_per_cyc)); - DEFINE(IA64_TIME_INTERPOLATOR_OFFSET_OFFSET, offsetof (struct time_interpolator, offset)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET, offsetof (struct time_interpolator, last_cycle)); - DEFINE(IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET, offsetof (struct time_interpolator, last_counter)); - DEFINE(IA64_TIME_INTERPOLATOR_JITTER_OFFSET, offsetof (struct time_interpolator, jitter)); - DEFINE(IA64_TIME_INTERPOLATOR_MASK_OFFSET, offsetof (struct time_interpolator, mask)); - DEFINE(IA64_TIME_SOURCE_CPU, TIME_SOURCE_CPU); - DEFINE(IA64_TIME_SOURCE_MMIO64, TIME_SOURCE_MMIO64); - DEFINE(IA64_TIME_SOURCE_MMIO32, TIME_SOURCE_MMIO32); DEFINE(IA64_TIMESPEC_TV_NSEC_OFFSET, offsetof (struct timespec, tv_nsec)); + DEFINE(IA64_CLOCKSOURCE_MASK_OFFSET, offsetof (struct clocksource, mask)); + DEFINE(IA64_CLOCKSOURCE_MULT_OFFSET, offsetof (struct clocksource, mult)); + DEFINE(IA64_CLOCKSOURCE_SHIFT_OFFSET, offsetof (struct clocksource, shift)); + DEFINE(IA64_CLOCKSOURCE_MMIO_PTR_OFFSET, offsetof (struct clocksource, fsys_mmio_ptr)); + DEFINE(IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET, offsetof (struct clocksource, cycle_last)); +#endif } Index: linux/arch/ia64/kernel/cyclone.c =================================================================== --- linux.orig/arch/ia64/kernel/cyclone.c +++ linux/arch/ia64/kernel/cyclone.c @@ -3,6 +3,7 @@ #include #include #include +#include #include /* IBM Summit (EXA) Cyclone counter code*/ @@ -18,13 +19,21 @@ void __init cyclone_setup(void) use_cyclone = 1; } +static void __iomem *cyclone_mc_ptr; -struct time_interpolator cyclone_interpolator = { - .source = TIME_SOURCE_MMIO64, - .shift = 16, - .frequency = CYCLONE_TIMER_FREQ, - .drift = -100, - .mask = (1LL << 40) - 1 +static cycle_t read_cyclone(void) +{ + return (cycle_t)readq((void __iomem *)cyclone_mc_ptr); +} + +static struct clocksource clocksource_cyclone = { + .name = "cyclone", + .rating = 300, + .read = read_cyclone, + .mask = (1LL << 40) - 1, + .mult = 0, /* to be calculated */ + .shift = 16, + .is_continuous = 1, }; int __init init_cyclone_clock(void) @@ -101,8 +110,10 @@ int __init init_cyclone_clock(void) } } /* initialize last tick */ - cyclone_interpolator.addr = cyclone_timer; - register_time_interpolator(&cyclone_interpolator); + clocksource_cyclone.fsys_mmio_ptr = cyclone_mc_ptr = cyclone_timer; + clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ, + clocksource_cyclone.shift); + clocksource_register(&clocksource_cyclone); return 0; } Index: linux/arch/ia64/kernel/entry.S =================================================================== --- linux.orig/arch/ia64/kernel/entry.S +++ linux/arch/ia64/kernel/entry.S @@ -1101,23 +1101,24 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? + tbit.nz p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched!=0? 
+(p6) br.cond.sptk.few .needresched + tbit.z p6,p0=r31,TIF_NEED_RESCHED_DELAYED // current_thread_info()->need_resched_delayed==0? (p6) br.cond.sptk.few .notify -#ifdef CONFIG_PREEMPT -(pKStk) dep r21=-1,r0,PREEMPT_ACTIVE_BIT,1 + +.needresched: + +(pKStk) br.cond.sptk.many .fromkernel ;; -(pKStk) st4 [r20]=r21 ssm psr.i // enable interrupts -#endif br.call.spnt.many rp=schedule -.ret9: cmp.eq p6,p0=r0,r0 // p6 <- 1 - rsm psr.i // disable interrupts - ;; -#ifdef CONFIG_PREEMPT -(pKStk) adds r20=TI_PRE_COUNT+IA64_TASK_SIZE,r13 +.ret9a: rsm psr.i // disable interrupts ;; -(pKStk) st4 [r20]=r0 // preempt_count() <- 0 -#endif + br.cond.sptk.many .endpreemptdep +.fromkernel: + br.call.spnt.many rp=preempt_schedule_irq +.ret9b: rsm psr.i // disable interrupts +.endpreemptdep: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // re-check Index: linux/arch/ia64/kernel/fsys.S =================================================================== --- linux.orig/arch/ia64/kernel/fsys.S +++ linux/arch/ia64/kernel/fsys.S @@ -24,6 +24,7 @@ #include "entry.h" +#ifdef CONFIG_TIME_INTERPOLATION /* * See Documentation/ia64/fsys.txt for details on fsyscalls. * @@ -145,13 +146,6 @@ ENTRY(fsys_set_tid_address) FSYS_RETURN END(fsys_set_tid_address) -/* - * Ensure that the time interpolator structure is compatible with the asm code - */ -#if IA64_TIME_INTERPOLATOR_SOURCE_OFFSET !=0 || IA64_TIME_INTERPOLATOR_SHIFT_OFFSET != 2 \ - || IA64_TIME_INTERPOLATOR_JITTER_OFFSET != 3 || IA64_TIME_INTERPOLATOR_NSEC_OFFSET != 4 -#error fsys_gettimeofday incompatible with changes to struct time_interpolator -#endif #define CLOCK_REALTIME 0 #define CLOCK_MONOTONIC 1 #define CLOCK_DIVIDE_BY_1000 0x4000 @@ -177,19 +171,18 @@ ENTRY(fsys_gettimeofday) // r11 = preserved: saved ar.pfs // r12 = preserved: memory stack // r13 = preserved: thread pointer - // r14 = address of mask / mask + // r14 = address of mask / mask value // r15 = preserved: system call number // r16 = preserved: current task pointer // r17 = wall to monotonic use - // r18 = time_interpolator->offset - // r19 = address of wall_to_monotonic - // r20 = pointer to struct time_interpolator / pointer to time_interpolator->address - // r21 = shift factor - // r22 = address of time interpolator->last_counter - // r23 = address of time_interpolator->last_cycle - // r24 = adress of time_interpolator->offset - // r25 = last_cycle value - // r26 = last_counter value + // r19 = address of itc_lastcycle + // r20 = struct clocksource / address of first element + // r21 = shift value + // r22 = address of itc_jitter/ wall_to_monotonic + // r23 = address of shift + // r24 = address mult factor / cycle_last value + // r25 = itc_lastcycle value + // r26 = address clocksource cycle_last // r27 = pointer to xtime // r28 = sequence number at the beginning of critcal section // r29 = address of seqlock @@ -199,9 +192,9 @@ ENTRY(fsys_gettimeofday) // p6,p7 short term use // p8 = timesource ar.itc // p9 = timesource mmio64 - // p10 = timesource mmio32 + // p10 = timesource mmio32 - not used // p11 = timesource not to be handled by asm code - // p12 = memory time source ( = p9 | p10) + // p12 = memory time source ( = p9 | p10) - not used // p13 = do cmpxchg with time_interpolator_last_cycle // p14 = Divide by 1000 // p15 = Add monotonic @@ -212,61 +205,55 @@ ENTRY(fsys_gettimeofday) tnat.nz p6,p0 = r31 // branch deferred since it does not fit into bundle structure mov pr = r30,0xc000 // Set predicates according to function add r2 = 
TI_FLAGS+IA64_TASK_SIZE,r16 - movl r20 = time_interpolator + movl r20 = fsyscall_clock // load fsyscall clocksource address ;; - ld8 r20 = [r20] // get pointer to time_interpolator structure + add r10 = IA64_CLOCKSOURCE_MMIO_PTR_OFFSET,r20 movl r29 = xtime_lock ld4 r2 = [r2] // process work pending flags movl r27 = xtime ;; // only one bundle here - ld8 r21 = [r20] // first quad with control information + add r14 = IA64_CLOCKSOURCE_MASK_OFFSET,r20 + movl r22 = itc_jitter + add r24 = IA64_CLOCKSOURCE_MULT_OFFSET,r20 and r2 = TIF_ALLWORK_MASK,r2 (p6) br.cond.spnt.few .fail_einval // deferred branch ;; - add r10 = IA64_TIME_INTERPOLATOR_ADDRESS_OFFSET,r20 - extr r3 = r21,32,32 // time_interpolator->nsec_per_cyc - extr r8 = r21,0,16 // time_interpolator->source + ld8 r30 = [r10] // clocksource->mmio_ptr + movl r19 = itc_lastcycle + add r23 = IA64_CLOCKSOURCE_SHIFT_OFFSET,r20 cmp.ne p6, p0 = 0, r2 // Fallback if work is scheduled (p6) br.cond.spnt.many fsys_fallback_syscall ;; - cmp.eq p8,p12 = 0,r8 // Check for cpu timer - cmp.eq p9,p0 = 1,r8 // MMIO64 ? - extr r2 = r21,24,8 // time_interpolator->jitter - cmp.eq p10,p0 = 2,r8 // MMIO32 ? - cmp.ltu p11,p0 = 2,r8 // function or other clock -(p11) br.cond.spnt.many fsys_fallback_syscall - ;; - setf.sig f7 = r3 // Setup for scaling of counter -(p15) movl r19 = wall_to_monotonic -(p12) ld8 r30 = [r10] - cmp.ne p13,p0 = r2,r0 // need jitter compensation? - extr r21 = r21,16,8 // shift factor + ld8 r14 = [r14] // clocksource mask value + ld4 r2 = [r22] // itc_jitter value + add r26 = IA64_CLOCKSOURCE_CYCLE_LAST_OFFSET,r20 // clock fsyscall_cycle_last + ld4 r3 = [r24] // clocksource->mult value + cmp.eq p8,p9 = 0,r30 // Check for cpu timer, no mmio_ptr, set p8, clear p9 + ;; + setf.sig f7 = r3 // Setup for mult scaling of counter +(p15) movl r22 = wall_to_monotonic + ld4 r21 = [r23] // shift value +(p8) cmp.ne p13,p0 = r2,r0 // need jitter compensation, set p13 +(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control ;; .time_redo: .pred.rel.mutex p8,p9,p10 ld4.acq r28 = [r29] // xtime_lock.sequence. Must come first for locking purposes (p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!! - add r22 = IA64_TIME_INTERPOLATOR_LAST_COUNTER_OFFSET,r20 (p9) ld8 r2 = [r30] // readq(ti->address). Could also have latency issues.. -(p10) ld4 r2 = [r30] // readw(ti->address) -(p13) add r23 = IA64_TIME_INTERPOLATOR_LAST_CYCLE_OFFSET,r20 +(p13) ld8 r25 = [r19] // get itc_lastcycle value ;; // could be removed by moving the last add upward - ld8 r26 = [r22] // time_interpolator->last_counter -(p13) ld8 r25 = [r23] // time interpolator->last_cycle - add r24 = IA64_TIME_INTERPOLATOR_OFFSET_OFFSET,r20 -(p15) ld8 r17 = [r19],IA64_TIMESPEC_TV_NSEC_OFFSET ld8 r9 = [r27],IA64_TIMESPEC_TV_NSEC_OFFSET - add r14 = IA64_TIME_INTERPOLATOR_MASK_OFFSET, r20 + ld8 r24 = [r26] // get fsyscall_cycle_last value +(p15) ld8 r17 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET ;; - ld8 r18 = [r24] // time_interpolator->offset ld8 r8 = [r27],-IA64_TIMESPEC_TV_NSEC_OFFSET // xtime.tv_nsec -(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) +(p13) sub r3 = r25,r2 // Diff needed before comparison (thanks davidm) ;; - ld8 r14 = [r14] // time_interpolator->mask -(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. p6,p7 cleared - sub r10 = r2,r26 // current_counter - last_counter +(p13) cmp.gt.unc p6,p7 = r3,r0 // check if it is less than last. 
p6,p7 cleared + sub r10 = r2,r24 // current_counter - last_counter ;; -(p6) sub r10 = r25,r26 // time we got was less than last_cycle +(p6) sub r10 = r25,r24 // time we got was less than last_cycle (p7) mov ar.ccv = r25 // more than last_cycle. Prep for cmpxchg ;; and r10 = r10,r14 // Apply mask @@ -274,22 +261,21 @@ ENTRY(fsys_gettimeofday) setf.sig f8 = r10 nop.i 123 ;; -(p7) cmpxchg8.rel r3 = [r23],r2,ar.ccv +(p7) cmpxchg8.rel r3 = [r19],r2,ar.ccv EX(.fail_efault, probe.w.fault r31, 3) // This takes 5 cycles and we have spare time xmpy.l f8 = f8,f7 // nsec_per_cyc*(counter-last_counter) (p15) add r9 = r9,r17 // Add wall to monotonic.secs to result secs ;; -(p15) ld8 r17 = [r19],-IA64_TIMESPEC_TV_NSEC_OFFSET +(p15) ld8 r17 = [r22],-IA64_TIMESPEC_TV_NSEC_OFFSET (p7) cmp.ne p7,p0 = r25,r3 // if cmpxchg not successful redo // simulate tbit.nz.or p7,p0 = r28,0 and r28 = ~1,r28 // Make sequence even to force retry if odd getf.sig r2 = f8 mf - add r8 = r8,r18 // Add time interpolator offset ;; ld4 r10 = [r29] // xtime_lock.sequence (p15) add r8 = r8, r17 // Add monotonic.nsecs to nsecs - shr.u r2 = r2,r21 + shr.u r2 = r2,r21 // shift by factor ;; // overloaded 3 bundles! // End critical section. add r8 = r8,r2 // Add xtime.nsecs @@ -348,6 +334,26 @@ ENTRY(fsys_clock_gettime) br.many .gettime END(fsys_clock_gettime) + +#else // !CONFIG_TIME_INTERPOLATION + +# define fsys_gettimeofday 0 +# define fsys_clock_gettime 0 + +.fail_einval: + mov r8 = EINVAL + mov r10 = -1 + FSYS_RETURN + +.fail_efault: + mov r8 = EFAULT + mov r10 = -1 + FSYS_RETURN + +#endif + + + /* * long fsys_rt_sigprocmask (int how, sigset_t *set, sigset_t *oset, size_t sigsetsize). */ Index: linux/arch/ia64/kernel/iosapic.c =================================================================== --- linux.orig/arch/ia64/kernel/iosapic.c +++ linux/arch/ia64/kernel/iosapic.c @@ -112,7 +112,7 @@ (PAGE_SIZE / sizeof(struct iosapic_rte_info)) #define RTE_PREALLOCATED (1) -static DEFINE_SPINLOCK(iosapic_lock); +static DEFINE_RAW_SPINLOCK(iosapic_lock); /* * These tables map IA-64 vectors to the IOSAPIC pin that generates this @@ -409,6 +409,34 @@ iosapic_startup_level_irq (unsigned int return 0; } +/* + * In the preemptible case mask the IRQ first then handle it and ack it. 
+ */ +#ifdef CONFIG_PREEMPT_HARDIRQS + +static void +iosapic_ack_level_irq (unsigned int irq) +{ + ia64_vector vec = irq_to_vector(irq); + struct iosapic_rte_info *rte; + + move_irq(irq); + mask_irq(irq); + list_for_each_entry(rte, &iosapic_intr_info[vec].rtes, rte_list) + iosapic_eoi(rte->addr, vec); +} + +static void +iosapic_end_level_irq (unsigned int irq) +{ + if (!(irq_desc[irq].status & IRQ_INPROGRESS)) + unmask_irq(irq); +} + +#else /* !CONFIG_PREEMPT_HARDIRQS */ + +#define iosapic_ack_level_irq nop + static void iosapic_end_level_irq (unsigned int irq) { @@ -420,10 +448,12 @@ iosapic_end_level_irq (unsigned int irq) iosapic_eoi(rte->addr, vec); } + +#endif + #define iosapic_shutdown_level_irq mask_irq #define iosapic_enable_level_irq unmask_irq #define iosapic_disable_level_irq mask_irq -#define iosapic_ack_level_irq nop struct hw_interrupt_type irq_type_iosapic_level = { .typename = "IO-SAPIC-level", Index: linux/arch/ia64/kernel/irq_ia64.c =================================================================== --- linux.orig/arch/ia64/kernel/irq_ia64.c +++ linux/arch/ia64/kernel/irq_ia64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -105,6 +106,25 @@ reserve_irq_vector (int vector) return test_and_set_bit(pos, ia64_vector_mask); } +/* + * Dynamic irq allocate and deallocation for MSI + */ +int create_irq(void) +{ + int vector = assign_irq_vector(AUTO_ASSIGN); + + if (vector >= 0) + dynamic_irq_init(vector); + + return vector; +} + +void destroy_irq(unsigned int irq) +{ + dynamic_irq_cleanup(irq); + free_irq_vector(irq); +} + #ifdef CONFIG_SMP # define IS_RESCHEDULE(vec) (vec == IA64_IPI_RESCHEDULE) #else Index: linux/arch/ia64/kernel/mca.c =================================================================== --- linux.orig/arch/ia64/kernel/mca.c +++ linux/arch/ia64/kernel/mca.c @@ -152,7 +152,7 @@ ia64_mca_spin(const char *func) typedef struct ia64_state_log_s { - spinlock_t isl_lock; + raw_spinlock_t isl_lock; int isl_index; unsigned long isl_count; ia64_err_rec_t *isl_log[IA64_MAX_LOGS]; /* need space to store header + error log */ Index: linux/arch/ia64/kernel/perfmon.c =================================================================== --- linux.orig/arch/ia64/kernel/perfmon.c +++ linux/arch/ia64/kernel/perfmon.c @@ -277,7 +277,7 @@ typedef struct { */ typedef struct pfm_context { - spinlock_t ctx_lock; /* context protection */ + raw_spinlock_t ctx_lock; /* context protection */ pfm_context_flags_t ctx_flags; /* bitmask of flags (block reason incl.) 
*/ unsigned int ctx_state; /* state: active/inactive (no bitfield) */ @@ -363,7 +363,7 @@ typedef struct pfm_context { * mostly used to synchronize between system wide and per-process */ typedef struct { - spinlock_t pfs_lock; /* lock the structure */ + raw_spinlock_t pfs_lock; /* lock the structure */ unsigned int pfs_task_sessions; /* number of per task sessions */ unsigned int pfs_sys_sessions; /* number of per system wide sessions */ @@ -504,7 +504,7 @@ static pfm_intr_handler_desc_t *pfm_alt static struct proc_dir_entry *perfmon_dir; static pfm_uuid_t pfm_null_uuid = {0,}; -static spinlock_t pfm_buffer_fmt_lock; +static raw_spinlock_t pfm_buffer_fmt_lock; static LIST_HEAD(pfm_buffer_fmt_list); static pmu_config_t *pmu_conf; Index: linux/arch/ia64/kernel/process.c =================================================================== --- linux.orig/arch/ia64/kernel/process.c +++ linux/arch/ia64/kernel/process.c @@ -96,6 +96,9 @@ show_stack (struct task_struct *task, un void dump_stack (void) { + if (irqs_disabled()) { + printk("Uh oh.. entering dump_stack() with irqs disabled.\n"); + } show_stack(NULL, NULL); } @@ -199,7 +202,7 @@ void default_idle (void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { if (can_do_pal_halt) safe_halt(); else @@ -275,7 +278,7 @@ cpu_idle (void) else current_thread_info()->status |= TS_POLLING; - if (!need_resched()) { + if (!need_resched() && !need_resched_delayed()) { void (*idle)(void); #ifdef CONFIG_SMP min_xtp(); @@ -297,10 +300,11 @@ cpu_idle (void) normal_xtp(); #endif } - preempt_enable_no_resched(); - schedule(); + __preempt_enable_no_resched(); + __schedule(); + preempt_disable(); - check_pgt_cache(); + if (cpu_is_offline(cpu)) play_dead(); } Index: linux/arch/ia64/kernel/sal.c =================================================================== --- linux.orig/arch/ia64/kernel/sal.c +++ linux/arch/ia64/kernel/sal.c @@ -18,7 +18,7 @@ #include #include - __cacheline_aligned DEFINE_SPINLOCK(sal_lock); + __cacheline_aligned DEFINE_RAW_SPINLOCK(sal_lock); unsigned long sal_platform_features; unsigned short sal_revision; Index: linux/arch/ia64/kernel/salinfo.c =================================================================== --- linux.orig/arch/ia64/kernel/salinfo.c +++ linux/arch/ia64/kernel/salinfo.c @@ -141,7 +141,7 @@ enum salinfo_state { struct salinfo_data { cpumask_t cpu_event; /* which cpus have outstanding events */ - struct semaphore mutex; + struct compat_semaphore mutex; u8 *log_buffer; u64 log_size; u8 *oemdata; /* decoded oem data */ @@ -157,8 +157,8 @@ struct salinfo_data { static struct salinfo_data salinfo_data[ARRAY_SIZE(salinfo_log_name)]; -static DEFINE_SPINLOCK(data_lock); -static DEFINE_SPINLOCK(data_saved_lock); +static DEFINE_RAW_SPINLOCK(data_lock); +static DEFINE_RAW_SPINLOCK(data_saved_lock); /** salinfo_platform_oemdata - optional callback to decode oemdata from an error * record. 
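The steady spinlock_t to raw_spinlock_t conversions through this section (die(), iosapic_lock, sal_lock, the salinfo and perfmon locks) all follow one rule: under PREEMPT_RT a plain spinlock turns into a sleeping, priority-inheriting lock, so any context that genuinely cannot sleep must use the raw variant; likewise semaphores waited on from such paths become compat_semaphore. A loose pthreads analogy of the two behaviors, not kernel API:

    #include <pthread.h>
    #include <stdio.h>

    /* ~ spinlock_t on -rt: a blocking lock, the owner may sleep and
     * waiters are put to sleep too */
    static pthread_mutex_t sleeping_lock = PTHREAD_MUTEX_INITIALIZER;

    /* ~ raw_spinlock_t: waiters burn CPU until the owner is done */
    static pthread_spinlock_t raw_lock;

    int main(void)
    {
            pthread_spin_init(&raw_lock, PTHREAD_PROCESS_PRIVATE);

            pthread_mutex_lock(&sleeping_lock);
            puts("preemptible critical section");
            pthread_mutex_unlock(&sleeping_lock);

            pthread_spin_lock(&raw_lock);
            puts("non-sleeping critical section");
            pthread_spin_unlock(&raw_lock);

            pthread_spin_destroy(&raw_lock);
            return 0;
    }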
Index: linux/arch/ia64/kernel/semaphore.c =================================================================== --- linux.orig/arch/ia64/kernel/semaphore.c +++ linux/arch/ia64/kernel/semaphore.c @@ -40,12 +40,12 @@ */ void -__up (struct semaphore *sem) +__up (struct compat_semaphore *sem) { wake_up(&sem->wait); } -void __sched __down (struct semaphore *sem) +void __sched __down (struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +82,7 @@ void __sched __down (struct semaphore *s tsk->state = TASK_RUNNING; } -int __sched __down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -142,7 +142,7 @@ int __sched __down_interruptible (struct * count. */ int -__down_trylock (struct semaphore *sem) +__down_trylock (struct compat_semaphore *sem) { unsigned long flags; int sleepers; Index: linux/arch/ia64/kernel/signal.c =================================================================== --- linux.orig/arch/ia64/kernel/signal.c +++ linux/arch/ia64/kernel/signal.c @@ -487,6 +487,14 @@ ia64_do_signal (sigset_t *oldset, struct long errno = scr->pt.r8; # define ERR_CODE(c) (IS_IA32_PROCESS(&scr->pt) ? -(c) : (c)) +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif + /* * In the ia64_leave_kernel code path, we want the common case to go fast, which * is why we may in certain cases get here from kernel mode. Just return without Index: linux/arch/ia64/kernel/smp.c =================================================================== --- linux.orig/arch/ia64/kernel/smp.c +++ linux/arch/ia64/kernel/smp.c @@ -222,6 +222,22 @@ smp_send_reschedule (int cpu) platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != smp_processor_id()) + platform_send_ipi(cpu, IA64_IPI_RESCHEDULE, IA64_IPI_DM_INT, 0); + } +} + + void smp_flush_tlb_all (void) { Index: linux/arch/ia64/kernel/smpboot.c =================================================================== --- linux.orig/arch/ia64/kernel/smpboot.c +++ linux/arch/ia64/kernel/smpboot.c @@ -371,6 +371,8 @@ smp_setup_percpu_timer (void) { } +extern void register_itc_clockevent(void); + static void __devinit smp_callin (void) { @@ -430,6 +432,7 @@ smp_callin (void) #ifdef CONFIG_IA32_SUPPORT ia32_gdt_init(); #endif + register_itc_clockevent(); /* * Allow the master to continue. 
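smp_send_reschedule_allbutself() in the ia64 hunk above is a filtered broadcast: walk the online CPUs and skip yourself. Its shape in plain C, with NR_CPUS and send_ipi() invented for the sketch:

    #include <stdio.h>

    #define NR_CPUS 4

    static void send_ipi(int cpu)
    {
            printf("reschedule IPI -> cpu %d\n", cpu);
    }

    static void send_allbutself(int self)
    {
            for (int cpu = 0; cpu < NR_CPUS; cpu++)
                    if (cpu != self)
                            send_ipi(cpu);
    }

    int main(void)
    {
            send_allbutself(2);     /* pretend we run on cpu 2 */
            return 0;
    }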
Index: linux/arch/ia64/kernel/time.c =================================================================== --- linux.orig/arch/ia64/kernel/time.c +++ linux/arch/ia64/kernel/time.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,10 @@ extern unsigned long wall_jiffies; +static cycle_t itc_get_cycles(void); +cycle_t itc_lastcycle __attribute__((aligned(L1_CACHE_BYTES))); +int itc_jitter __attribute__((aligned(L1_CACHE_BYTES))); + volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -40,11 +45,16 @@ EXPORT_SYMBOL(last_cli_ip); #endif -static struct time_interpolator itc_interpolator = { - .shift = 16, - .mask = 0xffffffffffffffffLL, - .source = TIME_SOURCE_CPU +static struct clocksource clocksource_itc = { + .name = "itc", + .rating = 350, + .read = itc_get_cycles, + .mask = 0xffffffffffffffffLL, + .mult = 0, /*to be caluclated*/ + .shift = 16, + .is_continuous = 1, }; +static struct clocksource *clocksource_itc_p; static irqreturn_t timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) @@ -57,38 +67,57 @@ timer_interrupt (int irq, void *dev_id, platform_timer_interrupt(irq, dev_id, regs); +#if 0 new_itm = local_cpu_data->itm_next; if (!time_after(ia64_get_itc(), new_itm)) printk(KERN_ERR "Oops: timer tick before it's due (itc=%lx,itm=%lx)\n", ia64_get_itc(), new_itm); - profile_tick(CPU_PROFILING, regs); +#endif - while (1) { - update_process_times(user_mode(regs)); + if (time_after(ia64_get_itc(), local_cpu_data->itm_tick_next)) { - new_itm += local_cpu_data->itm_delta; + unsigned long new_tick_itm; + new_tick_itm = local_cpu_data->itm_tick_next; - if (smp_processor_id() == time_keeper_id) { - /* - * Here we are in the timer irq handler. We have irqs locally - * disabled, but we don't know if the timer_bh is running on - * another CPU. We need to avoid to SMP race by acquiring the - * xtime_lock. - */ - write_seqlock(&xtime_lock); - do_timer(regs); - local_cpu_data->itm_next = new_itm; - write_sequnlock(&xtime_lock); - } else - local_cpu_data->itm_next = new_itm; + profile_tick(CPU_PROFILING, regs); + + while (1) { + update_process_times(user_mode(regs)); + + new_tick_itm += local_cpu_data->itm_tick_delta; + + if (smp_processor_id() == time_keeper_id) { + /* + * Here we are in the timer irq handler. We have irqs locally + * disabled, but we don't know if the timer_bh is running on + * another CPU. We need to avoid to SMP race by acquiring the + * xtime_lock. + */ + write_seqlock(&xtime_lock); + do_timer(regs); + local_cpu_data->itm_tick_next = new_tick_itm; + write_sequnlock(&xtime_lock); + } else + local_cpu_data->itm_tick_next = new_tick_itm; + + if (time_after(new_tick_itm, ia64_get_itc())) + break; + } + } - if (time_after(new_itm, ia64_get_itc())) - break; + if (time_after(ia64_get_itc(), local_cpu_data->itm_timer_next)) { + if (itc_clockevent.event_handler) + itc_clockevent.event_handler(regs); } do { + // FIXME, really, please + new_itm = local_cpu_data->itm_tick_next; + + if (time_after(new_itm, local_cpu_data->itm_timer_next)) + new_itm = local_cpu_data->itm_timer_next; /* * If we're too close to the next clock tick for * comfort, we increase the safety margin by @@ -98,8 +127,8 @@ timer_interrupt (int irq, void *dev_id, * too fast (with the potentially devastating effect * of losing monotony of time). 
*/ - while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_delta/2)) - new_itm += local_cpu_data->itm_delta; + while (!time_after(new_itm, ia64_get_itc() + local_cpu_data->itm_tick_delta/2)) + new_itm += local_cpu_data->itm_tick_delta; ia64_set_itm(new_itm); /* double check, in case we got hit by a (slow) PMI: */ } while (time_after_eq(ia64_get_itc(), new_itm)); @@ -118,7 +147,7 @@ ia64_cpu_local_tick (void) /* arrange for the cycle counter to generate a timer interrupt: */ ia64_set_itv(IA64_TIMER_VECTOR); - delta = local_cpu_data->itm_delta; + delta = local_cpu_data->itm_tick_delta; /* * Stagger the timer tick for each CPU so they don't occur all at (almost) the * same time: @@ -127,8 +156,8 @@ ia64_cpu_local_tick (void) unsigned long hi = 1UL << ia64_fls(cpu); shift = (2*(cpu - hi) + 1) * delta/hi/2; } - local_cpu_data->itm_next = ia64_get_itc() + delta + shift; - ia64_set_itm(local_cpu_data->itm_next); + local_cpu_data->itm_tick_next = ia64_get_itc() + delta + shift; + ia64_set_itm(local_cpu_data->itm_tick_next); } static int nojitter; @@ -186,7 +215,7 @@ ia64_init_itm (void) itc_freq = (platform_base_freq*itc_ratio.num)/itc_ratio.den; - local_cpu_data->itm_delta = (itc_freq + HZ/2) / HZ; + local_cpu_data->itm_tick_delta = (itc_freq + HZ/2) / HZ; printk(KERN_DEBUG "CPU %d: base freq=%lu.%03luMHz, ITC ratio=%u/%u, " "ITC freq=%lu.%03luMHz", smp_processor_id(), platform_base_freq / 1000000, (platform_base_freq / 1000) % 1000, @@ -206,9 +235,8 @@ ia64_init_itm (void) local_cpu_data->nsec_per_cyc = ((NSEC_PER_SEC<<IA64_NSEC_PER_CYC_SHIFT) + itc_freq/2)/itc_freq; - itc_interpolator.frequency = local_cpu_data->itc_freq; - itc_interpolator.drift = itc_drift; #ifdef CONFIG_SMP /* On IA64 in an SMP configuration ITCs are never accurately synchronized. * Jitter compensation requires a cmpxchg which may limit * even going backward) if the ITC offsets between the individual CPUs * are too large. */ - if (!nojitter) itc_interpolator.jitter = 1; + if (!nojitter) itc_jitter = 1; #endif - register_time_interpolator(&itc_interpolator); } +#endif /* Setup the CPU local timer tick */ ia64_cpu_local_tick(); + + if (!clocksource_itc_p) { + /* Sort out mult/shift values: */ + clocksource_itc.mult = clocksource_hz2mult(local_cpu_data->itc_freq, + clocksource_itc.shift); + clocksource_register(&clocksource_itc); + clocksource_itc_p = &clocksource_itc; + } } + +static cycle_t itc_get_cycles(void) +{ + if (itc_jitter) { + u64 lcycle; + u64 now; + + do { + lcycle = itc_lastcycle; + now = get_cycles(); + if (lcycle && time_after(lcycle, now)) + return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * forced to retry until the write lock is released. + */ + if (spin_is_locked(&xtime_lock.lock)) { + itc_lastcycle = now; + return now; + } + /* Keep track of the last timer value returned. The use of cmpxchg here + * will cause contention in an SMP environment. + */ + } while (unlikely(cmpxchg(&itc_lastcycle, lcycle, now) != lcycle)); + return now; + } else + return get_cycles(); +} + + static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_DISABLED | IRQF_NODELAY, .name = "timer" }; @@ -252,6 +319,8 @@ time_init (void) * tv_nsec field must be normalized (i.e., 0 <= nsec < NSEC_PER_SEC). 
*/ set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); + register_itc_clocksource(); + register_itc_clockevent(); } /* @@ -304,3 +373,10 @@ ia64_setup_printk_clock(void) if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) ia64_printk_clock = ia64_itc_printk_clock; } + +struct clocksource fsyscall_clock __attribute__((aligned(L1_CACHE_BYTES))); + +void update_vsyscall(struct timespec *wall, struct clocksource *c) +{ + fsyscall_clock = *c; +} Index: linux/arch/ia64/kernel/traps.c =================================================================== --- linux.orig/arch/ia64/kernel/traps.c +++ linux/arch/ia64/kernel/traps.c @@ -24,7 +24,7 @@ #include #include -extern spinlock_t timerlist_lock; +extern raw_spinlock_t timerlist_lock; fpswa_interface_t *fpswa_interface; EXPORT_SYMBOL(fpswa_interface); @@ -85,11 +85,11 @@ void die (const char *str, struct pt_regs *regs, long err) { static struct { - spinlock_t lock; + raw_spinlock_t lock; u32 lock_owner; int lock_owner_depth; } die = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .lock_owner = -1, .lock_owner_depth = 0 }; @@ -226,7 +226,7 @@ __kprobes ia64_bad_break (unsigned long * access to fph by the time we get here, as the IVT's "Disabled FP-Register" handler takes * care of clearing psr.dfh. */ -static inline void +void disabled_fph_fault (struct pt_regs *regs) { struct ia64_psr *psr = ia64_psr(regs); @@ -245,7 +245,7 @@ disabled_fph_fault (struct pt_regs *regs = (struct task_struct *)ia64_get_kr(IA64_KR_FPU_OWNER); if (ia64_is_local_fpu_owner(current)) { - preempt_enable_no_resched(); + __preempt_enable_no_resched(); return; } @@ -265,7 +265,7 @@ disabled_fph_fault (struct pt_regs *regs */ psr->mfh = 1; } - preempt_enable_no_resched(); + __preempt_enable_no_resched(); } static inline int Index: linux/arch/ia64/kernel/unwind.c =================================================================== --- linux.orig/arch/ia64/kernel/unwind.c +++ linux/arch/ia64/kernel/unwind.c @@ -81,7 +81,7 @@ typedef unsigned long unw_word; typedef unsigned char unw_hash_index_t; static struct { - spinlock_t lock; /* spinlock for unwind data */ + raw_spinlock_t lock; /* spinlock for unwind data */ /* list of unwind tables (one per load-module) */ struct unw_table *tables; @@ -145,7 +145,7 @@ static struct { # endif } unw = { .tables = &unw.kernel_table, - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .save_order = { UNW_REG_RP, UNW_REG_PFS, UNW_REG_PSP, UNW_REG_PR, UNW_REG_UNAT, UNW_REG_LC, UNW_REG_FPSR, UNW_REG_PRI_UNAT_GR Index: linux/arch/ia64/kernel/unwind_i.h =================================================================== --- linux.orig/arch/ia64/kernel/unwind_i.h +++ linux/arch/ia64/kernel/unwind_i.h @@ -154,7 +154,7 @@ struct unw_script { unsigned long ip; /* ip this script is for */ unsigned long pr_mask; /* mask of predicates script depends on */ unsigned long pr_val; /* predicate values this script is for */ - rwlock_t lock; + raw_rwlock_t lock; unsigned int flags; /* see UNW_FLAG_* in unwind.h */ unsigned short lru_chain; /* used for least-recently-used chain */ unsigned short coll_chain; /* used for hash collisions */ Index: linux/arch/ia64/mm/init.c =================================================================== --- linux.orig/arch/ia64/mm/init.c +++ linux/arch/ia64/mm/init.c @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist); 
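itc_get_cycles() shown earlier never lets the returned cycle count move backward: the newest reading is published with cmpxchg, and a CPU whose counter lags is handed the last published value instead. A self-contained C11 sketch of that clamp; read_counter() is a stand-in for the hardware read, and the wraparound handling the kernel gets from time_after() is skipped here:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static _Atomic uint64_t lastcycle;      /* analog of itc_lastcycle */

    static uint64_t read_counter(void)
    {
            static _Atomic uint64_t fake;

            return atomic_fetch_add(&fake, 3);      /* pretend hardware counter */
    }

    static uint64_t get_cycles_monotonic(void)
    {
            uint64_t last, now;

            do {
                    last = atomic_load(&lastcycle);
                    now = read_counter();
                    if (last && last > now)
                            return last;    /* we lag: reuse newest value */
            } while (!atomic_compare_exchange_weak(&lastcycle, &last, now));

            return now;
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++)
                    printf("%llu\n", (unsigned long long)get_cycles_monotonic());
            return 0;
    }

The cmpxchg loop is also why the patch short-circuits when the xtime write lock is held: readers will retry anyway, so the write side can skip the atomic's cost.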
DEFINE_PER_CPU(long, __pgtable_quicklist_size); @@ -92,15 +92,11 @@ check_pgt_cache(void) if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES)) return; - preempt_disable(); while (unlikely((pages_to_free = min_pages_to_free()) > 0)) { while (pages_to_free--) { free_page((unsigned long)pgtable_quicklist_alloc()); } - preempt_enable(); - preempt_disable(); } - preempt_enable(); } void Index: linux/arch/ia64/mm/tlb.c =================================================================== --- linux.orig/arch/ia64/mm/tlb.c +++ linux/arch/ia64/mm/tlb.c @@ -32,7 +32,7 @@ static struct { } purge; struct ia64_ctx ia64_ctx = { - .lock = SPIN_LOCK_UNLOCKED, + .lock = RAW_SPIN_LOCK_UNLOCKED, .next = 1, .max_ctx = ~0U }; Index: linux/arch/ia64/pci/pci.c =================================================================== --- linux.orig/arch/ia64/pci/pci.c +++ linux/arch/ia64/pci/pci.c @@ -809,12 +809,3 @@ pcibios_prep_mwi (struct pci_dev *dev) } return rc; } - -int pci_vector_resources(int last, int nr_released) -{ - int count = nr_released; - - count += (IA64_LAST_DEVICE_VECTOR - last); - - return count; -} Index: linux/arch/ia64/sn/kernel/sn2/timer.c =================================================================== --- linux.orig/arch/ia64/sn/kernel/sn2/timer.c +++ linux/arch/ia64/sn/kernel/sn2/timer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -22,11 +23,21 @@ extern unsigned long sn_rtc_cycles_per_second; -static struct time_interpolator sn2_interpolator = { - .drift = -1, - .shift = 10, - .mask = (1LL << 55) - 1, - .source = TIME_SOURCE_MMIO64 +static void __iomem *sn2_mc_ptr; + +static cycle_t read_sn2(void) +{ + return (cycle_t)readq(sn2_mc_ptr); +} + +static struct clocksource clocksource_sn2 = { + .name = "sn2_rtc", + .rating = 300, + .read = read_sn2, + .mask = (1LL << 55) - 1, + .mult = 0, + .shift = 10, + .is_continuous = 1, }; /* @@ -47,9 +58,10 @@ ia64_sn_udelay (unsigned long usecs) void __init sn_timer_init(void) { - sn2_interpolator.frequency = sn_rtc_cycles_per_second; - sn2_interpolator.addr = RTC_COUNTER_ADDR; - register_time_interpolator(&sn2_interpolator); + clocksource_sn2.fsys_mmio_ptr = sn2_mc_ptr = RTC_COUNTER_ADDR; + clocksource_sn2.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, + clocksource_sn2.shift); + clocksource_register(&clocksource_sn2); ia64_udelay = &ia64_sn_udelay; } Index: linux/arch/mips/Kconfig =================================================================== --- linux.orig/arch/mips/Kconfig +++ linux/arch/mips/Kconfig @@ -417,6 +417,7 @@ config MOMENCO_JAGUAR_ATX config MOMENCO_OCELOT bool "Momentum Ocelot board" select DMA_NONCOHERENT + select NO_SPINLOCK select HW_HAS_PCI select IRQ_CPU select IRQ_CPU_RM7K @@ -837,6 +838,7 @@ source "arch/mips/cobalt/Kconfig" endmenu + config RWSEM_GENERIC_SPINLOCK bool default y @@ -844,6 +846,10 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM bool +config ASM_SEMAPHORES + bool + default y + config GENERIC_FIND_NEXT_BIT bool default y @@ -889,6 +895,9 @@ config DMA_NEED_PCI_MAP_STATE config OWN_DMA bool +config NO_SPINLOCK + bool + config EARLY_PRINTK bool @@ -1843,12 +1852,17 @@ config MIPS_INSANE_LARGE This will result in additional memory usage, so it is not recommended for normal users. 
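Each clocksource registered by this patch (clocksource_itc and clocksource_sn2 above, the MIPS and powerpc sources below) derives mult from its shift the same way, because the core converts cycles to nanoseconds as (cycles * mult) >> shift. A worked sketch of what clocksource_hz2mult() boils down to; hz2mult() and the 100 MHz example frequency are illustrative:

        /*
         * ns = (cycles * mult) >> shift  =>  mult = (NSEC_PER_SEC << shift) / hz
         * e.g. a 100 MHz counter with shift = 10:
         * mult = (10^9 << 10) / 10^8 = 10240.
         */
        static u32 hz2mult(u32 hz, u32 shift)
        {
                u64 tmp = (u64)NSEC_PER_SEC << shift;

                tmp += hz / 2;          /* round to nearest */
                do_div(tmp, hz);        /* 64/32 division, result left in tmp */
                return (u32)tmp;
        }

The init_tsc_clocksource() hunk further down open-codes essentially this computation (without the rounding) for the MIPS counter.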
-endmenu - -config RWSEM_GENERIC_SPINLOCK +config GENERIC_TIME bool default y +source "kernel/time/Kconfig" + +config CPU_SPEED + int "CPU speed in MHz used for clocksource/clockevent calculations" + default 600 +endmenu + source "init/Kconfig" menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" Index: linux/arch/mips/kernel/Makefile =================================================================== --- linux.orig/arch/mips/kernel/Makefile +++ linux/arch/mips/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y += cpu-probe.o branch.o entry.o genex.o irq.o process.o \ - ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \ + ptrace.o reset.o setup.o signal.o syscall.o \ time.o traps.o unaligned.o binfmt_irix-objs := irixelf.o irixinv.o irixioctl.o irixsig.o \ @@ -15,6 +15,8 @@ obj-$(CONFIG_MODULES) += mips_ksyms.o m obj-$(CONFIG_APM) += apm.o +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o + obj-$(CONFIG_CPU_R3000) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX39XX) += r2300_fpu.o r2300_switch.o obj-$(CONFIG_CPU_TX49XX) += r4k_fpu.o r4k_switch.o Index: linux/arch/mips/kernel/asm-offsets.c =================================================================== --- linux.orig/arch/mips/kernel/asm-offsets.c +++ linux/arch/mips/kernel/asm-offsets.c @@ -10,9 +10,11 @@ */ #include #include +#include #include #include #include +#include #include #include Index: linux/arch/mips/kernel/entry.S =================================================================== --- linux.orig/arch/mips/kernel/entry.S +++ linux/arch/mips/kernel/entry.S @@ -25,7 +25,7 @@ .endm #else .macro preempt_stop - local_irq_disable + raw_local_irq_disable .endm #define resume_kernel restore_all #endif @@ -40,7 +40,7 @@ FEXPORT(ret_from_irq) beqz t0, resume_kernel resume_userspace: - local_irq_disable # make sure we dont miss an + raw_local_irq_disable # make sure we dont miss an # interrupt setting need_resched # between sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -50,7 +50,9 @@ resume_userspace: #ifdef CONFIG_PREEMPT resume_kernel: - local_irq_disable + raw_local_irq_disable + lw t0, kernel_preemption + beqz t0, restore_all lw t0, TI_PRE_COUNT($28) bnez t0, restore_all need_resched: @@ -60,7 +62,9 @@ need_resched: LONG_L t0, PT_STATUS(sp) # Interrupts off? andi t0, 1 beqz t0, restore_all + raw_local_irq_disable jal preempt_schedule_irq + sw zero, TI_PRE_COUNT($28) b need_resched #endif @@ -68,7 +72,7 @@ FEXPORT(ret_from_fork) jal schedule_tail # a0 = struct task_struct *prev FEXPORT(syscall_exit) - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work @@ -133,19 +137,21 @@ FEXPORT(restore_partial) # restore part .set at work_pending: - andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS + # a2 is preloaded with TI_FLAGS + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beqz t0, work_notifysig work_resched: + raw_local_irq_enable t0 jal schedule - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) andi t0, a2, _TIF_WORK_MASK # is there any work to be done # other than syscall tracing?
beqz t0, restore_all - andi t0, a2, _TIF_NEED_RESCHED + andi t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bnez t0, work_resched work_notifysig: # deal with pending signals and @@ -161,7 +167,7 @@ syscall_exit_work: li t0, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT and t0, a2 # a2 is preloaded with TI_FLAGS beqz t0, work_pending # trace bit set? - local_irq_enable # could let do_syscall_trace() + raw_local_irq_enable # could let do_syscall_trace() # call schedule() instead move a0, sp li a1, 1 Index: linux/arch/mips/kernel/i8259.c =================================================================== --- linux.orig/arch/mips/kernel/i8259.c +++ linux/arch/mips/kernel/i8259.c @@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq) * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); +DEFINE_RAW_SPINLOCK(i8259A_lock); static void end_8259A_irq (unsigned int irq) { Index: linux/arch/mips/kernel/irq.c =================================================================== --- linux.orig/arch/mips/kernel/irq.c +++ linux/arch/mips/kernel/irq.c @@ -137,7 +137,10 @@ void __init init_IRQ(void) irq_desc[i].action = NULL; irq_desc[i].depth = 1; irq_desc[i].chip = &no_irq_chip; - spin_lock_init(&irq_desc[i].lock); + _raw_spin_lock_init(&irq_desc[i].lock); +#ifdef CONFIG_PREEMPT_HARDIRQS + irq_desc[i].thread = NULL; +#endif #ifdef CONFIG_MIPS_MT_SMTC irq_hwmask[i] = 0; #endif /* CONFIG_MIPS_MT_SMTC */ Index: linux/arch/mips/kernel/module.c =================================================================== --- linux.orig/arch/mips/kernel/module.c +++ linux/arch/mips/kernel/module.c @@ -39,7 +39,7 @@ struct mips_hi16 { static struct mips_hi16 *mips_hi16_list; static LIST_HEAD(dbe_list); -static DEFINE_SPINLOCK(dbe_lock); +static DEFINE_RAW_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { Index: linux/arch/mips/kernel/process.c =================================================================== --- linux.orig/arch/mips/kernel/process.c +++ linux/arch/mips/kernel/process.c @@ -54,16 +54,18 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { #ifdef CONFIG_MIPS_MT_SMTC smtc_idle_loop_hook(); #endif /* CONFIG_MIPS_MT_SMTC */ if (cpu_wait) (*cpu_wait)(); } - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } Index: linux/arch/mips/kernel/scall32-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall32-o32.S +++ linux/arch/mips/kernel/scall32-o32.S @@ -84,7 +84,7 @@ stack_done: 1: sw v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return lw a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-64.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-64.S +++ linux/arch/mips/kernel/scall64-64.S @@ -72,7 +72,7 @@ NESTED(handle_sys64, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result n64_syscall_exit: - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-n32.S 
=================================================================== --- linux.orig/arch/mips/kernel/scall64-n32.S +++ linux/arch/mips/kernel/scall64-n32.S @@ -69,7 +69,7 @@ NESTED(handle_sysn32, PT_SIZE, sp) sd v0, PT_R0(sp) # set flag for syscall restarting 1: sd v0, PT_R2(sp) # result - local_irq_disable # make sure need_resched and + raw_local_irq_disable # make sure need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) # current->work Index: linux/arch/mips/kernel/scall64-o32.S =================================================================== --- linux.orig/arch/mips/kernel/scall64-o32.S +++ linux/arch/mips/kernel/scall64-o32.S @@ -98,7 +98,7 @@ NESTED(handle_sys, PT_SIZE, sp) 1: sd v0, PT_R2(sp) # result o32_syscall_exit: - local_irq_disable # make need_resched and + raw_local_irq_disable # make need_resched and # signals dont change between # sampling and return LONG_L a2, TI_FLAGS($28) Index: linux/arch/mips/kernel/semaphore.c =================================================================== --- linux.orig/arch/mips/kernel/semaphore.c +++ linux/arch/mips/kernel/semaphore.c @@ -36,7 +36,7 @@ * sem->count and sem->waking atomic. Scalability isn't an issue because * this lock is used on UP only so it's just an empty variable. */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -67,7 +67,7 @@ static inline int __sem_update_count(str : "=&r" (old_count), "=&r" (tmp), "=m" (sem->count) : "r" (incr), "m" (sem->count)); } else { - static DEFINE_SPINLOCK(semaphore_lock); + static DEFINE_RAW_SPINLOCK(semaphore_lock); unsigned long flags; spin_lock_irqsave(&semaphore_lock, flags); @@ -80,7 +80,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -94,7 +94,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -104,7 +104,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -133,9 +133,9 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -165,4 +165,10 @@ int __sched __down_interruptible(struct return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int fastcall compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/mips/kernel/signal.c =================================================================== --- linux.orig/arch/mips/kernel/signal.c +++ linux/arch/mips/kernel/signal.c @@ -416,6 +416,10 @@ void do_signal(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux/arch/mips/kernel/signal32.c =================================================================== --- linux.orig/arch/mips/kernel/signal32.c +++ linux/arch/mips/kernel/signal32.c @@ -807,6 +807,10 @@ void do_signal32(struct pt_regs *regs) siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which is why we may in certain * cases get here from kernel mode. Just return without doing anything Index: linux/arch/mips/kernel/smp.c =================================================================== --- linux.orig/arch/mips/kernel/smp.c +++ linux/arch/mips/kernel/smp.c @@ -115,7 +115,22 @@ asmlinkage void start_secondary(void) cpu_idle(); } -DEFINE_SPINLOCK(smp_call_lock); +DEFINE_RAW_SPINLOCK(smp_call_lock); + +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them. 
+ */ +void smp_send_reschedule_allbutself(void) +{ + int cpu = smp_processor_id(); + int i; + + for (i = 0; i < NR_CPUS; i++) + if (cpu_online(i) && i != cpu) + core_send_ipi(i, SMP_RESCHEDULE_YOURSELF); +} struct call_data_struct *call_data; @@ -303,6 +318,8 @@ int setup_profiling_timer(unsigned int m return 0; } +static DEFINE_RAW_SPINLOCK(tlbstate_lock); + static void flush_tlb_all_ipi(void *info) { local_flush_tlb_all(); @@ -360,6 +377,7 @@ static inline void smp_on_each_tlb(void void flush_tlb_mm(struct mm_struct *mm) { preempt_disable(); + spin_lock(&tlbstate_lock); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { smp_on_other_tlbs(flush_tlb_mm_ipi, (void *)mm); @@ -369,6 +387,7 @@ void flush_tlb_mm(struct mm_struct *mm) if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_mm(mm); preempt_enable(); @@ -392,6 +411,8 @@ void flush_tlb_range(struct vm_area_stru struct mm_struct *mm = vma->vm_mm; preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { struct flush_tlb_data fd; @@ -405,6 +426,7 @@ void flush_tlb_range(struct vm_area_stru if (smp_processor_id() != i) cpu_context(i, mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_range(vma, start, end); preempt_enable(); } @@ -435,6 +457,8 @@ static void flush_tlb_page_ipi(void *inf void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) { preempt_disable(); + spin_lock(&tlbstate_lock); + if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) { struct flush_tlb_data fd; @@ -447,6 +471,7 @@ void flush_tlb_page(struct vm_area_struc if (smp_processor_id() != i) cpu_context(i, vma->vm_mm) = 0; } + spin_unlock(&tlbstate_lock); local_flush_tlb_page(vma, page); preempt_enable(); } Index: linux/arch/mips/kernel/time.c =================================================================== --- linux.orig/arch/mips/kernel/time.c +++ linux/arch/mips/kernel/time.c @@ -10,7 +10,13 @@ * under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the License, or (at your * option) any later version. + * + * This implementation of High Res Timers uses two timers. One is the system + * timer. The second is used for the high res timers. The high res timers + * require the CPU to have count/compare registers. The mips_set_next_event() + * function schedules the next high res timer interrupt. 
*/ +#include #include #include #include @@ -23,6 +29,7 @@ #include #include #include +#include #include #include @@ -49,7 +56,27 @@ */ extern volatile unsigned long wall_jiffies; -DEFINE_SPINLOCK(rtc_lock); +/* any missed timer interrupts */ +int missed_timer_count; + +DEFINE_RAW_SPINLOCK(rtc_lock); + +#ifdef CONFIG_HIGH_RES_TIMERS +static void mips_set_next_event(unsigned long evt); +static void mips_set_mode(int mode, void *priv); + +static struct clock_event lapic_clockevent = { + .name = "mips clockevent interface", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | + CLOCK_HAS_IRQHANDLER +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_next_event = mips_set_next_event, +}; +#endif /* * By default we provide the null RTC ops @@ -68,25 +95,36 @@ unsigned long (*rtc_mips_get_time)(void) int (*rtc_mips_set_time)(unsigned long) = null_rtc_set_time; int (*rtc_mips_set_mmss)(unsigned long); - /* usecs per counter cycle, shifted to left by 32 bits */ static unsigned int sll32_usecs_per_cycle; /* how many counter cycles in a jiffy */ static unsigned long cycles_per_jiffy __read_mostly; +static unsigned long hrt_cycles_per_jiffy __read_mostly; + + /* Cycle counter value at the previous timer interrupt.. */ static unsigned int timerhi, timerlo; /* expirelo is the count value for next CPU timer interrupt */ static unsigned int expirelo; - /* * Null timer ack for systems not needing one (e.g. i8254). */ static void null_timer_ack(void) { /* nothing */ } +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * Set the next event + */ +static void mips_set_next_event(unsigned long evt) +{ + write_c0_compare(read_c0_count() + evt); +} +#endif + /* * Null high precision timer functions for systems lacking one. */ @@ -100,7 +138,6 @@ static void null_hpt_init(unsigned int c /* nothing */ } - /* * Timer ack for an R4k-compatible timer of a known frequency. */ @@ -110,14 +147,15 @@ static void c0_timer_ack(void) #ifndef CONFIG_SOC_PNX8550 /* pnx8550 resets to zero */ /* Ack this timer interrupt and set the next one. */ - expirelo += cycles_per_jiffy; + expirelo += hrt_cycles_per_jiffy; #endif write_c0_compare(expirelo); /* Check to see if we have missed any timer interrupts. */ - while (((count = read_c0_count()) - expirelo) < 0x7fffffff) { - /* missed_timer_count++; */ - expirelo = count + cycles_per_jiffy; + count = read_c0_count(); + if ((count - expirelo) < 0x7fffffff) { + /* missed_timer_count++; */ + expirelo = count + hrt_cycles_per_jiffy; write_c0_compare(expirelo); } } @@ -146,92 +184,39 @@ static void c0_hpt_timer_init(unsigned i write_c0_count(count); } -int (*mips_timer_state)(void); -void (*mips_timer_ack)(void); -unsigned int (*mips_hpt_read)(void); -void (*mips_hpt_init)(unsigned int); - - -/* - * This version of gettimeofday has microsecond resolution and better than - * microsecond precision on fast machines with cycle counter. - */ -void do_gettimeofday(struct timeval *tv) +static cycle_t read_mips_hpt(void) { - unsigned long seq; - unsigned long lost; - unsigned long usec, sec; - unsigned long max_ntp_tick; - - do { - seq = read_seqbegin(&xtime_lock); - - usec = do_gettimeoffset(); - - lost = jiffies - wall_jiffies; - - /* - * If time_adjust is negative then NTP is slowing the clock - * so make sure not to go into next possible interval. - * Better to lose some accuracy than have time go backwards.. 
- */ - if (unlikely(time_adjust < 0)) { - max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj; - usec = min(usec, max_ntp_tick); - - if (lost) - usec += lost * max_ntp_tick; - } else if (unlikely(lost)) - usec += lost * (USEC_PER_SEC / HZ); - - sec = xtime.tv_sec; - usec += (xtime.tv_nsec / 1000); - - } while (read_seqretry(&xtime_lock, seq)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; + cycle_t ret; + ret = (cycle_t)mips_hpt_read(); + return ret; } -EXPORT_SYMBOL(do_gettimeofday); +static struct clocksource clocksource_tsc = { + .name = "MIPS", + .rating = 250, + .read = read_mips_hpt, + .mask = 0xffffffff, + .mult = 0, + .shift = 24, + .is_continuous = 1, +}; -int do_settimeofday(struct timespec *tv) +static int __init init_tsc_clocksource(void) { - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); + u64 temp; - /* - * This is revolting. We need to set "xtime" correctly. However, - * the value in this location is the value at the most recent update - * of wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= do_gettimeoffset() * NSEC_PER_USEC; - nsec -= (jiffies - wall_jiffies) * tick_nsec; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + temp = (u64) 1000000000 << clocksource_tsc.shift; + do_div(temp, mips_hpt_frequency); + clocksource_tsc.mult = (unsigned)temp; - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; + return clocksource_register(&clocksource_tsc); } +module_init(init_tsc_clocksource); -EXPORT_SYMBOL(do_settimeofday); +int (*mips_timer_state)(void); +void (*mips_timer_ack)(void); +unsigned int (*mips_hpt_read)(void); +void (*mips_hpt_init)(unsigned int); /* * Gettimeoffset routines. These routines returns the time duration @@ -250,11 +235,9 @@ static unsigned long null_gettimeoffset( return 0; } - /* The function pointer to one of the gettimeoffset funcs. */ unsigned long (*do_gettimeoffset)(void) = null_gettimeoffset; - static unsigned long fixed_rate_gettimeoffset(void) { u32 count; @@ -396,6 +379,29 @@ static unsigned long calibrate_div64_get /* last time when xtime and rtc are sync'ed up */ static long last_rtc_update; +unsigned long read_persistent_clock(void) +{ + unsigned long sec; + sec = rtc_mips_get_time(); + return sec; +} + +void sync_persistent_clock(struct timespec ts) +{ + if (ntp_synced() && + xtime.tv_sec > last_rtc_update + 660 && + (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && + (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { + if (rtc_mips_set_mmss(xtime.tv_sec) == 0) { + last_rtc_update = xtime.tv_sec; + } + else { + /* do it again in 60 s */ + last_rtc_update = xtime.tv_sec - 600; + } + } +} + /* * local_timer_interrupt() does profiling and process accounting * on a per-CPU basis. @@ -410,6 +416,7 @@ void local_timer_interrupt(int irq, void { if (current->pid) profile_tick(CPU_PROFILING, regs); + update_process_times(user_mode(regs)); } @@ -438,7 +445,7 @@ irqreturn_t timer_interrupt(int irq, voi /* * If we have an externally synchronized Linux clock, then update - * CMOS clock accordingly every ~11 minutes. 
rtc_mips_set_time() has to be + * CMOS clock accordingly every ~11 minutes. rtc_set_time() has to be * called as close as possible to 500 ms before the new second starts. */ if (ntp_synced() && @@ -518,6 +525,15 @@ int (*perf_irq)(struct pt_regs *regs) = EXPORT_SYMBOL(null_perf_irq); EXPORT_SYMBOL(perf_irq); +#ifdef CONFIG_HIGH_RES_TIMERS +void event_timer_handler(struct pt_regs *regs) +{ + c0_timer_ack(); + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs, NULL); +} +#endif + asmlinkage void ll_timer_interrupt(int irq, struct pt_regs *regs) { int r2 = cpu_has_mips_r2; @@ -531,6 +547,15 @@ asmlinkage void ll_timer_interrupt(int i * performance counter interrupt was pending, so we have to run the * performance counter interrupt handler anyway. */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * Run the event handler + */ + if (!r2 || (read_c0_cause() & (1 << 26))) + if (lapic_clockevent.event_handler) + lapic_clockevent.event_handler(regs, NULL); +#endif + if (!r2 || (read_c0_cause() & (1 << 26))) if (perf_irq(regs)) goto out; @@ -563,7 +588,7 @@ asmlinkage void ll_local_timer_interrupt * b) (optional) calibrate and set the mips_hpt_frequency * (only needed if you intended to use fixed_rate_gettimeoffset * or use cpu counter as timer interrupt source) - * 2) setup xtime based on rtc_mips_get_time(). + * 2) setup xtime based on rtc_get_time(). * 3) choose an appropriate gettimeoffset routine. * 4) calculate a couple of cached variables for later usage * 5) plat_timer_setup() - @@ -578,7 +603,7 @@ unsigned int mips_hpt_frequency; static struct irqaction timer_irqaction = { .handler = timer_interrupt, - .flags = IRQF_DISABLED, + .flags = IRQF_NODELAY | IRQF_DISABLED, .name = "timer", }; @@ -627,6 +652,9 @@ static unsigned int __init calibrate_hpt void __init time_init(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + u64 temp; +#endif if (board_time_init) board_time_init(); @@ -688,6 +716,12 @@ void __init time_init(void) /* Calculate cache parameters.
*/ cycles_per_jiffy = (mips_hpt_frequency + HZ / 2) / HZ; +#ifdef CONFIG_HIGH_RES_TIMERS + hrt_cycles_per_jiffy = ((CONFIG_CPU_SPEED * 1000000) + HZ / 2) / HZ; +#else + hrt_cycles_per_jiffy = cycles_per_jiffy; +#endif + /* sll32_usecs_per_cycle = 10^6 * 2^32 / mips_counter_freq */ do_div64_32(sll32_usecs_per_cycle, 1000000, mips_hpt_frequency / 2, @@ -776,3 +810,128 @@ unsigned long long sched_clock(void) { return (unsigned long long)jiffies*(1000000000/HZ); } + + +#ifdef CONFIG_SMP +/* + * We have to synchronize the master CPU with all the slave CPUs + */ +static atomic_t cpus_started; +static atomic_t cpus_ready; +static atomic_t cpus_count; +/* + * Master processor inits + */ +static void sync_cpus_init(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + atomic_set(&cpus_ready, v); + mb(); +} + +/* + * Called by the master processor + */ +static void sync_cpus_master(int v) +{ + atomic_set(&cpus_count, 0); + mb(); + atomic_set(&cpus_started, v); + mb(); + /* Wait here till all other CPUs are now ready */ + while (atomic_read(&cpus_count) != (num_online_cpus() - 1)) + mb(); + atomic_set(&cpus_ready, v); + mb(); +} +/* + * Called by the slave processors + */ +static void sync_cpus_slave(int v) +{ + /* Check if the master has been through this */ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); + while (atomic_read(&cpus_ready) != v) + mb(); +} +/* + * Called by the slave CPUs when done syncing the count register + * with the master processor + */ +static void sync_cpus_slave_exit(int v) +{ + while (atomic_read(&cpus_started) != v) + mb(); + atomic_inc(&cpus_count); + mb(); +} + +#define LOOPS 100 +static u32 c0_count[NR_CPUS]; /* Count register per CPU */ +static u32 c[NR_CPUS][LOOPS + 1]; /* Count register per CPU per loop for syncing */ + +/* + * Slave processors execute this via IPI + */ +static void sync_c0_count_slave(void *info) +{ + int cpus = 1, loop, prev_count = 0, cpu = smp_processor_id(); + unsigned long flags; + u32 diff_count; /* CPU count registers are 32-bit */ + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Sync with the Master processor */ + sync_cpus_slave(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + sync_cpus_slave(cpus++); + diff_count = c0_count[0] - c0_count[cpu]; + diff_count += prev_count; + diff_count += read_c0_count(); + write_c0_count(diff_count); + prev_count = (prev_count >> 1) + + ((int)(c0_count[0] - c0_count[cpu]) >> 1); + } + + /* Slave processor is done syncing count register with Master */ + sync_cpus_slave_exit(cpus++); + printk("SMP: Slave processor %d done syncing count\n", cpu); + local_irq_restore(flags); +} + +/* + * Master kicks off the syncing process + */ +void sync_c0_count_master(void) +{ + int cpus = 0, loop, cpu = smp_processor_id(); + unsigned long flags; + + printk("SMP: Starting to sync the c0 count register ... \n"); + sync_cpus_init(cpus++); + + /* Kick off the slave processors to also start the syncing process */ + smp_call_function(sync_c0_count_slave, NULL, 0, 0); + local_irq_save(flags); + + for (loop = 0; loop <= LOOPS; loop++) { + /* Wait for all the CPUs here */ + sync_cpus_master(cpus++); + c[cpu][loop] = c0_count[cpu] = read_c0_count(); + mb(); + /* Do syncing once more */ + sync_cpus_master(cpus++); + } + sync_cpus_master(cpus++); + local_irq_restore(flags); + + printk("SMP: Syncing process completed across CPUs ... 
\n"); +} +#endif /* CONFIG_SMP */ Index: linux/arch/mips/kernel/traps.c =================================================================== --- linux.orig/arch/mips/kernel/traps.c +++ linux/arch/mips/kernel/traps.c @@ -274,7 +274,7 @@ void show_registers(struct pt_regs *regs printk("\n"); } -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); NORET_TYPE void ATTRIB_NORET die(const char * str, struct pt_regs * regs) { Index: linux/arch/mips/mm/init.c =================================================================== --- linux.orig/arch/mips/mm/init.c +++ linux/arch/mips/mm/init.c @@ -36,7 +36,7 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; Index: linux/arch/mips/sibyte/cfe/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/cfe/smp.c +++ linux/arch/mips/sibyte/cfe/smp.c @@ -107,4 +107,8 @@ void prom_smp_finish(void) */ void prom_cpus_done(void) { +#ifdef CONFIG_HIGH_RES_TIMERS + extern void sync_c0_count_master(void); + sync_c0_count_master(); +#endif } Index: linux/arch/mips/sibyte/sb1250/irq.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/irq.c +++ linux/arch/mips/sibyte/sb1250/irq.c @@ -85,7 +85,7 @@ static struct irq_chip sb1250_irq_type = /* Store the CPU id (not the logical number) */ int sb1250_irq_owner[SB1250_NR_IRQS]; -DEFINE_SPINLOCK(sb1250_imr_lock); +DEFINE_RAW_SPINLOCK(sb1250_imr_lock); void sb1250_mask_irq(int cpu, int irq) { @@ -262,7 +262,7 @@ static irqreturn_t sb1250_dummy_handler static struct irqaction sb1250_dummy_action = { .handler = sb1250_dummy_handler, - .flags = 0, + .flags = IRQF_NODELAY, .mask = CPU_MASK_NONE, .name = "sb1250-private", .next = NULL, @@ -372,6 +372,10 @@ void __init arch_init_irq(void) #ifdef CONFIG_KGDB imask |= STATUSF_IP6; #endif + +#ifdef CONFIG_HIGH_RES_TIMERS + imask |= STATUSF_IP7; +#endif /* Enable necessary IPs, disable the rest */ change_c0_status(ST0_IM, imask); @@ -465,6 +469,10 @@ asmlinkage void plat_irq_dispatch(struct else #endif +#ifdef CONFIG_HIGH_RES_TIMERS + if (pending & CAUSEF_IP7) + event_timer_handler(regs); +#endif if (pending & CAUSEF_IP4) sb1250_timer_interrupt(regs); Index: linux/arch/mips/sibyte/sb1250/smp.c =================================================================== --- linux.orig/arch/mips/sibyte/sb1250/smp.c +++ linux/arch/mips/sibyte/sb1250/smp.c @@ -59,7 +59,7 @@ void sb1250_smp_finish(void) { extern void sb1250_time_init(void); sb1250_time_init(); - local_irq_enable(); + raw_local_irq_enable(); } /* Index: linux/arch/mips/sibyte/swarm/setup.c =================================================================== --- linux.orig/arch/mips/sibyte/swarm/setup.c +++ linux/arch/mips/sibyte/swarm/setup.c @@ -131,6 +131,12 @@ void __init plat_mem_setup(void) rtc_mips_set_time = m41t81_set_time; } +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * set the mips_hpt_frequency here + */ + mips_hpt_frequency = CONFIG_CPU_SPEED * 1000000; +#endif printk("This kernel optimized for " #ifdef CONFIG_SIMULATION "simulation" Index: linux/arch/powerpc/Kconfig =================================================================== --- linux.orig/arch/powerpc/Kconfig +++ linux/arch/powerpc/Kconfig @@ -26,18 +26,15 @@ config MMU bool default y -config GENERIC_HARDIRQS +config GENERIC_TIME bool default y -config IRQ_PER_CPU +config GENERIC_HARDIRQS bool default y -config 
RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM +config IRQ_PER_CPU bool default y @@ -596,6 +593,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "fs/Kconfig.binfmt" # We optimistically allocate largepages from the VM, so make the limit Index: linux/arch/powerpc/Kconfig.debug =================================================================== --- linux.orig/arch/powerpc/Kconfig.debug +++ linux/arch/powerpc/Kconfig.debug @@ -2,6 +2,10 @@ menu "Kernel hacking" source "lib/Kconfig.debug" +config TRACE_IRQFLAGS_SUPPORT + bool + default y + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL && PPC64 Index: linux/arch/powerpc/boot/Makefile =================================================================== --- linux.orig/arch/powerpc/boot/Makefile +++ linux/arch/powerpc/boot/Makefile @@ -29,6 +29,14 @@ OBJCOPYFLAGS := contents,alloc,load,r OBJCOPY_COFF_ARGS := -O aixcoff-rs6000 --set-start 0x500000 OBJCOPY_MIB_ARGS := -O aixcoff-rs6000 -R .stab -R .stabstr -R .comment +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + zlib := inffast.c inflate.c inftrees.c zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h @@ -44,7 +52,7 @@ obj-boot := $(addsuffix .o, $(basename $ BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj) quiet_cmd_copy_zlib = COPY $@ - cmd_copy_zlib = sed "s@__attribute_used__@@;s@]\+\).*@\"\1\"@" $< > $@ + cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.@@;s@.include.@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@]\+\).*@\"\1\"@" $< > $@ quiet_cmd_copy_zlibheader = COPY $@ cmd_copy_zlibheader = sed "s@]\+\).*@\"\1\"@" $< > $@ Index: linux/arch/powerpc/kernel/Makefile =================================================================== --- linux.orig/arch/powerpc/kernel/Makefile +++ linux/arch/powerpc/kernel/Makefile @@ -10,10 +10,11 @@ CFLAGS_prom_init.o += -fPIC CFLAGS_btext.o += -fPIC endif -obj-y := semaphore.o cputable.o ptrace.o syscalls.o \ +obj-y := cputable.o ptrace.o syscalls.o \ irq.o align.o signal_32.o pmc.o vdso.o \ init_task.o process.o systbl.o idle.o obj-y += vdso32/ +obj-$(CONFIG_ASM_SEMAPHORES) += semaphore.o obj-$(CONFIG_PPC64) += setup_64.o binfmt_elf32.o sys_ppc32.o \ signal_64.o ptrace32.o \ paca.o cpu_setup_power4.o \ Index: linux/arch/powerpc/kernel/entry_32.S =================================================================== --- linux.orig/arch/powerpc/kernel/entry_32.S +++ linux/arch/powerpc/kernel/entry_32.S @@ -638,7 +638,7 @@ user_exc_return: /* r10 contains MSR_KE /* Check current_thread_info()->flags */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED) + andi. r0,r9,(_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne do_work restore_user: @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. 
r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,(31-THREAD_SHIFT) lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. r0,r9,_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK beq restore_user @@ -978,3 +978,85 @@ machine_check_in_rtas: /* XXX load up BATs and panic */ #endif /* CONFIG_PPC_RTAS */ + +#ifdef CONFIG_MCOUNT +/* + * mcount() is not the same as _mcount(). The callers of mcount() have a + * normal context. The callers of _mcount() do not have a stack frame and + * have not saved the "caller saves" registers. + */ +_GLOBAL(mcount) + stwu r1,-16(r1) + mflr r3 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + stw r3,20(r1) + cmpwi r5,0 + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,16(r1) + lwz r4,4(r4) + bl __trace +1: + lwz r0,20(r1) + mtlr r0 + addi r1,r1,16 + blr + +/* + * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the + * C compiler to add a call to _mcount() at the start of each function + * preamble, before the stack frame is created. An example of this preamble + * code is: + * + * mflr r0 + * lis r12,-16354 + * stw r0,4(r1) + * addi r0,r12,-19652 + * bl 0xc00034c8 <_mcount> + * mflr r0 + * stwu r1,-16(r1) + */ +_GLOBAL(_mcount) +#define M_STK_SIZE 48 + /* Would not expect to need to save cr, but glibc version of */ + /* _mcount() does, so cautiously saving it here too. */ + stwu r1,-M_STK_SIZE(r1) + stw r3, 12(r1) + stw r4, 16(r1) + stw r5, 20(r1) + stw r6, 24(r1) + mflr r3 /* will use as first arg to __trace() */ + mfcr r4 + lis r5,mcount_enabled@ha + lwz r5,mcount_enabled@l(r5) + cmpwi r5,0 + stw r3, 44(r1) /* lr */ + stw r4, 8(r1) /* cr */ + stw r7, 28(r1) + stw r8, 32(r1) + stw r9, 36(r1) + stw r10,40(r1) + beq 1f + /* r3 contains lr (eip), put parent lr (parent_eip) in r4 */ + lwz r4,M_STK_SIZE+4(r1) + bl __trace +1: + lwz r8, 8(r1) /* cr */ + lwz r9, 44(r1) /* lr */ + lwz r3, 12(r1) + lwz r4, 16(r1) + lwz r5, 20(r1) + mtcrf 0xff,r8 + mtctr r9 + lwz r0, 52(r1) + lwz r6, 24(r1) + lwz r7, 28(r1) + lwz r8, 32(r1) + lwz r9, 36(r1) + lwz r10,40(r1) + addi r1,r1,M_STK_SIZE + mtlr r0 + bctr + +#endif /* CONFIG_MCOUNT */ Index: linux/arch/powerpc/kernel/idle.c =================================================================== --- linux.orig/arch/powerpc/kernel/idle.c +++ linux/arch/powerpc/kernel/idle.c @@ -82,7 +82,7 @@ void cpu_idle(void) ppc64_runlatch_on(); if (cpu_should_die()) cpu_die(); - preempt_enable_no_resched(); + __preempt_enable_no_resched(); schedule(); preempt_disable(); } Index: linux/arch/powerpc/kernel/irq.c =================================================================== --- linux.orig/arch/powerpc/kernel/irq.c +++ linux/arch/powerpc/kernel/irq.c @@ -91,8 +91,6 @@ extern atomic_t ipi_sent; #endif #ifdef CONFIG_PPC64 -EXPORT_SYMBOL(irq_desc); - int distribute_irqs = 1; #endif /* CONFIG_PPC64 */ Index: linux/arch/powerpc/kernel/ppc_ksyms.c =================================================================== --- linux.orig/arch/powerpc/kernel/ppc_ksyms.c +++ linux/arch/powerpc/kernel/ppc_ksyms.c @@ -16,7 +16,6 @@ #include #include -#include #include #include #include @@ -189,7 +188,6 @@ EXPORT_SYMBOL(screen_info); #ifdef CONFIG_PPC32 EXPORT_SYMBOL(timer_interrupt); -EXPORT_SYMBOL(irq_desc); EXPORT_SYMBOL(tb_ticks_per_jiffy); EXPORT_SYMBOL(console_drivers); 
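The mcount/_mcount stubs added to entry_32.S above are the powerpc hook for the -rt latency tracer: with CONFIG_MCOUNT the top-level Makefile compiles the kernel with -pg, gcc plants a call to _mcount() in every function prologue, and the stub hands the instruction pointer plus its caller's to __trace() whenever mcount_enabled is set. In C terms the stub reduces to the sketch below; the (ip, parent_ip) argument order is inferred from the r3/r4 setup in the assembly:

        /* C-level equivalent of the _mcount stub above; sketch only. */
        extern int mcount_enabled;
        extern void __trace(unsigned long ip, unsigned long parent_ip);

        static inline void mcount_hook(unsigned long ip, unsigned long parent_ip)
        {
                /* Tracing is gated globally; the stub still saves/restores
                 * the caller-saved registers around this call. */
                if (mcount_enabled)
                        __trace(ip, parent_ip);
        }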
EXPORT_SYMBOL(cacheable_memcpy); Index: linux/arch/powerpc/kernel/semaphore.c =================================================================== --- linux.orig/arch/powerpc/kernel/semaphore.c +++ linux/arch/powerpc/kernel/semaphore.c @@ -31,7 +31,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -50,7 +50,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -63,7 +63,7 @@ void __up(struct semaphore *sem) __sem_update_count(sem, 1); wake_up(&sem->wait); } -EXPORT_SYMBOL(__up); +EXPORT_SYMBOL(__compat_up); /* * Note that when we come in to __down or __down_interruptible, @@ -73,7 +73,7 @@ EXPORT_SYMBOL(__up); * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -101,9 +101,9 @@ void __sched __down(struct semaphore *se */ wake_up(&sem->wait); } -EXPORT_SYMBOL(__down); +EXPORT_SYMBOL(__compat_down); -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore *sem) { int retval = 0; struct task_struct *tsk = current; @@ -132,4 +132,10 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } -EXPORT_SYMBOL(__down_interruptible); +EXPORT_SYMBOL(__compat_down_interruptible); + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} +EXPORT_SYMBOL(compat_sem_is_locked); Index: linux/arch/powerpc/kernel/smp.c =================================================================== --- linux.orig/arch/powerpc/kernel/smp.c +++ linux/arch/powerpc/kernel/smp.c @@ -148,6 +148,16 @@ void smp_send_reschedule(int cpu) smp_ops->message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_ops->message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_DEBUGGER void smp_send_debugger_break(int cpu) { @@ -184,7 +194,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version. 
*/ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); +static __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/powerpc/kernel/time.c =================================================================== --- linux.orig/arch/powerpc/kernel/time.c +++ linux/arch/powerpc/kernel/time.c @@ -73,6 +73,9 @@ #endif #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + /* keep track of when we need to update the rtc */ time_t last_rtc_update; #ifdef CONFIG_PPC_ISERIES @@ -115,8 +118,6 @@ EXPORT_SYMBOL_GPL(rtc_lock); u64 tb_to_ns_scale; unsigned tb_to_ns_shift; -struct gettimeofday_struct do_gtod; - extern unsigned long wall_jiffies; extern struct timezone sys_tz; @@ -407,162 +408,8 @@ static __inline__ void timer_check_rtc(v } } -/* - * This version of gettimeofday has microsecond resolution. - */ -static inline void __do_gettimeofday(struct timeval *tv) -{ - unsigned long sec, usec; - u64 tb_ticks, xsec; - struct gettimeofday_vars *temp_varp; - u64 temp_tb_to_xs, temp_stamp_xsec; - - /* - * These calculations are faster (gets rid of divides) - * if done in units of 1/2^20 rather than microseconds. - * The conversion to microseconds at the end is done - * without a divide (and in fact, without a multiply) - */ - temp_varp = do_gtod.varp; - - /* Sampling the time base must be done after loading - * do_gtod.varp in order to avoid racing with update_gtod. - */ - data_barrier(temp_varp); - tb_ticks = get_tb() - temp_varp->tb_orig_stamp; - temp_tb_to_xs = temp_varp->tb_to_xs; - temp_stamp_xsec = temp_varp->stamp_xsec; - xsec = temp_stamp_xsec + mulhdu(tb_ticks, temp_tb_to_xs); - sec = xsec / XSEC_PER_SEC; - usec = (unsigned long)xsec & (XSEC_PER_SEC - 1); - usec = SCALE_XSEC(usec, 1000000); - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -void do_gettimeofday(struct timeval *tv) -{ - if (__USE_RTC()) { - /* do this the old way */ - unsigned long flags, seq; - unsigned int sec, nsec, usec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - sec = xtime.tv_sec; - nsec = xtime.tv_nsec + tb_ticks_since(tb_last_jiffy); - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - usec = nsec / 1000; - while (usec >= 1000000) { - usec -= 1000000; - ++sec; - } - tv->tv_sec = sec; - tv->tv_usec = usec; - return; - } - __do_gettimeofday(tv); -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * There are two copies of tb_to_xs and stamp_xsec so that no - * lock is needed to access and use these values in - * do_gettimeofday. We alternate the copies and as long as a - * reasonable time elapses between changes, there will never - * be inconsistent values. ntpd has a minimum of one minute - * between updates. - */ -static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) -{ - unsigned temp_idx; - struct gettimeofday_vars *temp_varp; - - temp_idx = (do_gtod.var_idx == 0); - temp_varp = &do_gtod.vars[temp_idx]; - - temp_varp->tb_to_xs = new_tb_to_xs; - temp_varp->tb_orig_stamp = new_tb_stamp; - temp_varp->stamp_xsec = new_stamp_xsec; - smp_mb(); - do_gtod.varp = temp_varp; - do_gtod.var_idx = temp_idx; - - /* - * tb_update_count is used to allow the userspace gettimeofday code - * to assure itself that it sees a consistent view of the tb_to_xs and - * stamp_xsec variables. It reads the tb_update_count, then reads - * tb_to_xs and stamp_xsec and then reads tb_update_count again. 
If - * the two values of tb_update_count match and are even then the - * tb_to_xs and stamp_xsec values are consistent. If not, then it - * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. - */ - vdso_data->tb_orig_stamp = new_tb_stamp; - vdso_data->stamp_xsec = new_stamp_xsec; - vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - smp_wmb(); - ++(vdso_data->tb_update_count); -} - -/* - * When the timebase - tb_orig_stamp gets too big, we do a manipulation - * between tb_orig_stamp and stamp_xsec. The goal here is to keep the - * difference tb - tb_orig_stamp small enough to always fit inside a - * 32 bits number. This is a requirement of our fast 32 bits userland - * implementation in the vdso. If we "miss" a call to this function - * (interrupt latency, CPU locked in a spinlock, ...) and we end up - * with a too big difference, then the vdso will fallback to calling - * the syscall - */ -static __inline__ void timer_recalc_offset(u64 cur_tb) -{ - unsigned long offset; - u64 new_stamp_xsec; - u64 tlen, t2x; - u64 tb, xsec_old, xsec_new; - struct gettimeofday_vars *varp; - - if (__USE_RTC()) - return; - tlen = current_tick_length(); - offset = cur_tb - do_gtod.varp->tb_orig_stamp; - if (tlen == last_tick_len && offset < 0x80000000u) - return; - if (tlen != last_tick_len) { - t2x = mulhdu(tlen << TICKLEN_SHIFT, ticklen_to_xs); - last_tick_len = tlen; - } else - t2x = do_gtod.varp->tb_to_xs; - new_stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Make sure time doesn't go backwards for userspace gettimeofday. - */ - tb = get_tb(); - varp = do_gtod.varp; - xsec_old = mulhdu(tb - varp->tb_orig_stamp, varp->tb_to_xs) - + varp->stamp_xsec; - xsec_new = mulhdu(tb - cur_tb, t2x) + new_stamp_xsec; - if (xsec_new < xsec_old) - new_stamp_xsec += xsec_old - xsec_new; - - update_gtod(cur_tb, new_stamp_xsec, t2x); -} - #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); @@ -610,11 +457,7 @@ static void iSeries_tb_recal(void) tb_ticks_per_sec = new_tb_ticks_per_sec; calc_cputime_factors(); div128_by_32( XSEC_PER_SEC, 0, tb_ticks_per_sec, &divres ); - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; tb_to_xs = divres.result_low; - do_gtod.varp->tb_to_xs = tb_to_xs; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->tb_to_xs = tb_to_xs; } else { printk( "Titan recalibrate: FAILED (difference > 4 percent)\n" @@ -781,81 +624,6 @@ unsigned long long sched_clock(void) return mulhdu(get_tb(), tb_to_ns_scale) << tb_to_ns_shift; } -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, new_sec = tv->tv_sec; - long wtm_nsec, new_nsec = tv->tv_nsec; - unsigned long flags; - u64 new_xsec; - unsigned long tb_delta; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irqsave(&xtime_lock, flags); - - /* - * Updating the RTC is not the job of this code. If the time is - * stepped under NTP, the RTC will be updated after STA_UNSYNC - * is cleared. 
Tools like clock/hwclock either copy the RTC - * to the system time, in which case there is no point in writing - * to the RTC again, or write to the RTC but then they don't call - * settimeofday to perform this operation. - */ -#ifdef CONFIG_PPC_ISERIES - if (first_settimeofday) { - iSeries_tb_recal(); - first_settimeofday = 0; - } -#endif - - /* Make userspace gettimeofday spin until we're done. */ - ++vdso_data->tb_update_count; - smp_mb(); - - /* - * Subtract off the number of nanoseconds since the - * beginning of the last tick. - * Note that since we don't increment jiffies_64 anywhere other - * than in do_timer (since we don't have a lost tick problem), - * wall_jiffies will always be the same as jiffies, - * and therefore the (jiffies - wall_jiffies) computation - * has been removed. - */ - tb_delta = tb_ticks_since(tb_last_jiffy); - tb_delta = mulhdu(tb_delta, do_gtod.varp->tb_to_xs); /* in xsec */ - new_nsec -= SCALE_XSEC(tb_delta, 1000000000); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - new_sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - new_nsec); - - set_normalized_timespec(&xtime, new_sec, new_nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - /* In case of a large backwards jump in time with NTP, we want the - * clock to be updated as soon as the PLL is again in lock. - */ - last_rtc_update = new_sec - 658; - - ntp_clear(); - - new_xsec = xtime.tv_nsec; - if (new_xsec != 0) { - new_xsec *= XSEC_PER_SEC; - do_div(new_xsec, NSEC_PER_SEC); - } - new_xsec += (u64)xtime.tv_sec * XSEC_PER_SEC; - update_gtod(tb_last_jiffy, new_xsec, do_gtod.varp->tb_to_xs); - - vdso_data->tz_minuteswest = sys_tz.tz_minuteswest; - vdso_data->tz_dsttime = sys_tz.tz_dsttime; - - write_sequnlock_irqrestore(&xtime_lock, flags); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); static int __init get_freq(char *name, int cells, unsigned long *val) { @@ -1024,20 +792,6 @@ void __init time_init(void) xtime.tv_sec = tm; xtime.tv_nsec = 0; - do_gtod.varp = &do_gtod.vars[0]; - do_gtod.var_idx = 0; - do_gtod.varp->tb_orig_stamp = tb_last_jiffy; - __get_cpu_var(last_jiffy) = tb_last_jiffy; - do_gtod.varp->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - do_gtod.tb_ticks_per_sec = tb_ticks_per_sec; - do_gtod.varp->tb_to_xs = tb_to_xs; - do_gtod.tb_to_us = tb_to_us; - - vdso_data->tb_orig_stamp = tb_last_jiffy; - vdso_data->tb_update_count = 0; - vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; - vdso_data->tb_to_xs = tb_to_xs; time_freq = 0; @@ -1050,7 +804,6 @@ void __init time_init(void) set_dec(tb_ticks_per_jiffy); } - #define FEBRUARY 2 #define STARTOFTIME 1970 #define SECDAY 86400L @@ -1195,3 +948,36 @@ void div128_by_32(u64 dividend_high, u64 dr->result_low = ((u64)y << 32) + z; } + + +/* powerpc clocksource code */ + +#include +static cycle_t timebase_read(void) +{ + return (cycle_t)get_tb(); +} + +struct clocksource clocksource_timebase = { + .name = "timebase", + .rating = 200, + .read = timebase_read, + .mask = (cycle_t)-1, + .mult = 0, + .shift = 22, +}; + + +/* XXX - this should be calculated or properly externed! 
*/ +static int __init init_timebase_clocksource(void) +{ + if (__USE_RTC()) + return -ENODEV; + + clocksource_timebase.mult = clocksource_hz2mult(tb_ticks_per_sec, + clocksource_timebase.shift); + return clocksource_register(&clocksource_timebase); +} + +module_init(init_timebase_clocksource); + Index: linux/arch/powerpc/kernel/traps.c =================================================================== --- linux.orig/arch/powerpc/kernel/traps.c +++ linux/arch/powerpc/kernel/traps.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL(unregister_die_notifier); * Trap & Exception support */ -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); int die(const char *str, struct pt_regs *regs, long err) { @@ -164,6 +164,11 @@ void _exception(int signr, struct pt_reg return; } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif + memset(&info, 0, sizeof(info)); info.si_signo = signr; info.si_code = code; Index: linux/arch/powerpc/lib/locks.c =================================================================== --- linux.orig/arch/powerpc/lib/locks.c +++ linux/arch/powerpc/lib/locks.c @@ -24,7 +24,7 @@ #include #include -void __spin_yield(raw_spinlock_t *lock) +void __spin_yield(__raw_spinlock_t *lock) { unsigned int lock_value, holder_cpu, yield_count; @@ -79,7 +79,7 @@ void __rw_yield(raw_rwlock_t *rw) } #endif -void __raw_spin_unlock_wait(raw_spinlock_t *lock) +void __raw_spin_unlock_wait(__raw_spinlock_t *lock) { while (lock->slock) { HMT_low(); Index: linux/arch/powerpc/mm/fault.c =================================================================== --- linux.orig/arch/powerpc/mm/fault.c +++ linux/arch/powerpc/mm/fault.c @@ -149,8 +149,8 @@ static void do_dabr(struct pt_regs *regs * The return value is 0 if the fault was handled, or the signal * number if this is a kernel fault that can't be handled here. 
*/ -int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) +int __kprobes notrace do_page_fault(struct pt_regs *regs, + unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; struct mm_struct *mm = current->mm; Index: linux/arch/powerpc/mm/init_32.c =================================================================== --- linux.orig/arch/powerpc/mm/init_32.c +++ linux/arch/powerpc/mm/init_32.c @@ -56,7 +56,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/powerpc/mm/tlb_64.c =================================================================== --- linux.orig/arch/powerpc/mm/tlb_64.c +++ linux/arch/powerpc/mm/tlb_64.c @@ -37,7 +37,7 @@ DEFINE_PER_CPU(struct ppc64_tlb_batch, p /* This is declared as we are using the more or less generic * include/asm-powerpc/tlb.h file -- tgall */ -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur); unsigned long pte_freelist_forced_free; Index: linux/arch/powerpc/platforms/cell/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/cell/smp.c +++ linux/arch/powerpc/platforms/cell/smp.c @@ -133,7 +133,7 @@ static void __devinit smp_iic_setup_cpu( iic_setup_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit cell_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/smp.c +++ linux/arch/powerpc/platforms/chrp/smp.c @@ -45,7 +45,7 @@ static void __devinit smp_chrp_setup_cpu mpic_setup_this_cpu(); } -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; void __devinit smp_chrp_give_timebase(void) Index: linux/arch/powerpc/platforms/chrp/time.c =================================================================== --- linux.orig/arch/powerpc/platforms/chrp/time.c +++ linux/arch/powerpc/platforms/chrp/time.c @@ -27,7 +27,7 @@ #include #include -extern spinlock_t rtc_lock; +extern raw_spinlock_t rtc_lock; static int nvram_as1 = NVRAM_AS1; static int nvram_as0 = NVRAM_AS0; Index: linux/arch/powerpc/platforms/iseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/iseries/setup.c +++ linux/arch/powerpc/platforms/iseries/setup.c @@ -594,12 +594,14 @@ static void yield_shared_processor(void) static void iseries_shared_idle(void) { while (1) { - while (!need_resched() && !hvlpevent_is_pending()) { + while (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) { local_irq_disable(); ppc64_runlatch_off(); /* Recheck with irqs off */ - if (!need_resched() && !hvlpevent_is_pending()) + if (!need_resched() && !need_resched_delayed() + && !hvlpevent_is_pending()) yield_shared_processor(); HMT_medium(); Index: linux/arch/powerpc/platforms/powermac/feature.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/feature.c +++ linux/arch/powerpc/platforms/powermac/feature.c @@ -59,7 +59,7 @@ extern struct device_node 
*k2_skiplist[2 * We use a single global lock to protect accesses. Each driver has * to take care of its own locking */ -DEFINE_SPINLOCK(feature_lock); +DEFINE_RAW_SPINLOCK(feature_lock); #define LOCK(flags) spin_lock_irqsave(&feature_lock, flags); #define UNLOCK(flags) spin_unlock_irqrestore(&feature_lock, flags); Index: linux/arch/powerpc/platforms/powermac/nvram.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/nvram.c +++ linux/arch/powerpc/platforms/powermac/nvram.c @@ -80,7 +80,7 @@ static int is_core_99; static int core99_bank = 0; static int nvram_partitions[3]; // XXX Turn that into a sem -static DEFINE_SPINLOCK(nv_lock); +static DEFINE_RAW_SPINLOCK(nv_lock); static int (*core99_write_bank)(int bank, u8* datas); static int (*core99_erase_bank)(int bank); Index: linux/arch/powerpc/platforms/powermac/pic.c =================================================================== --- linux.orig/arch/powerpc/platforms/powermac/pic.c +++ linux/arch/powerpc/platforms/powermac/pic.c @@ -63,7 +63,7 @@ static int max_irqs; static int max_real_irqs; static u32 level_mask[4]; -static DEFINE_SPINLOCK(pmac_pic_lock); +static DEFINE_RAW_SPINLOCK(pmac_pic_lock); #define NR_MASK_WORDS ((NR_IRQS + 31) / 32) static unsigned long ppc_lost_interrupts[NR_MASK_WORDS]; Index: linux/arch/powerpc/platforms/pseries/setup.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/setup.c +++ linux/arch/powerpc/platforms/pseries/setup.c @@ -483,7 +483,8 @@ static void pseries_dedicated_idle_sleep set_thread_flag(TIF_POLLING_NRFLAG); while (get_tb() < start_snooze) { - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; ppc64_runlatch_off(); HMT_low(); @@ -494,7 +495,8 @@ static void pseries_dedicated_idle_sleep clear_thread_flag(TIF_POLLING_NRFLAG); smp_mb(); local_irq_disable(); - if (need_resched() || cpu_is_offline(cpu)) + if (need_resched() || need_resched_delayed() || + cpu_is_offline(cpu)) goto out; } Index: linux/arch/powerpc/platforms/pseries/smp.c =================================================================== --- linux.orig/arch/powerpc/platforms/pseries/smp.c +++ linux/arch/powerpc/platforms/pseries/smp.c @@ -344,7 +344,7 @@ static void __devinit smp_xics_setup_cpu } #endif /* CONFIG_XICS */ -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned long timebase = 0; static void __devinit pSeries_give_timebase(void) Index: linux/arch/ppc/8260_io/enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/enet.c +++ linux/arch/ppc/8260_io/enet.c @@ -116,7 +116,7 @@ struct scc_enet_private { scc_t *sccp; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8260_io/fcc_enet.c =================================================================== --- linux.orig/arch/ppc/8260_io/fcc_enet.c +++ linux/arch/ppc/8260_io/fcc_enet.c @@ -376,7 +376,7 @@ struct fcc_enet_private { volatile fcc_enet_t *ep; struct net_device_stats stats; uint tx_free; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/8xx_io/commproc.c =================================================================== --- linux.orig/arch/ppc/8xx_io/commproc.c +++ linux/arch/ppc/8xx_io/commproc.c @@ -356,7 +356,7 @@ 
cpm_setbrg(uint brg, uint rate) /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* * 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... Index: linux/arch/ppc/8xx_io/enet.c =================================================================== --- linux.orig/arch/ppc/8xx_io/enet.c +++ linux/arch/ppc/8xx_io/enet.c @@ -143,7 +143,7 @@ struct scc_enet_private { unsigned char *rx_vaddr[RX_RING_SIZE]; struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; }; static int scc_enet_open(struct net_device *dev); Index: linux/arch/ppc/8xx_io/fec.c =================================================================== --- linux.orig/arch/ppc/8xx_io/fec.c +++ linux/arch/ppc/8xx_io/fec.c @@ -164,7 +164,7 @@ struct fec_enet_private { struct net_device_stats stats; uint tx_full; - spinlock_t lock; + raw_spinlock_t lock; #ifdef CONFIG_USE_MDIO uint phy_id; Index: linux/arch/ppc/Kconfig =================================================================== --- linux.orig/arch/ppc/Kconfig +++ linux/arch/ppc/Kconfig @@ -12,13 +12,6 @@ config GENERIC_HARDIRQS bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_HWEIGHT bool default y @@ -955,6 +948,18 @@ config HIGHMEM source kernel/Kconfig.hz source kernel/Kconfig.preempt + +config RWSEM_GENERIC_SPINLOCK + bool + default y + +config ASM_SEMAPHORES + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + bool + source "mm/Kconfig" source "fs/Kconfig.binfmt" Index: linux/arch/ppc/boot/Makefile =================================================================== --- linux.orig/arch/ppc/boot/Makefile +++ linux/arch/ppc/boot/Makefile @@ -14,6 +14,15 @@ # CFLAGS += -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include + +ifdef CONFIG_MCOUNT +# do not trace the boot loader +nullstring := +space := $(nullstring) # end of the line +pg_flag = $(nullstring) -pg # end of the line +CFLAGS := $(subst ${pg_flag},${space},${CFLAGS}) +endif + HOSTCFLAGS += -Iarch/$(ARCH)/boot/include BOOT_TARGETS = zImage zImage.initrd znetboot znetboot.initrd Index: linux/arch/ppc/kernel/dma-mapping.c =================================================================== --- linux.orig/arch/ppc/kernel/dma-mapping.c +++ linux/arch/ppc/kernel/dma-mapping.c @@ -70,7 +70,7 @@ int map_page(unsigned long va, phys_addr * This is the page table (2MB) covering uncached, DMA consistent allocations */ static pte_t *consistent_pte; -static DEFINE_SPINLOCK(consistent_lock); +static DEFINE_RAW_SPINLOCK(consistent_lock); /* * VM region handling support. Index: linux/arch/ppc/kernel/entry.S =================================================================== --- linux.orig/arch/ppc/kernel/entry.S +++ linux/arch/ppc/kernel/entry.S @@ -856,7 +856,7 @@ load_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -870,7 +870,7 @@ recheck: MTMSRD(r10) /* disable interrupts */ rlwinm r9,r1,0,0,18 lwz r9,TI_FLAGS(r9) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED) bne- do_resched andi. 
r0,r9,_TIF_SIGPENDING beq restore_user Index: linux/arch/ppc/kernel/semaphore.c =================================================================== --- linux.orig/arch/ppc/kernel/semaphore.c +++ linux/arch/ppc/kernel/semaphore.c @@ -29,7 +29,7 @@ * sem->count = tmp; * return old_count; */ -static inline int __sem_update_count(struct semaphore *sem, int incr) +static inline int __sem_update_count(struct compat_semaphore *sem, int incr) { int old_count, tmp; @@ -48,7 +48,7 @@ static inline int __sem_update_count(str return old_count; } -void __up(struct semaphore *sem) +void __compat_up(struct compat_semaphore *sem) { /* * Note that we incremented count in up() before we came here, @@ -70,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. */ -void __sched __down(struct semaphore *sem) +void __sched __compat_down(struct compat_semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se wake_up(&sem->wait); } -int __sched __down_interruptible(struct semaphore * sem) +int __sched __compat_down_interruptible(struct compat_semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -129,3 +129,8 @@ int __sched __down_interruptible(struct wake_up(&sem->wait); return retval; } + +int compat_sem_is_locked(struct compat_semaphore *sem) +{ + return (int) atomic_read(&sem->count) < 0; +} Index: linux/arch/ppc/kernel/smp.c =================================================================== --- linux.orig/arch/ppc/kernel/smp.c +++ linux/arch/ppc/kernel/smp.c @@ -137,6 +137,16 @@ void smp_send_reschedule(int cpu) smp_message_pass(cpu, PPC_MSG_RESCHEDULE); } +/* + * This function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE); +} + #ifdef CONFIG_XMON void smp_send_xmon_break(int cpu) { @@ -161,7 +171,7 @@ void smp_send_stop(void) * static memory requirements. It also looks cleaner. * Stolen from the i386 version.
*/ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); static struct call_data_struct { void (*func) (void *info); Index: linux/arch/ppc/kernel/time.c =================================================================== --- linux.orig/arch/ppc/kernel/time.c +++ linux/arch/ppc/kernel/time.c @@ -65,6 +65,9 @@ #include +unsigned long cpu_khz; /* Detected as we calibrate the TSC */ +EXPORT_SYMBOL(cpu_khz); + unsigned long disarm_decr[NR_CPUS]; extern struct timezone sys_tz; @@ -103,7 +106,7 @@ static inline int tb_delta(unsigned *jif } #ifdef CONFIG_SMP -unsigned long profile_pc(struct pt_regs *regs) +unsigned long notrace profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); Index: linux/arch/ppc/kernel/traps.c =================================================================== --- linux.orig/arch/ppc/kernel/traps.c +++ linux/arch/ppc/kernel/traps.c @@ -71,7 +71,7 @@ void (*debugger_fault_handler)(struct pt * Trap & Exception support */ -DEFINE_SPINLOCK(die_lock); +DEFINE_RAW_SPINLOCK(die_lock); int die(const char * str, struct pt_regs * fp, long err) { @@ -106,6 +106,10 @@ void _exception(int signr, struct pt_reg debugger(regs); die("Exception in kernel mode", regs, signr); } +#ifdef CONFIG_PREEMPT_RT + local_irq_enable(); + preempt_check_resched(); +#endif info.si_signo = signr; info.si_errno = 0; info.si_code = code; Index: linux/arch/ppc/lib/locks.c =================================================================== --- linux.orig/arch/ppc/lib/locks.c +++ linux/arch/ppc/lib/locks.c @@ -42,7 +42,7 @@ static inline unsigned long __spin_trylo return ret; } -void _raw_spin_lock(spinlock_t *lock) +void __raw_spin_lock(raw_spinlock_t *lock) { int cpu = smp_processor_id(); unsigned int stuck = INIT_STUCK; @@ -62,9 +62,9 @@ void _raw_spin_lock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); lock->owner_cpu = cpu; } -EXPORT_SYMBOL(_raw_spin_lock); +EXPORT_SYMBOL(__raw_spin_lock); -int _raw_spin_trylock(spinlock_t *lock) +int __raw_spin_trylock(raw_spinlock_t *lock) { if (__spin_trylock(&lock->lock)) return 0; @@ -72,9 +72,9 @@ int _raw_spin_trylock(spinlock_t *lock) lock->owner_pc = (unsigned long)__builtin_return_address(0); return 1; } -EXPORT_SYMBOL(_raw_spin_trylock); +EXPORT_SYMBOL(__raw_spin_trylock); -void _raw_spin_unlock(spinlock_t *lp) +void __raw_spin_unlock(raw_spinlock_t *lp) { if ( !lp->lock ) printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n", @@ -88,13 +88,13 @@ void _raw_spin_unlock(spinlock_t *lp) wmb(); lp->lock = 0; } -EXPORT_SYMBOL(_raw_spin_unlock); +EXPORT_SYMBOL(__raw_spin_unlock); /* * For rwlocks, zero is unlocked, -1 is write-locked, * positive is read-locked. 
*/ -static __inline__ int __read_trylock(rwlock_t *rw) +static __inline__ int __read_trylock(raw_rwlock_t *rw) { signed int tmp; @@ -114,13 +114,13 @@ static __inline__ int __read_trylock(rwl return tmp; } -int _raw_read_trylock(rwlock_t *rw) +int __raw_read_trylock(raw_rwlock_t *rw) { return __read_trylock(rw) > 0; } -EXPORT_SYMBOL(_raw_read_trylock); +EXPORT_SYMBOL(__raw_read_trylock); -void _raw_read_lock(rwlock_t *rw) +void __raw_read_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -135,9 +135,9 @@ void _raw_read_lock(rwlock_t *rw) } } } -EXPORT_SYMBOL(_raw_read_lock); +EXPORT_SYMBOL(__raw_read_lock); -void _raw_read_unlock(rwlock_t *rw) +void __raw_read_unlock(raw_rwlock_t *rw) { if ( rw->lock == 0 ) printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n", @@ -146,9 +146,9 @@ void _raw_read_unlock(rwlock_t *rw) wmb(); atomic_dec((atomic_t *) &(rw)->lock); } -EXPORT_SYMBOL(_raw_read_unlock); +EXPORT_SYMBOL(__raw_read_unlock); -void _raw_write_lock(rwlock_t *rw) +void __raw_write_lock(raw_rwlock_t *rw) { unsigned int stuck; @@ -164,18 +164,18 @@ void _raw_write_lock(rwlock_t *rw) } wmb(); } -EXPORT_SYMBOL(_raw_write_lock); +EXPORT_SYMBOL(__raw_write_lock); -int _raw_write_trylock(rwlock_t *rw) +int __raw_write_trylock(raw_rwlock_t *rw) { if (cmpxchg(&rw->lock, 0, -1) != 0) return 0; wmb(); return 1; } -EXPORT_SYMBOL(_raw_write_trylock); +EXPORT_SYMBOL(__raw_write_trylock); -void _raw_write_unlock(rwlock_t *rw) +void __raw_write_unlock(raw_rwlock_t *rw) { if (rw->lock >= 0) printk("_write_lock(): %s/%d (nip %08lX) lock %d\n", @@ -184,6 +184,6 @@ void _raw_write_unlock(rwlock_t *rw) wmb(); rw->lock = 0; } -EXPORT_SYMBOL(_raw_write_unlock); +EXPORT_SYMBOL(__raw_write_unlock); #endif Index: linux/arch/ppc/mm/fault.c =================================================================== --- linux.orig/arch/ppc/mm/fault.c +++ linux/arch/ppc/mm/fault.c @@ -89,7 +89,7 @@ static int store_updates_sp(struct pt_re * the error_code parameter is ESR for a data fault, 0 for an instruction * fault.
*/ -int do_page_fault(struct pt_regs *regs, unsigned long address, +int notrace do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct * vma; Index: linux/arch/ppc/mm/init.c =================================================================== --- linux.orig/arch/ppc/mm/init.c +++ linux/arch/ppc/mm/init.c @@ -55,7 +55,7 @@ #endif #define MAX_LOW_MEM CONFIG_LOWMEM_SIZE -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); unsigned long total_memory; unsigned long total_lowmem; Index: linux/arch/ppc/platforms/apus_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/apus_setup.c +++ linux/arch/ppc/platforms/apus_setup.c @@ -275,6 +275,7 @@ void apus_calibrate_decr(void) freq/1000000, freq%1000000); tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; __bus_speed = bus_speed; __speed_test_failed = speed_test_failed; Index: linux/arch/ppc/platforms/ev64260.c =================================================================== --- linux.orig/arch/ppc/platforms/ev64260.c +++ linux/arch/ppc/platforms/ev64260.c @@ -550,6 +550,7 @@ ev64260_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/ppc/platforms/gemini_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/gemini_setup.c +++ linux/arch/ppc/platforms/gemini_setup.c @@ -459,6 +459,7 @@ void __init gemini_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } unsigned long __init gemini_find_end_of_memory(void) Index: linux/arch/ppc/platforms/hdpu.c =================================================================== --- linux.orig/arch/ppc/platforms/hdpu.c +++ linux/arch/ppc/platforms/hdpu.c @@ -55,7 +55,7 @@ static void parse_bootinfo(unsigned long static void hdpu_set_l1pe(void); static void hdpu_cpustate_set(unsigned char new_state); #ifdef CONFIG_SMP -static DEFINE_SPINLOCK(timebase_lock); +static DEFINE_RAW_SPINLOCK(timebase_lock); static unsigned int timebase_upper = 0, timebase_lower = 0; extern int smp_tb_synchronized; Index: linux/arch/ppc/platforms/powerpmc250.c =================================================================== --- linux.orig/arch/ppc/platforms/powerpmc250.c +++ linux/arch/ppc/platforms/powerpmc250.c @@ -163,6 +163,7 @@ powerpmc250_calibrate_decr(void) tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void Index: linux/arch/ppc/platforms/prep_setup.c =================================================================== --- linux.orig/arch/ppc/platforms/prep_setup.c +++ linux/arch/ppc/platforms/prep_setup.c @@ -940,6 +940,7 @@ prep_calibrate_decr(void) (freq/divisor)/1000000, (freq/divisor)%1000000); tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; tb_ticks_per_jiffy = freq / HZ / divisor; } } Index: linux/arch/ppc/platforms/prpmc750.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc750.c +++ linux/arch/ppc/platforms/prpmc750.c @@ -268,6 +268,7 @@ static void __init prpmc750_calibrate_de tb_ticks_per_jiffy = freq / (HZ * divisor); tb_to_us = 
mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static void prpmc750_restart(char *cmd) Index: linux/arch/ppc/platforms/prpmc800.c =================================================================== --- linux.orig/arch/ppc/platforms/prpmc800.c +++ linux/arch/ppc/platforms/prpmc800.c @@ -327,6 +327,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = 100000000 / 4; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; return; } @@ -367,6 +368,7 @@ static void __init prpmc800_calibrate_de tb_ticks_per_second = (tbl_end - tbl_start) * 2; tb_ticks_per_jiffy = tb_ticks_per_second / HZ; tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000); + cpu_khz = tb_ticks_per_second / 1000; } static void prpmc800_restart(char *cmd) Index: linux/arch/ppc/platforms/sbc82xx.c =================================================================== --- linux.orig/arch/ppc/platforms/sbc82xx.c +++ linux/arch/ppc/platforms/sbc82xx.c @@ -65,7 +65,7 @@ static void sbc82xx_time_init(void) static volatile char *sbc82xx_i8259_map; static char sbc82xx_i8259_mask = 0xff; -static DEFINE_SPINLOCK(sbc82xx_i8259_lock); +static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock); static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr) { Index: linux/arch/ppc/platforms/spruce.c =================================================================== --- linux.orig/arch/ppc/platforms/spruce.c +++ linux/arch/ppc/platforms/spruce.c @@ -147,6 +147,7 @@ spruce_calibrate_decr(void) freq = SPRUCE_BUS_SPEED; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } static int Index: linux/arch/ppc/syslib/cpm2_common.c =================================================================== --- linux.orig/arch/ppc/syslib/cpm2_common.c +++ linux/arch/ppc/syslib/cpm2_common.c @@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di /* * dpalloc / dpfree bits. */ -static spinlock_t cpm_dpmem_lock; +static raw_spinlock_t cpm_dpmem_lock; /* 16 blocks should be enough to satisfy all requests * until the memory subsystem goes up... 
*/ static rh_block_t cpm_boot_dpmem_rh_block[16]; Index: linux/arch/ppc/syslib/ibm44x_common.c =================================================================== --- linux.orig/arch/ppc/syslib/ibm44x_common.c +++ linux/arch/ppc/syslib/ibm44x_common.c @@ -63,6 +63,7 @@ void __init ibm44x_calibrate_decr(unsign { tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/m8260_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8260_setup.c +++ linux/arch/ppc/syslib/m8260_setup.c @@ -79,6 +79,7 @@ m8260_calibrate_decr(void) divisor = 4; tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; } /* The 8260 has an internal 1-second timer update register that Index: linux/arch/ppc/syslib/m8xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/m8xx_setup.c +++ linux/arch/ppc/syslib/m8xx_setup.c @@ -218,6 +218,7 @@ void __init m8xx_calibrate_decr(void) printk("Decrementer Frequency = %d/%d\n", freq, divisor); tb_ticks_per_jiffy = freq / HZ / divisor; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Perform some more timer/timebase initialization. This used * to be done elsewhere, but other changes caused it to get Index: linux/arch/ppc/syslib/mpc52xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/mpc52xx_setup.c +++ linux/arch/ppc/syslib/mpc52xx_setup.c @@ -215,6 +215,7 @@ mpc52xx_calibrate_decr(void) tb_ticks_per_jiffy = xlbfreq / HZ / divisor; tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000); + cpu_khz = (xlbfreq / divisor) / 1000; } Index: linux/arch/ppc/syslib/ocp.c =================================================================== --- linux.orig/arch/ppc/syslib/ocp.c +++ linux/arch/ppc/syslib/ocp.c @@ -44,11 +44,11 @@ #include #include #include +#include #include #include #include -#include #include //#define DBG(x) printk x Index: linux/arch/ppc/syslib/open_pic.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic.c +++ linux/arch/ppc/syslib/open_pic.c @@ -526,7 +526,7 @@ void openpic_reset_processor_phys(u_int } #if defined(CONFIG_SMP) || defined(CONFIG_PM) -static DEFINE_SPINLOCK(openpic_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic_setup_lock); #endif #ifdef CONFIG_SMP Index: linux/arch/ppc/syslib/open_pic2.c =================================================================== --- linux.orig/arch/ppc/syslib/open_pic2.c +++ linux/arch/ppc/syslib/open_pic2.c @@ -380,7 +380,7 @@ static void openpic2_set_spurious(u_int vec); } -static DEFINE_SPINLOCK(openpic2_setup_lock); +static DEFINE_RAW_SPINLOCK(openpic2_setup_lock); /* * Initialize a timer interrupt (and disable it) Index: linux/arch/ppc/syslib/ppc4xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc4xx_setup.c +++ linux/arch/ppc/syslib/ppc4xx_setup.c @@ -172,6 +172,7 @@ ppc4xx_calibrate_decr(void) freq = bip->bi_tbfreq; tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; /* Set the time base to zero. ** At 200 Mhz, time base will rollover in ~2925 years. 
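All of the calibrate_decr() hunks above and the mpc85xx/todc ones below add the same computation: scale the timebase input clock by the board divisor and convert to kHz, so the latency tracer has a cpu_khz value to work with on these boards. A sketch of the repeated pattern (hypothetical helper; the patch open-codes it per board):

	/*
	 * Sketch: the cpu_khz derivation each board's calibrate_decr()
	 * open-codes. 'freq' is the raw input clock in Hz, 'divisor'
	 * the board-specific prescaler (1 where none applies).
	 */
	static inline unsigned long decr_cpu_khz(unsigned long freq,
						 unsigned long divisor)
	{
		return (freq / divisor) / 1000;
	}

For instance, mpc85xx below uses divisor = 8, m8xx uses the divisor it prints, and the boards without a prescaler pass the frequency straight through.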
Index: linux/arch/ppc/syslib/ppc85xx_setup.c =================================================================== --- linux.orig/arch/ppc/syslib/ppc85xx_setup.c +++ linux/arch/ppc/syslib/ppc85xx_setup.c @@ -57,6 +57,7 @@ mpc85xx_calibrate_decr(void) divisor = 8; tb_ticks_per_jiffy = freq / divisor / HZ; tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000); + cpu_khz = (freq / divisor) / 1000; /* Set the time base to zero */ mtspr(SPRN_TBWL, 0); Index: linux/arch/ppc/syslib/todc_time.c =================================================================== --- linux.orig/arch/ppc/syslib/todc_time.c +++ linux/arch/ppc/syslib/todc_time.c @@ -506,6 +506,7 @@ todc_calibrate_decr(void) tb_ticks_per_jiffy = freq / HZ; tb_to_us = mulhwu_scale_factor(freq, 1000000); + cpu_khz = freq / 1000; return; } Index: linux/arch/sparc64/Kconfig =================================================================== --- linux.orig/arch/sparc64/Kconfig +++ linux/arch/sparc64/Kconfig @@ -26,7 +26,7 @@ config MMU bool default y -config TIME_INTERPOLATION +config GENERIC_TIME bool default y Index: linux/arch/sparc64/defconfig =================================================================== --- linux.orig/arch/sparc64/defconfig +++ linux/arch/sparc64/defconfig @@ -7,7 +7,7 @@ CONFIG_SPARC=y CONFIG_SPARC64=y CONFIG_64BIT=y CONFIG_MMU=y -CONFIG_TIME_INTERPOLATION=y +CONFIG_GENERIC_TIME=y CONFIG_ARCH_MAY_HAVE_PC_FDC=y CONFIG_SPARC64_PAGE_SIZE_8KB=y # CONFIG_SPARC64_PAGE_SIZE_64KB is not set Index: linux/arch/sparc64/kernel/time.c =================================================================== --- linux.orig/arch/sparc64/kernel/time.c +++ linux/arch/sparc64/kernel/time.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -621,7 +622,7 @@ static void __init set_system_time(void) if (!mregs && !dregs) { prom_printf("Something wrong, clock regs not mapped yet.\n"); prom_halt(); - } + } if (mregs) { spin_lock_irq(&mostek_lock); @@ -821,7 +822,7 @@ static int __devinit clock_probe(struct } set_system_time(); - + local_irq_restore(flags); return 0; @@ -976,22 +977,33 @@ static struct notifier_block sparc64_cpu #endif /* CONFIG_CPU_FREQ */ -static struct time_interpolator sparc64_cpu_interpolator = { - .source = TIME_SOURCE_CPU, - .shift = 16, - .mask = 0xffffffffffffffffLL +static cycle_t read_itc(void) +{ + return (cycle_t)get_cycles(); +} + +static struct clocksource clocksource_sparc64_itc = { + .name = "sparc64_itc", + .rating = 300, + .read = read_itc, + .mask = 0xffffffffffffffffLL, + .mult = 0, /* to be calculated */ + .shift = 16, + .is_continuous = 1, }; + /* The quotient formula is taken from the IA64 port. */ #define SPARC64_NSEC_PER_CYC_SHIFT 30UL void __init time_init(void) { unsigned long clock = sparc64_init_timers(); - sparc64_cpu_interpolator.frequency = clock; - register_time_interpolator(&sparc64_cpu_interpolator); + clocksource_sparc64_itc.mult = clocksource_hz2mult(clock, + clocksource_sparc64_itc.shift); + clocksource_register(&clocksource_sparc64_itc); - /* Now that the interpolator is registered, it is + /* Now that the clocksource is registered, it is * safe to start the timer ticking. */ sparc64_start_timers(); @@ -1026,11 +1038,11 @@ static int set_rtc_mmss(unsigned long no unsigned long flags; u8 tmp; - /* + /* * Not having a register set can lead to trouble. * Also starfire doesn't have a tod clock.
*/ - if (!mregs && !dregs) + if (!mregs && !dregs) return -1; if (mregs) { Index: linux/arch/v850/Kconfig =================================================================== --- linux.orig/arch/v850/Kconfig +++ linux/arch/v850/Kconfig @@ -34,6 +34,10 @@ config GENERIC_IRQ_PROBE bool default y +config GENERIC_TIME + bool + default y + config TIME_LOW_RES bool default y Index: linux/arch/v850/kernel/time.c =================================================================== --- linux.orig/arch/v850/kernel/time.c +++ linux/arch/v850/kernel/time.c @@ -99,81 +99,6 @@ static irqreturn_t timer_interrupt (int return IRQ_HANDLED; } -/* - * This version of gettimeofday has near microsecond resolution. - */ -void do_gettimeofday (struct timeval *tv) -{ -#if 0 /* DAVIDM later if possible */ - extern volatile unsigned long lost_ticks; - unsigned long lost; -#endif - unsigned long flags; - unsigned long usec, sec; - unsigned long seq; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - -#if 0 - usec = mach_gettimeoffset ? mach_gettimeoffset () : 0; -#else - usec = 0; -#endif -#if 0 /* DAVIDM later if possible */ - lost = lost_ticks; - if (lost) - usec += lost * (1000000/HZ); -#endif - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; -} - -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq (&xtime_lock); - - /* This is revolting. We need to set the xtime.tv_nsec - * correctly. However, the value in this location is - * is value at the last tick. - * Discover what correction gettimeofday - * would have done, and then undo it! - */ -#if 0 - tv->tv_nsec -= mach_gettimeoffset() * 1000; -#endif - - while (tv->tv_nsec < 0) { - tv->tv_nsec += NSEC_PER_SEC; - tv->tv_sec--; - } - - xtime.tv_sec = tv->tv_sec; - xtime.tv_nsec = tv->tv_nsec; - - ntp_clear(); - - write_sequnlock_irq (&xtime_lock); - clock_was_set(); - return 0; -} - -EXPORT_SYMBOL(do_settimeofday); - static int timer_dev_id; static struct irqaction timer_irqaction = { timer_interrupt, Index: linux/arch/x86_64/Kconfig =================================================================== --- linux.orig/arch/x86_64/Kconfig +++ linux/arch/x86_64/Kconfig @@ -24,6 +24,14 @@ config X86 bool default y +config GENERIC_TIME + bool + default y + +config GENERIC_TIME_VSYSCALL + bool + default y + config LOCKDEP_SUPPORT bool default y @@ -46,13 +54,6 @@ config ISA config SBUS bool -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y @@ -289,6 +290,14 @@ config NUMA If the system is EM64T, you should say N unless your system is EM64T NUMA. 
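With GENERIC_TIME selected, hand-rolled implementations like the v850 do_gettimeofday()/do_settimeofday() pair deleted above become unnecessary: the generic timekeeping core reads the registered clocksource and converts the elapsed cycles with the mult/shift pair. A rough sketch of that central conversion (simplified from the generic code):

	/*
	 * Sketch: the cycles-to-nanoseconds step GENERIC_TIME performs
	 * on every gettimeofday(). 'cs' is a registered clocksource,
	 * 'cycle_last' the counter value at the last timer update.
	 */
	static inline s64 delta_ns_sketch(struct clocksource *cs,
					  u64 cycle_now, u64 cycle_last)
	{
		u64 delta = (cycle_now - cycle_last) & cs->mask;

		return (s64)((delta * cs->mult) >> cs->shift);
	}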
+config RWSEM_GENERIC_SPINLOCK + bool + default y + +config RWSEM_XCHGADD_ALGORITHM + depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT + bool + config K8_NUMA bool "Old style AMD Opteron NUMA detection" depends on NUMA @@ -659,3 +668,6 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "kernel/time/Kconfig" + Index: linux/arch/x86_64/ia32/ia32entry.S =================================================================== --- linux.orig/arch/x86_64/ia32/ia32entry.S +++ linux/arch/x86_64/ia32/ia32entry.S @@ -119,7 +119,9 @@ sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -227,7 +229,9 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) cli @@ -320,8 +324,10 @@ ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys IA32_ARG_FIXUP + TRACE_SYS_IA32_CALL call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) jmp int_ret_from_sys_call @@ -390,7 +396,7 @@ END(ia32_ptregs_common) .section .rodata,"a" .align 8 -ia32_sys_call_table: +ENTRY(ia32_sys_call_table) .quad sys_restart_syscall .quad sys_exit .quad stub32_fork @@ -713,4 +719,7 @@ ia32_sys_call_table: .quad sys_tee .quad compat_sys_vmsplice .quad compat_sys_move_pages +#ifdef CONFIG_LATENCY_TRACE +.globl ia32_syscall_end +#endif ia32_syscall_end: Index: linux/arch/x86_64/kernel/Makefile =================================================================== --- linux.orig/arch/x86_64/kernel/Makefile +++ linux/arch/x86_64/kernel/Makefile @@ -8,7 +8,7 @@ obj-y := process.o signal.o entry.o trap ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_x86_64.o \ x8664_ksyms.o i387.o syscall.o vsyscall.o \ setup64.o bootflag.o e820.o reboot.o quirks.o i8237.o \ - pci-dma.o pci-nommu.o alternative.o + pci-dma.o pci-nommu.o alternative.o hpet.o tsc.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_X86_MCE) += mce.o Index: linux/arch/x86_64/kernel/apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/apic.c +++ linux/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -36,9 +37,9 @@ #include #include #include +#include int apic_verbosity; -int apic_runs_main_timer; int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; @@ -52,6 +53,25 @@ static cpumask_t timer_interrupt_broadca /* Using APIC to generate smp_local_timer_interrupt? 
*/ int using_apic_timer __read_mostly = 0; + +static unsigned int calibration_result; + +static void lapic_next_event(unsigned long delta, struct clock_event *evt); +static void lapic_timer_setup(int mode, struct clock_event *evt); + +static struct clock_event lapic_clockevent = { + .name = "lapic", + .capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE +#ifdef CONFIG_SMP + | CLOCK_CAP_UPDATE +#endif + , + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, +}; +static DEFINE_PER_CPU(struct clock_event, lapic_events); + static void apic_pm_activate(void); void enable_NMI_through_LVT0 (void * dummy) @@ -527,8 +547,7 @@ static int lapic_suspend(struct sys_devi apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - local_save_flags(flags); - local_irq_disable(); + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); return 0; @@ -696,13 +715,16 @@ void __init init_apic_mappings(void) #define APIC_DIVISOR 16 -static void __setup_APIC_LVTT(unsigned int clocks) +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) { unsigned int lvtt_value, tmp_value, ver; int cpu = smp_processor_id(); ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) lvtt_value |= APIC_LVT_MASKED; @@ -717,48 +739,34 @@ static void __setup_APIC_LVTT(unsigned i & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | APIC_TDR_DIV_16); - apic_write(APIC_TMICT, clocks/APIC_DIVISOR); + if (!oneshot) + apic_write(APIC_TMICT, clocks/APIC_DIVISOR); } -static void setup_APIC_timer(unsigned int clocks) +static void lapic_next_event(unsigned long delta, struct clock_event *evt) +{ + apic_write(APIC_TMICT, delta); +} + +static void lapic_timer_setup(int mode, struct clock_event *evt) { unsigned long flags; local_irq_save(flags); - - /* wait for irq slice */ - if (vxtime.hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { - int c1, c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - do { - c1 = c2; - outb_p(0x00, 0x43); - c2 = inb_p(0x40); - c2 |= inb_p(0x40) << 8; - } while (c2 - c1 < 300); - } - __setup_APIC_LVTT(clocks); - /* Turn off PIT interrupt if we use APIC timer as main timer. - Only works with the PM timer right now - TBD fix it for HPET too. */ - if (vxtime.mode == VXTIME_PMTMR && - smp_processor_id() == boot_cpu_id && - apic_runs_main_timer == 1 && - !cpu_isset(boot_cpu_id, timer_interrupt_broadcast_ipi_mask)) { - stop_timer_interrupt(); - apic_runs_main_timer++; - } + __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); local_irq_restore(flags); } + +static void __devinit setup_APIC_timer(void) +{ + struct clock_event *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + + register_local_clockevent(levt); +} + /* * In this function we calibrate APIC bus clocks to the external * timer. 
Unfortunately we cannot use jiffies and the timer irq @@ -778,12 +786,13 @@ static int __init calibrate_APIC_clock(v { int apic, apic_start, tsc, tsc_start; int result; + u64 wallclock_nsecs; /* * Put whatever arbitrary (but long enough) timeout * value into the APIC clock, we just want to get the * counter running for calibration. */ - __setup_APIC_LVTT(1000000000); + __setup_APIC_LVTT(1000000000, 0); apic_start = apic_read(APIC_TMCCT); #ifdef CONFIG_X86_PM_TIMER @@ -791,6 +800,8 @@ static int __init calibrate_APIC_clock(v pmtimer_wait(5000); /* 5ms wait */ apic = apic_read(APIC_TMCCT); result = (apic_start - apic) * 1000L / 5; + printk("using pmtimer for lapic calibration\n"); + wallclock_nsecs = 5000000; } else #endif { @@ -804,6 +815,8 @@ static int __init calibrate_APIC_clock(v result = (apic_start - apic) * 1000L * cpu_khz / (tsc - tsc_start); + wallclock_nsecs = ((u64)tsc - (u64)tsc_start) * 1000000 / (u64)cpu_khz; + } printk("result %d\n", result); @@ -811,11 +824,22 @@ static int __init calibrate_APIC_clock(v printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", result / 1000 / 1000, result / 1000 % 1000); + + + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(apic_start - apic, wallclock_nsecs, 32); + + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + printk("lapic max_delta_ns: %ld\n", lapic_clockevent.max_delta_ns); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + return result * APIC_DIVISOR / HZ; } -static unsigned int calibration_result; - void __init setup_boot_APIC_clock (void) { if (disable_apic_timer) { @@ -832,7 +856,7 @@ void __init setup_boot_APIC_clock (void) /* * Now set up the timer for real. */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -840,7 +864,7 @@ void __init setup_boot_APIC_clock (void) void __cpuinit setup_secondary_APIC_clock(void) { local_irq_disable(); /* FIXME: Do we need this? --RR */ - setup_APIC_timer(calibration_result); + setup_APIC_timer(); local_irq_enable(); } @@ -887,6 +911,13 @@ void switch_APIC_timer_to_ipi(void *cpum !cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) { disable_APIC_timer(); cpu_set(cpu, timer_interrupt_broadcast_ipi_mask); +#ifdef CONFIG_HIGH_RES_TIMERS + printk("Disabling NO_HZ and high resolution timers " + "due to timer broadcasting\n"); + for_each_possible_cpu(cpu) + per_cpu(lapic_events, cpu).capabilities &= + ~CLOCK_CAP_NEXTEVT; +#endif } } EXPORT_SYMBOL(switch_APIC_timer_to_ipi); @@ -945,8 +976,6 @@ void smp_local_timer_interrupt(struct pt #ifdef CONFIG_SMP update_process_times(user_mode(regs)); #endif - if (apic_runs_main_timer > 1 && smp_processor_id() == boot_cpu_id) - main_timer_handler(regs); /* * We take the 'long' return path, and there every subsystem * grabs the appropriate locks (kernel lock/ irq lock). @@ -969,6 +998,8 @@ void smp_local_timer_interrupt(struct pt */ void smp_apic_timer_interrupt(struct pt_regs *regs) { + int cpu = smp_processor_id(); + struct clock_event *evt = &per_cpu(lapic_events, cpu); /* * the NMI deadlock-detector uses this. 
*/ @@ -986,7 +1017,7 @@ void smp_apic_timer_interrupt(struct pt_ */ exit_idle(); irq_enter(); - smp_local_timer_interrupt(regs); + evt->event_handler(regs); irq_exit(); } @@ -1161,26 +1192,11 @@ static __init int setup_noapictimer(char return 1; } -static __init int setup_apicmaintimer(char *str) -{ - apic_runs_main_timer = 1; - nohpet = 1; - return 1; -} -__setup("apicmaintimer", setup_apicmaintimer); - -static __init int setup_noapicmaintimer(char *str) -{ - apic_runs_main_timer = -1; - return 1; -} -__setup("noapicmaintimer", setup_noapicmaintimer); - static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; notsc_setup(NULL); - return setup_apicmaintimer(NULL); + return 1; } __setup("apicpmtimer", setup_apicpmtimer); Index: linux/arch/x86_64/kernel/early_printk.c =================================================================== --- linux.orig/arch/x86_64/kernel/early_printk.c +++ linux/arch/x86_64/kernel/early_printk.c @@ -203,7 +203,7 @@ static int early_console_initialized = 0 void early_printk(const char *fmt, ...) { - char buf[512]; + static char buf[512]; int n; va_list ap; Index: linux/arch/x86_64/kernel/entry.S =================================================================== --- linux.orig/arch/x86_64/kernel/entry.S +++ linux/arch/x86_64/kernel/entry.S @@ -45,6 +45,47 @@ .code64 +#ifdef CONFIG_LATENCY_TRACE + +ENTRY(mcount) + cmpq $0, mcount_enabled + jz out + + push %rbp + mov %rsp,%rbp + + push %r11 + push %r10 + push %r9 + push %r8 + push %rdi + push %rsi + push %rdx + push %rcx + push %rax + + mov 0x0(%rbp),%rax + mov 0x8(%rbp),%rdi + mov 0x8(%rax),%rsi + + call __trace + + pop %rax + pop %rcx + pop %rdx + pop %rsi + pop %rdi + pop %r8 + pop %r9 + pop %r10 + pop %r11 + + pop %rbp +out: + ret + +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif @@ -221,7 +262,9 @@ ENTRY(system_call) cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx + TRACE_SYS_CALL call *sys_call_table(,%rax,8) # XXX: rip relative + TRACE_SYS_RET movq %rax,RAX-ARGOFFSET(%rsp) /* * Syscall return path ending with SYSRET (fast path) @@ -255,8 +298,8 @@ sysret_check: /* edx: work, edi: workmask */ sysret_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc sysret_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz sysret_signal TRACE_IRQS_ON sti pushq %rdi @@ -279,7 +322,7 @@ sysret_signal: leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ cli @@ -303,7 +346,9 @@ tracesys: cmpq $__NR_syscall_max,%rax ja 1f movq %r10,%rcx /* fixup for C */ + TRACE_SYS_CALL call *sys_call_table(,%rax,8) + TRACE_SYS_RET 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ jmp int_ret_from_sys_call @@ -349,8 +394,8 @@ int_with_check: /* First do a reschedule test. 
*/ /* edx: work, edi: workmask */ int_careful: - bt $TIF_NEED_RESCHED,%edx - jnc int_very_careful + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz int_very_careful TRACE_IRQS_ON sti pushq %rdi @@ -387,7 +432,7 @@ int_signal: movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 call do_notify_resume -1: movl $_TIF_NEED_RESCHED,%edi +1: movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi int_restore_rest: RESTORE_REST cli @@ -585,8 +630,8 @@ bad_iret: /* edi: workmask, edx: work */ retint_careful: CFI_RESTORE_STATE - bt $TIF_NEED_RESCHED,%edx - jnc retint_signal + testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx + jz retint_signal TRACE_IRQS_ON sti pushq %rdi @@ -612,7 +657,7 @@ retint_signal: RESTORE_REST cli TRACE_IRQS_OFF - movl $_TIF_NEED_RESCHED,%edi + movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi GET_THREAD_INFO(%rcx) jmp retint_check Index: linux/arch/x86_64/kernel/head64.c =================================================================== --- linux.orig/arch/x86_64/kernel/head64.c +++ linux/arch/x86_64/kernel/head64.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -74,7 +75,7 @@ static void __init setup_boot_cpu_data(v boot_cpu_data.x86_mask = eax & 0xf; } -void __init x86_64_start_kernel(char * real_mode_data) +void __init notrace x86_64_start_kernel(char * real_mode_data) { char *s; int i; @@ -99,6 +100,7 @@ void __init x86_64_start_kernel(char * r cpu_pda(i) = &boot_cpu_pda[i]; pda_init(0); + copy_bootdata(real_mode_data); #ifdef CONFIG_SMP cpu_set(0, cpu_online_map); @@ -120,5 +122,6 @@ void __init x86_64_start_kernel(char * r panic("Kernel too big for kernel mapping\n"); setup_boot_cpu_data(); + start_kernel(); } Index: linux/arch/x86_64/kernel/hpet.c =================================================================== --- /dev/null +++ linux/arch/x86_64/kernel/hpet.c @@ -0,0 +1,475 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int nohpet __initdata = 0; + +unsigned long hpet_address; +static unsigned long hpet_period; /* fsecs / HPET clock */ +unsigned long hpet_tick; /* HPET clocks / interrupt */ +int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ + +#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) + +/* + * hpet_calibrate_tsc() calibrates the processor TSC in a very simple way, + * comparing it to the HPET timer of known frequency. + */ + +#define TICK_COUNT 100000000 + +unsigned int __init hpet_calibrate_tsc(void) +{ + int tsc_start, hpet_start; + int tsc_now, hpet_now; + unsigned long flags; + + local_irq_save(flags); + local_irq_disable(); + + hpet_start = hpet_readl(HPET_COUNTER); + rdtscl(tsc_start); + + do { + local_irq_disable(); + hpet_now = hpet_readl(HPET_COUNTER); + tsc_now = get_cycles_sync(); + local_irq_restore(flags); + } while ((tsc_now - tsc_start) < TICK_COUNT && + (hpet_now - hpet_start) < TICK_COUNT); + + return (tsc_now - tsc_start) * 1000000000L + / ((hpet_now - hpet_start) * hpet_period / 1000); +} + + + +#ifdef CONFIG_HPET +static __init int late_hpet_init(void) +{ + struct hpet_data hd; + unsigned int ntimer; + + if (!hpet_address) + return 0; + + memset(&hd, 0, sizeof (hd)); + + ntimer = hpet_readl(HPET_ID); + ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; + ntimer++; + + /* + * Register with driver. + * Timer0 and Timer1 are used by the platform.
+ */ + hd.hd_phys_address = hpet_address; + hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); + hd.hd_nirqs = ntimer; + hd.hd_flags = HPET_DATA_PLATFORM; + hpet_reserve_timer(&hd, 0); +#ifdef CONFIG_HPET_EMULATE_RTC + hpet_reserve_timer(&hd, 1); +#endif + hd.hd_irq[0] = HPET_LEGACY_8254; + hd.hd_irq[1] = HPET_LEGACY_RTC; + if (ntimer > 2) { + struct hpet *hpet; + struct hpet_timer *timer; + int i; + + hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); + timer = &hpet->hpet_timers[2]; + for (i = 2; i < ntimer; timer++, i++) + hd.hd_irq[i] = (timer->hpet_config & + Tn_INT_ROUTE_CNF_MASK) >> + Tn_INT_ROUTE_CNF_SHIFT; + + } + + hpet_alloc(&hd); + return 0; +} +fs_initcall(late_hpet_init); +#endif + +static int hpet_timer_stop_set_go(unsigned long tick) +{ + unsigned int cfg; + +/* + * Stop the timers and reset the main counter. + */ + + cfg = hpet_readl(HPET_CFG); + cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); + hpet_writel(cfg, HPET_CFG); + hpet_writel(0, HPET_COUNTER); + hpet_writel(0, HPET_COUNTER + 4); + +/* + * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, + * and period also hpet_tick. + */ + if (hpet_use_timer) { + hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | + HPET_TN_32BIT, HPET_T0_CFG); + hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ + hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ + cfg |= HPET_CFG_LEGACY; + } +/* + * Go! + */ + + cfg |= HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); + + return 0; +} + +int hpet_arch_init(void) +{ + unsigned int id; + + if (!hpet_address) + return -1; + set_fixmap_nocache(FIX_HPET_BASE, hpet_address); + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); + +/* + * Read the period, compute tick and quotient. + */ + + id = hpet_readl(HPET_ID); + + if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) + return -1; + + hpet_period = hpet_readl(HPET_PERIOD); + if (hpet_period < 100000 || hpet_period > 100000000) + return -1; + + hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; + + hpet_use_timer = (id & HPET_ID_LEGSUP); + + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_reenable(void) +{ + return hpet_timer_stop_set_go(hpet_tick); +} + +int hpet_stop(void) +{ + return hpet_timer_stop_set_go(0); +} + +#ifdef CONFIG_HPET_EMULATE_RTC +/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When HPET + * is enabled, we support RTC interrupt functionality in software. + * RTC has 3 kinds of interrupts: + * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock + * is updated + * 2) Alarm Interrupt - generate an interrupt at a specific time of day + * 3) Periodic Interrupt - generate periodic interrupt, with frequencies + * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) + * (1) and (2) above are implemented using polling at a frequency of + * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt + * overhead. (DEFAULT_RTC_INT_FREQ) + * For (3), we use interrupts at 64Hz or user specified periodic + * frequency, whichever is higher.
+ */ +#include + +#define DEFAULT_RTC_INT_FREQ 64 +#define RTC_NUM_INTS 1 + +static unsigned long UIE_on; +static unsigned long prev_update_sec; + +static unsigned long AIE_on; +static struct rtc_time alarm_time; + +static unsigned long PIE_on; +static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; +static unsigned long PIE_count; + +static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ +static unsigned int hpet_t1_cmp; /* cached comparator register */ + +int is_hpet_enabled(void) +{ + return hpet_address != 0; +} + +/* + * Timer 1 for RTC: we do not use the periodic interrupt feature, + * even if HPET supports periodic interrupts on Timer 1. + * The reason is that setting up a periodic interrupt in HPET requires + * stopping the main counter, and doing that every time someone disables/enables + * the RTC would have an adverse effect on the main kernel timer running on Timer 0. + * So, for the time being, simulate the periodic interrupt in software. + * + * hpet_rtc_timer_init() is called for the first interrupt; reinit for + * subsequent interrupts happens through hpet_rtc_timer_reinit(). + */ +int hpet_rtc_timer_init(void) +{ + unsigned int cfg, cnt; + unsigned long flags; + + if (!is_hpet_enabled()) + return 0; + /* + * Set up counter 1 and enable the interrupts. + */ + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + local_irq_save(flags); + cnt = hpet_readl(HPET_COUNTER); + cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; + local_irq_restore(flags); + + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_PERIODIC; + cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; + hpet_writel(cfg, HPET_T1_CFG); + + return 1; +} + +static void hpet_rtc_timer_reinit(void) +{ + unsigned int cfg, cnt; + + if (unlikely(!(PIE_on | AIE_on | UIE_on))) { + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); + return; + } + + if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) + hpet_rtc_int_freq = PIE_freq; + else + hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; + + /* It is more accurate to use the comparator value than the current count. */ + cnt = hpet_t1_cmp; + cnt += hpet_tick*HZ/hpet_rtc_int_freq; + hpet_writel(cnt, HPET_T1_CMP); + hpet_t1_cmp = cnt; +} + +/* + * The functions below are called from the rtc driver. + * Return 0 if HPET is not being used. + * Otherwise do the necessary changes and return 1.
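Some arithmetic behind the comparator updates above, assuming the common 14.31818 MHz HPET: hpet_tick is HPET cycles per kernel tick, so hpet_tick * HZ is the HPET input frequency in cycles per second, and at the default 64 Hz emulation rate the comparator advances by roughly 14318180 / 64 = 223721 cycles, i.e. one emulated RTC interrupt every ~15.6 ms.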
+ */ +int hpet_mask_rtc_irq_bit(unsigned long bit_mask) +{ + if (!is_hpet_enabled()) + return 0; + + if (bit_mask & RTC_UIE) + UIE_on = 0; + if (bit_mask & RTC_PIE) + PIE_on = 0; + if (bit_mask & RTC_AIE) + AIE_on = 0; + + return 1; +} + +int hpet_set_rtc_irq_bit(unsigned long bit_mask) +{ + int timer_init_reqd = 0; + + if (!is_hpet_enabled()) + return 0; + + if (!(PIE_on | AIE_on | UIE_on)) + timer_init_reqd = 1; + + if (bit_mask & RTC_UIE) { + UIE_on = 1; + } + if (bit_mask & RTC_PIE) { + PIE_on = 1; + PIE_count = 0; + } + if (bit_mask & RTC_AIE) { + AIE_on = 1; + } + + if (timer_init_reqd) + hpet_rtc_timer_init(); + + return 1; +} + +int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) +{ + if (!is_hpet_enabled()) + return 0; + + alarm_time.tm_hour = hrs; + alarm_time.tm_min = min; + alarm_time.tm_sec = sec; + + return 1; +} + +int hpet_set_periodic_freq(unsigned long freq) +{ + if (!is_hpet_enabled()) + return 0; + + PIE_freq = freq; + PIE_count = 0; + + return 1; +} + +int hpet_rtc_dropped_irq(void) +{ + if (!is_hpet_enabled()) + return 0; + + return 1; +} + +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +{ + struct rtc_time curr_time; + unsigned long rtc_int_flag = 0; + int call_rtc_interrupt = 0; + + hpet_rtc_timer_reinit(); + + if (UIE_on | AIE_on) { + rtc_get_rtc_time(&curr_time); + } + if (UIE_on) { + if (curr_time.tm_sec != prev_update_sec) { + /* Set update int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag = RTC_UF; + prev_update_sec = curr_time.tm_sec; + } + } + if (PIE_on) { + PIE_count++; + if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { + /* Set periodic int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_PF; + PIE_count = 0; + } + } + if (AIE_on) { + if ((curr_time.tm_sec == alarm_time.tm_sec) && + (curr_time.tm_min == alarm_time.tm_min) && + (curr_time.tm_hour == alarm_time.tm_hour)) { + /* Set alarm int info, call real rtc int routine */ + call_rtc_interrupt = 1; + rtc_int_flag |= RTC_AF; + } + } + if (call_rtc_interrupt) { + rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); + rtc_interrupt(rtc_int_flag, dev_id, regs); + } + return IRQ_HANDLED; +} +#endif + +static int __init nohpet_setup(char *s) +{ + nohpet = 1; + return 1; +} + +__setup("nohpet", nohpet_setup); + +#define HPET_MASK 0xFFFFFFFF +#define HPET_SHIFT 22 + +/* FSEC = 10^-15 NSEC = 10^-9 */ +#define FSEC_PER_NSEC 1000000 + +static void *hpet_ptr; + +static cycle_t read_hpet(void) +{ + return (cycle_t)readl(hpet_ptr); +} + +static cycle_t __vsyscall_fn vread_hpet(void) +{ + return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0); +} + +struct clocksource clocksource_hpet = { + .name = "hpet", + .rating = 250, + .read = read_hpet, + .mask = (cycle_t)HPET_MASK, + .mult = 0, /* set below */ + .shift = HPET_SHIFT, + .is_continuous = 1, + .vread = vread_hpet, +}; + +static int __init init_hpet_clocksource(void) +{ + unsigned long hpet_period; + void __iomem *hpet_base; + u64 tmp; + + if (!hpet_address) + return -ENODEV; + + /* calculate the hpet address: */ + hpet_base = + (void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE); + hpet_ptr = hpet_base + HPET_COUNTER; + + /* calculate the frequency: */ + hpet_period = readl(hpet_base + HPET_PERIOD); + + /* + * hpet period is in femtoseconds per cycle + * so we need to convert this to ns/cyc units + * approximated by mult/2^shift + * + * fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift + * fsec/cyc * 1ns/1000000fsec * 2^shift
= mult + * fsec/cyc * 2^shift * 1nsec/1000000fsec = mult + * (fsec/cyc << shift)/1000000 = mult + * (hpet_period << shift)/FSEC_PER_NSEC = mult + */ + tmp = (u64)hpet_period << HPET_SHIFT; + do_div(tmp, FSEC_PER_NSEC); + clocksource_hpet.mult = (u32)tmp; + + return clocksource_register(&clocksource_hpet); +} + +module_init(init_hpet_clocksource); Index: linux/arch/x86_64/kernel/i8259.c =================================================================== --- linux.orig/arch/x86_64/kernel/i8259.c +++ linux/arch/x86_64/kernel/i8259.c @@ -43,17 +43,10 @@ BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ BI(x,c) BI(x,d) BI(x,e) BI(x,f) -#define BUILD_15_IRQS(x) \ - BI(x,0) BI(x,1) BI(x,2) BI(x,3) \ - BI(x,4) BI(x,5) BI(x,6) BI(x,7) \ - BI(x,8) BI(x,9) BI(x,a) BI(x,b) \ - BI(x,c) BI(x,d) BI(x,e) - /* * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: * (these are usually mapped to vectors 0x20-0x2f) */ -BUILD_16_IRQS(0x0) #ifdef CONFIG_X86_LOCAL_APIC /* @@ -66,19 +59,14 @@ BUILD_16_IRQS(0x0) * * (these are usually mapped into the 0x30-0xff vector range) */ - BUILD_16_IRQS(0x1) BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) + BUILD_16_IRQS(0x2) BUILD_16_IRQS(0x3) BUILD_16_IRQS(0x4) BUILD_16_IRQS(0x5) BUILD_16_IRQS(0x6) BUILD_16_IRQS(0x7) BUILD_16_IRQS(0x8) BUILD_16_IRQS(0x9) BUILD_16_IRQS(0xa) BUILD_16_IRQS(0xb) -BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) - -#ifdef CONFIG_PCI_MSI - BUILD_15_IRQS(0xe) -#endif +BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) #endif #undef BUILD_16_IRQS -#undef BUILD_15_IRQS #undef BI @@ -91,26 +79,11 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -#define IRQLIST_15(x) \ - IRQ(x,0), IRQ(x,1), IRQ(x,2), IRQ(x,3), \ - IRQ(x,4), IRQ(x,5), IRQ(x,6), IRQ(x,7), \ - IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ - IRQ(x,c), IRQ(x,d), IRQ(x,e) - void (*interrupt[NR_IRQS])(void) = { - IRQLIST_16(0x0), - -#ifdef CONFIG_X86_IO_APIC - IRQLIST_16(0x1), IRQLIST_16(0x2), IRQLIST_16(0x3), + IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), - IRQLIST_16(0xc), IRQLIST_16(0xd) - -#ifdef CONFIG_PCI_MSI - , IRQLIST_15(0xe) -#endif - -#endif + IRQLIST_16(0xc), IRQLIST_16(0xd), IRQLIST_16(0xe), IRQLIST_16(0xf) }; #undef IRQ @@ -126,46 +99,21 @@ void (*interrupt[NR_IRQS])(void) = { * moves to arch independent land */ -DEFINE_SPINLOCK(i8259A_lock); - -static void end_8259A_irq (unsigned int irq) -{ - if (irq > 256) { - char var; - printk("return %p stack %p ti %p\n", __builtin_return_address(0), &var, task_thread_info(current)); - - BUG(); - } - - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) - enable_8259A_irq(irq); -} - -#define shutdown_8259A_irq disable_8259A_irq - static void mask_and_ack_8259A(unsigned int); -static unsigned int startup_8259A_irq(unsigned int irq) -{ - enable_8259A_irq(irq); - return 0; /* never anything pending */ -} - -static struct hw_interrupt_type i8259A_irq_type = { - .typename = "XT-PIC", - .startup = startup_8259A_irq, - .shutdown = shutdown_8259A_irq, - .enable = enable_8259A_irq, - .disable = disable_8259A_irq, - .ack = mask_and_ack_8259A, - .end = end_8259A_irq, +static struct irq_chip i8259A_chip = { + .name = "XT-PIC", + .mask = disable_8259A_irq, + .unmask = enable_8259A_irq, + .mask_ack = mask_and_ack_8259A, }; /* * 8259A PIC functions to handle ISA devices: */ +DEFINE_RAW_SPINLOCK(i8259A_lock); + /* * This 
contains the irq mask for both 8259A irq controllers, */ @@ -234,7 +182,7 @@ void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<= NR_IRQS) - break; if (vector != IA32_SYSCALL_VECTOR) set_intr_gate(vector, interrupt[i]); } @@ -557,7 +525,7 @@ void __init init_IRQ(void) * IRQ0 must be given a fixed assignment and initialized, * because it's used before the IO-APIC is set up. */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + __get_cpu_var(vector_irq)[FIRST_DEVICE_VECTOR] = 0; /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper Index: linux/arch/x86_64/kernel/io_apic.c =================================================================== --- linux.orig/arch/x86_64/kernel/io_apic.c +++ linux/arch/x86_64/kernel/io_apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,9 @@ #include #include #include +#include + +static int assign_irq_vector(int irq, cpumask_t mask); #define __apicdebuginit __init @@ -55,8 +59,8 @@ int timer_over_8254 __initdata = 0; /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +static DEFINE_RAW_SPINLOCK(ioapic_lock); +static DEFINE_RAW_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -81,14 +85,6 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; -int vector_irq[NR_VECTORS] __read_mostly = { [0 ... NR_VECTORS - 1] = -1}; -#ifdef CONFIG_PCI_MSI -#define vector_to_irq(vector) \ - (platform_legacy_irq(vector) ? vector : vector_irq[vector]) -#else -#define vector_to_irq(vector) (vector) -#endif - #define __DO_ACTION(R, ACTION, FINAL) \ \ { \ @@ -104,6 +100,9 @@ int vector_irq[NR_VECTORS] __read_mostly reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ + /* Force POST flush by reading: */ \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + \ if (!entry->next) \ break; \ entry = irq_2_pin + entry->next; \ @@ -112,11 +111,35 @@ int vector_irq[NR_VECTORS] __read_mostly } #ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_pin_list *entry = irq_2_pin + irq; + + BUG_ON(irq >= NR_IRQS); + for (;;) { + unsigned int reg; + apic = entry->apic; + pin = entry->pin; + if (pin == -1) + break; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~0x000000ff; + reg |= vector; + io_apic_modify(apic, reg); + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } +} + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { unsigned long flags; unsigned int dest; cpumask_t tmp; + int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -124,7 +147,13 @@ static void set_ioapic_affinity_irq(unsi cpus_and(mask, tmp, CPU_MASK_ALL); - dest = cpu_mask_to_apicid(mask); + vector = assign_irq_vector(irq, mask); + if (vector < 0) + return; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. 
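For orientation: each redirection-table entry occupies two 32-bit I/O APIC registers, which is why __target_IO_APIC_irq() above writes the destination to index 0x11 + pin*2 and splices the vector into bits 7:0 of index 0x10 + pin*2 (bit 16 of the low half is the mask). A small sketch of the same bit surgery, with made-up register contents:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            int pin = 4;
            uint32_t vector = 0x61;

            /* register indices used with io_apic_read()/io_apic_write() */
            unsigned low_idx  = 0x10 + pin * 2;
            unsigned high_idx = 0x11 + pin * 2;

            /* pretend the low dword holds mask=1 (bit 16) and a stale vector */
            uint32_t reg = 0x00010039;

            reg &= ~0x000000ffu;    /* drop the old vector... */
            reg |= vector;          /* ...install the new one */

            printf("reg 0x%02x <- 0x%08x, reg 0x%02x <- destination byte\n",
                   low_idx, reg, high_idx);
            return 0;
    }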
@@ -132,14 +161,12 @@ static void set_ioapic_affinity_irq(unsi dest = SET_APIC_LOGICAL_ID(dest); spin_lock_irqsave(&ioapic_lock, flags); - __DO_ACTION(1, = dest, ) - set_irq_info(irq, mask); + __target_IO_APIC_irq(irq, dest, vector & 0xff); + set_native_irq_info(irq, mask); spin_unlock_irqrestore(&ioapic_lock, flags); } #endif -static u8 gsi_2_irq[NR_IRQ_VECTORS] = { [0 ... NR_IRQ_VECTORS-1] = 0xFF }; - /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -170,10 +197,8 @@ static void add_pin_to_irq(unsigned int static void name##_IO_APIC_irq (unsigned int irq) \ __DO_ACTION(R, ACTION, FINAL) -DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) ) - /* mask = 1 */ -DO_ACTION( __unmask, 0, &= 0xfffeffff, ) - /* mask = 0 */ +DO_ACTION( __mask, 0, |= 0x00010000, ) /* mask = 1 */ +DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */ static void mask_IO_APIC_irq (unsigned int irq) { @@ -695,64 +720,6 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } -static int next_irq = 16; - -/* - * gsi_irq_sharing -- Name overload! "irq" can be either a legacy IRQ - * in the range 0-15, a linux IRQ in the range 0-223, or a GSI number - * from ACPI, which can reach 800 in large boxen. - * - * Compact the sparse GSI space into a sequential IRQ series and reuse - * vectors if possible. - */ -int gsi_irq_sharing(int gsi) -{ - int i, tries, vector; - - BUG_ON(gsi >= NR_IRQ_VECTORS); - - if (platform_legacy_irq(gsi)) - return gsi; - - if (gsi_2_irq[gsi] != 0xFF) - return (int)gsi_2_irq[gsi]; - - tries = NR_IRQS; - try_again: - vector = assign_irq_vector(gsi); - - /* - * Sharing vectors means sharing IRQs, so scan irq_vectors for previous - * use of vector and if found, return that IRQ. However, we never want - * to share legacy IRQs, which usually have a different trigger mode - * than PCI. - */ - for (i = 0; i < NR_IRQS; i++) - if (IO_APIC_VECTOR(i) == vector) - break; - if (platform_legacy_irq(i)) { - if (--tries >= 0) { - IO_APIC_VECTOR(i) = 0; - goto try_again; - } - panic("gsi_irq_sharing: didn't find an IRQ using vector 0x%02X for GSI %d", vector, gsi); - } - if (i < NR_IRQS) { - gsi_2_irq[gsi] = i; - printk(KERN_INFO "GSI %d sharing vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; - } - - i = next_irq++; - BUG_ON(i >= NR_IRQS); - gsi_2_irq[gsi] = i; - IO_APIC_VECTOR(i) = vector; - printk(KERN_INFO "GSI %d assigned vector 0x%02X and IRQ %d\n", - gsi, vector, i); - return i; -} - static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -782,7 +749,6 @@ static int pin_2_irq(int idx, int apic, while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - irq = gsi_irq_sharing(irq); break; } default: @@ -830,46 +796,83 @@ static inline int IO_APIC_irq_trigger(in } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +unsigned int irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_EXTERNAL_VECTOR, 0 }; -int assign_irq_vector(int irq) +static int __assign_irq_vector(int irq, cpumask_t mask) { - static int current_vector = FIRST_DEVICE_VECTOR, offset = 0; - unsigned long flags; - int vector; - - BUG_ON(irq != AUTO_ASSIGN && (unsigned)irq >= NR_IRQ_VECTORS); - - spin_lock_irqsave(&vector_lock, flags); - - if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0) { - spin_unlock_irqrestore(&vector_lock, flags); - return IO_APIC_VECTOR(irq); - } + /* + * NOTE! 
The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static struct { + int vector; + int offset; + } pos[NR_CPUS] = { [ 0 ... NR_CPUS - 1] = {FIRST_DEVICE_VECTOR, 0} }; + int old_vector = -1; + int cpu; + + BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + + if (IO_APIC_VECTOR(irq) > 0) + old_vector = IO_APIC_VECTOR(irq); + if ((old_vector > 0) && cpu_isset(old_vector >> 8, mask)) { + return old_vector; + } + + for_each_cpu_mask(cpu, mask) { + int vector, offset; + vector = pos[cpu].vector; + offset = pos[cpu].offset; next: - current_vector += 8; - if (current_vector == IA32_SYSCALL_VECTOR) - goto next; - - if (current_vector >= FIRST_SYSTEM_VECTOR) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - current_vector = FIRST_DEVICE_VECTOR + offset; + vector += 8; + if (vector >= FIRST_SYSTEM_VECTOR) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(pos[cpu].vector == vector)) + continue; + if (vector == IA32_SYSCALL_VECTOR) + goto next; + if (per_cpu(vector_irq, cpu)[vector] != -1) + goto next; + /* Found one! */ + pos[cpu].vector = vector; + pos[cpu].offset = offset; + if (old_vector >= 0) { + int old_cpu = old_vector >> 8; + old_vector &= 0xff; + per_cpu(vector_irq, old_cpu)[old_vector] = -1; + } + per_cpu(vector_irq, cpu)[vector] = irq; + vector |= cpu << 8; + IO_APIC_VECTOR(irq) = vector; + return vector; } + return -ENOSPC; +} - vector = current_vector; - vector_irq[vector] = irq; - if (irq != AUTO_ASSIGN) - IO_APIC_VECTOR(irq) = vector; +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int vector; + unsigned long flags; + spin_lock_irqsave(&vector_lock, flags); + vector = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return vector; } extern void (*interrupt[NR_IRQS])(void); -static struct hw_interrupt_type ioapic_level_type; -static struct hw_interrupt_type ioapic_edge_type; + +static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 #define IOAPIC_EDGE 0 @@ -877,16 +880,16 @@ static struct hw_interrupt_type ioapic_e static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { - unsigned idx; - - idx = use_pci_vector() && !platform_legacy_irq(irq) ? 
vector : irq; - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - irq_desc[idx].chip = &ioapic_level_type; - else - irq_desc[idx].chip = &ioapic_edge_type; - set_intr_gate(vector, interrupt[idx]); + trigger == IOAPIC_LEVEL) { +#ifdef CONFIG_PREEMPT_HARDIRQS + set_irq_chip_and_handler(irq, &ioapic_chip, handle_level_irq); +#else + set_irq_chip_and_handler(irq, &ioapic_chip, handle_fasteoi_irq); +#endif + } else { + set_irq_chip_and_handler(irq, &ioapic_chip, handle_edge_irq); + } } static void __init setup_IO_APIC_irqs(void) @@ -936,8 +939,15 @@ static void __init setup_IO_APIC_irqs(vo continue; if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + continue; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.vector = vector & 0xff; ioapic_register_intr(irq, vector, IOAPIC_AUTO); if (!apic && (irq < 16)) @@ -987,7 +997,7 @@ static void __init setup_ExtINT_IRQ0_pin * The timer IRQ doesn't have to know that behind the * scene we have a 8259A-master in AEOI mode ... */ - irq_desc[0].chip = &ioapic_edge_type; + set_irq_chip_and_handler(0, &ioapic_chip, handle_edge_irq); /* * Add it to the IO-APIC irq-routing table: @@ -1106,17 +1116,12 @@ void __apicdebuginit print_IO_APIC(void) ); } } - if (use_pci_vector()) - printk(KERN_INFO "Using vector-based indexing\n"); printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < NR_IRQS; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; - if (use_pci_vector() && !platform_legacy_irq(i)) - printk(KERN_DEBUG "IRQ%d ", IO_APIC_VECTOR(i)); - else - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1502,7 +1507,7 @@ static int __init timer_irq_works(void) * an edge even if it isn't on the 8259A... */ -static unsigned int startup_edge_ioapic_irq(unsigned int irq) +static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; unsigned long flags; @@ -1519,107 +1524,16 @@ static unsigned int startup_edge_ioapic_ return was_pending; } -/* - * Once we have recorded IRQ_PENDING already, we can mask the - * interrupt for real. This prevents IRQ storms from unhandled - * devices. - */ -static void ack_edge_ioapic_irq(unsigned int irq) -{ - move_irq(irq); - if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED)) - == (IRQ_PENDING | IRQ_DISABLED)) - mask_IO_APIC_irq(irq); - ack_APIC_irq(); -} - -/* - * Level triggered interrupts can just be masked, - * and shutting down and starting up the interrupt - * is the same as enabling and disabling them -- except - * with a startup need to return a "was pending" value. - * - * Level triggered interrupts are special because we - * do not touch any IO-APIC register while handling - * them. We ack the APIC in the end-IRQ handler, not - * in the start-IRQ-handler. Protection against reentrance - * from the same interrupt is still provided, both by the - * generic IRQ layer and by the fact that an unacked local - * APIC does not accept IRQs. 
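Note how the return value of assign_irq_vector() is consumed in setup_IO_APIC_irqs() above: the CPU number travels in the high bits and the hardware vector in the low byte, hence cpu_set(vector >> 8, mask) and entry.vector = vector & 0xff. The encoding, as a trivial standalone check:

    #include <stdio.h>

    int main(void)
    {
            int cpu = 3, vec = 0x51;

            int packed = vec | (cpu << 8);  /* what __assign_irq_vector() returns */

            int dest_cpu  = packed >> 8;    /* -> cpu_set(vector >> 8, mask)   */
            int hw_vector = packed & 0xff;  /* -> entry.vector = vector & 0xff */

            printf("packed=0x%03x -> cpu=%d, vector=0x%02x\n",
                   packed, dest_cpu, hw_vector);
            return 0;
    }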
- */ -static unsigned int startup_level_ioapic_irq (unsigned int irq) -{ - unmask_IO_APIC_irq(irq); - - return 0; /* don't check for pending */ -} - -static void end_level_ioapic_irq (unsigned int irq) -{ - move_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_PCI_MSI -static unsigned int startup_edge_ioapic_vector(unsigned int vector) +static int ioapic_retrigger_irq(unsigned int irq) { - int irq = vector_to_irq(vector); + cpumask_t mask; + unsigned vector; - return startup_edge_ioapic_irq(irq); -} + vector = irq_vector[irq]; + cpus_clear(mask); + cpu_set(vector >> 8, mask); -static void ack_edge_ioapic_vector(unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - ack_edge_ioapic_irq(irq); -} - -static unsigned int startup_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - return startup_level_ioapic_irq (irq); -} - -static void end_level_ioapic_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - move_native_irq(vector); - end_level_ioapic_irq(irq); -} - -static void mask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - mask_IO_APIC_irq(irq); -} - -static void unmask_IO_APIC_vector (unsigned int vector) -{ - int irq = vector_to_irq(vector); - - unmask_IO_APIC_irq(irq); -} - -#ifdef CONFIG_SMP -static void set_ioapic_affinity_vector (unsigned int vector, - cpumask_t cpu_mask) -{ - int irq = vector_to_irq(vector); - - set_native_irq_info(vector, cpu_mask); - set_ioapic_affinity_irq(irq, cpu_mask); -} -#endif // CONFIG_SMP -#endif // CONFIG_PCI_MSI - -static int ioapic_retrigger(unsigned int irq) -{ - send_IPI_self(IO_APIC_VECTOR(irq)); + send_IPI_mask(mask, vector & 0xff); return 1; } @@ -1633,32 +1547,47 @@ static int ioapic_retrigger(unsigned int * races. */ -static struct hw_interrupt_type ioapic_edge_type __read_mostly = { - .typename = "IO-APIC-edge", - .startup = startup_edge_ioapic, - .shutdown = shutdown_edge_ioapic, - .enable = enable_edge_ioapic, - .disable = disable_edge_ioapic, - .ack = ack_edge_ioapic, - .end = end_edge_ioapic, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, +static void ack_apic_edge(unsigned int irq) +{ + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + +#if defined(CONFIG_GENERIC_PENDING_IRQ) || defined(CONFIG_IRQBALANCE) + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - .retrigger = ioapic_retrigger, -}; -static struct hw_interrupt_type ioapic_level_type __read_mostly = { - .typename = "IO-APIC-level", - .startup = startup_level_ioapic, - .shutdown = shutdown_level_ioapic, - .enable = enable_level_ioapic, - .disable = disable_level_ioapic, - .ack = mask_and_ack_level_ioapic, - .end = end_level_ioapic, + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly.
+ */ + ack_APIC_irq(); + + /* Now we can move and re-enable the irq */ + move_masked_irq(irq); + if (unlikely(do_unmask_irq)) + unmask_IO_APIC_irq(irq); +} + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity, + .set_affinity = set_ioapic_affinity_irq, #endif - .retrigger = ioapic_retrigger, + .retrigger = ioapic_retrigger_irq, }; static inline void init_IO_APIC_traps(void) @@ -1678,11 +1607,6 @@ static inline void init_IO_APIC_traps(vo */ for (irq = 0; irq < NR_IRQS ; irq++) { int tmp = irq; - if (use_pci_vector()) { - if (!platform_legacy_irq(tmp)) - if ((tmp = vector_to_irq(tmp)) == -1) - continue; - } if (IO_APIC_IRQ(tmp) && !IO_APIC_VECTOR(tmp)) { /* * Hmm.. We don't have an entry for this, @@ -1693,7 +1617,7 @@ static inline void init_IO_APIC_traps(vo make_8259A_irq(irq); else /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_type; + irq_desc[irq].chip = &no_irq_chip; } } } @@ -1812,8 +1736,6 @@ static inline void unlock_ExtINT_logic(v spin_unlock_irqrestore(&ioapic_lock, flags); } -int timer_uses_ioapic_pin_0; - /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -1831,8 +1753,7 @@ static inline void check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + vector = assign_irq_vector(0, TARGET_CPUS); /* * Subtle, code in do_timer_interrupt() expects an AEOI @@ -1851,9 +1772,6 @@ static inline void check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - if (pin1 == 0) - timer_uses_ioapic_pin_0 = 1; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); @@ -2069,6 +1987,124 @@ static int __init ioapic_init_sysfs(void device_initcall(ioapic_init_sysfs); +/* + * Dynamic irq allocation and deallocation + */ +int create_irq(void) +{ + /* Allocate an unused irq */ + int irq; + int new; + int vector = 0; + unsigned long flags; + + irq = -ENOSPC; + spin_lock_irqsave(&vector_lock, flags); + for (new = (NR_IRQS - 1); new >= 0; new--) { + if (platform_legacy_irq(new)) + continue; + if (irq_vector[new] != 0) + continue; + vector = __assign_irq_vector(new, TARGET_CPUS); + if (likely(vector > 0)) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq >= 0) { + dynamic_irq_init(irq); + } + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + + spin_lock_irqsave(&vector_lock, flags); + irq_vector[irq] = 0; + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_msg_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + /* For now this code always uses physical delivery + * mode. + */ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector >= 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(vector); + } + return vector; +} + +static void msi_msg_teardown(unsigned int irq) +{ + return; +} + +static void msi_msg_set_affinity(unsigned int irq, cpumask_t mask, struct msi_msg *msg) +{ + int vector; + unsigned dest; + + vector = assign_irq_vector(irq, mask); + if (vector > 0) { + cpumask_t tmp; + + cpus_clear(tmp); + cpu_set(vector >> 8, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->data &= ~MSI_DATA_VECTOR_MASK; + msg->data |= MSI_DATA_VECTOR(vector); + msg->address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg->address_lo |= MSI_ADDR_DEST_ID(dest); + } +} + +struct msi_ops arch_msi_ops = { + .needs_64bit_address = 0, + .setup = msi_msg_setup, + .teardown = msi_msg_teardown, + .target = msi_msg_set_affinity, +}; + +#endif + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -2107,6 +2143,8 @@ int io_apic_set_pci_routing (int ioapic, { struct IO_APIC_route_entry entry; unsigned long flags; + int vector; + cpumask_t mask; if (!IO_APIC_IRQ(irq)) { apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", @@ -2115,6 +2153,20 @@ int io_apic_set_pci_routing (int ioapic, } /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + + vector = assign_irq_vector(irq, TARGET_CPUS); + if (vector < 0) + return vector; + + cpus_clear(mask); + cpu_set(vector >> 8, mask); + + /* * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. * Note that we mask (disable) IRQs now -- these get enabled when the * corresponding device driver registers for this IRQ. @@ -2124,19 +2176,11 @@ int io_apic_set_pci_routing (int ioapic, entry.delivery_mode = INT_DELIVERY_MODE; entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); entry.trigger = triggering; entry.polarity = polarity; entry.mask = 1; /* Disabled (masked) */ - - irq = gsi_irq_sharing(irq); - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - entry.vector = assign_irq_vector(irq); + entry.vector = vector & 0xff; apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", ioapic, @@ -2151,7 +2195,7 @@ int io_apic_set_pci_routing (int ioapic, spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1)); io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0)); - set_native_irq_info(use_pci_vector() ? 
entry.vector : irq, TARGET_CPUS); + set_native_irq_info(irq, TARGET_CPUS); spin_unlock_irqrestore(&ioapic_lock, flags); return 0; Index: linux/arch/x86_64/kernel/irq.c =================================================================== --- linux.orig/arch/x86_64/kernel/irq.c +++ linux/arch/x86_64/kernel/irq.c @@ -79,7 +79,8 @@ int show_interrupts(struct seq_file *p, for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %14s", irq_desc[i].chip->typename); + seq_printf(p, " %8s", irq_desc[i].chip->name); + seq_printf(p, "-%s", handle_irq_name(irq_desc[i].handle_irq)); seq_printf(p, " %s", action->name); for (action=action->next; action; action = action->next) @@ -116,7 +117,18 @@ skip: asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { /* high bit used in ret_from_ code */ - unsigned irq = ~regs->orig_rax; + unsigned vector = ~regs->orig_rax; + unsigned irq; + + exit_idle(); + irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; + +#ifdef CONFIG_LATENCY_TRACE + if (irq == trace_user_trigger_irq) + user_trace_start(); +#endif + trace_special(regs->rip, irq, 0); if (unlikely(irq >= NR_IRQS)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -124,12 +136,24 @@ asmlinkage unsigned int do_IRQ(struct pt BUG(); } - exit_idle(); - irq_enter(); #ifdef CONFIG_DEBUG_STACKOVERFLOW stack_overflow_check(regs); #endif - __do_IRQ(irq, regs); +#ifdef CONFIG_NO_HZ + if (idle_cpu(smp_processor_id())) { + update_jiffies(); + /* + * Force polling-idle loops to break out into + * the sched-timer setting code, to make sure + * that timer interval changes due to __mod_timer() + * in IRQ context get properly propagated: + */ + if (tsk_is_polling(current)) + set_need_resched(); + } +#endif + + generic_handle_irq(irq, regs); irq_exit(); return 1; Index: linux/arch/x86_64/kernel/mpparse.c =================================================================== --- linux.orig/arch/x86_64/kernel/mpparse.c +++ linux/arch/x86_64/kernel/mpparse.c @@ -909,20 +909,11 @@ void __init mp_config_acpi_legacy_irqs ( return; } -#define MAX_GSI_NUM 4096 - int mp_register_gsi(u32 gsi, int triggering, int polarity) { int ioapic = -1; int ioapic_pin = 0; int idx, bit = 0; - static int pci_irq = 16; - /* - * Mapping between Global System Interrupts, which - * represent all possible interrupts, to the IRQs - * assigned to actual devices. - */ - static int gsi_to_irq[MAX_GSI_NUM]; if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC) return gsi; @@ -955,42 +946,11 @@ int mp_register_gsi(u32 gsi, int trigger if ((1< 15), but - * avoid a problem where the 8254 timer (IRQ0) is setup - * via an override (so it's not on pin 0 of the ioapic), - * and at the same time, the pin 0 interrupt is a PCI - * type. The gsi > 15 test could cause these two pins - * to be shared as IRQ0, and they are not shareable. - * So test for this condition, and if necessary, avoid - * the pin collision. - */ - if (gsi > 15 || (gsi == 0 && !timer_uses_ioapic_pin_0)) - gsi = pci_irq++; - /* - * Don't assign IRQ used by ACPI SCI - */ - if (gsi == acpi_fadt.sci_int) - gsi = pci_irq++; - gsi_to_irq[irq] = gsi; - } else { - printk(KERN_ERR "GSI %u is too high\n", gsi); - return gsi; - } - } - io_apic_set_pci_routing(ioapic, ioapic_pin, gsi, triggering == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 
0 : 1); Index: linux/arch/x86_64/kernel/nmi.c =================================================================== --- linux.orig/arch/x86_64/kernel/nmi.c +++ linux/arch/x86_64/kernel/nmi.c @@ -37,7 +37,7 @@ * This is maintained separately from nmi_active because the NMI * watchdog may also be driven from the I/O APIC timer. */ -static DEFINE_SPINLOCK(lapic_nmi_owner_lock); +static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock); static unsigned int lapic_nmi_owner; #define LAPIC_NMI_WATCHDOG (1<<0) #define LAPIC_NMI_RESERVED (1<<1) @@ -127,7 +127,9 @@ void __cpuinit nmi_watchdog_default(void static __init void nmi_cpu_busy(void *data) { volatile int *endflag = data; +#ifndef CONFIG_PREEMPT_RT local_irq_enable_in_hardirq(); +#endif /* Intentionally don't use cpu_relax here. This is to make sure that the performance counter really ticks, even if there is a simulator or similar that catches the @@ -526,12 +528,42 @@ void touch_nmi_watchdog (void) touch_softlockup_watchdog(); } +int nmi_show_regs[NR_CPUS]; + +void nmi_show_all_regs(void) +{ + int i; + + if (nmi_watchdog == NMI_NONE) + return; + if (system_state != SYSTEM_RUNNING) { + printk("nmi_show_all_regs(): system state %d, not doing.\n", + system_state); + return; + } + + for_each_online_cpu(i) + nmi_show_regs[i] = 1; + for_each_online_cpu(i) + while (nmi_show_regs[i] == 1) + barrier(); +} + +static DEFINE_RAW_SPINLOCK(nmi_print_lock); + void __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = safe_smp_processor_id(); sum = read_pda(apic_timer_irqs); + if (nmi_show_regs[cpu]) { + nmi_show_regs[cpu] = 0; + spin_lock(&nmi_print_lock); + show_regs(regs); + spin_unlock(&nmi_print_lock); + } if (__get_cpu_var(nmi_touch)) { __get_cpu_var(nmi_touch) = 0; touched = 1; @@ -549,6 +581,11 @@ void __kprobes nmi_watchdog_tick(struct */ local_inc(&__get_cpu_var(alert_counter)); if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) { + int i; + + for (i = 0; i < NR_CPUS; i++) + nmi_show_regs[i] = 1; + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) { local_set(&__get_cpu_var(alert_counter), 0); Index: linux/arch/x86_64/kernel/pmtimer.c =================================================================== --- linux.orig/arch/x86_64/kernel/pmtimer.c +++ linux/arch/x86_64/kernel/pmtimer.c @@ -24,15 +24,6 @@ #include #include -/* The I/O port the PMTMR resides at. 
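The nmi_show_regs[] handshake in the nmi.c hunk above is a plain flag-and-spin protocol: the requester raises one flag per online CPU, each CPU's NMI tick prints its registers under nmi_print_lock and clears its own flag, and the requester spins until all flags drop. A rough user-space analogue with threads (illustrative only; barrier() and the raw spinlock have no exact equivalent here):

    #include <pthread.h>
    #include <stdio.h>

    #define NCPUS 4

    static volatile int show_regs_flag[NCPUS];

    static void *cpu_loop(void *arg)
    {
            int cpu = *(int *)arg;

            while (!show_regs_flag[cpu])
                    ;                               /* wait for the request */
            printf("cpu%d: dumping regs\n", cpu);   /* show_regs(regs)      */
            show_regs_flag[cpu] = 0;                /* ack                  */
            return NULL;
    }

    int main(void)
    {
            pthread_t tid[NCPUS];
            int ids[NCPUS];
            int i;

            for (i = 0; i < NCPUS; i++) {
                    ids[i] = i;
                    pthread_create(&tid[i], NULL, cpu_loop, &ids[i]);
            }
            for (i = 0; i < NCPUS; i++)     /* nmi_show_all_regs(): raise... */
                    show_regs_flag[i] = 1;
            for (i = 0; i < NCPUS; i++)     /* ...then spin until acked */
                    while (show_regs_flag[i] == 1)
                            ;               /* barrier() in the kernel version */
            for (i = 0; i < NCPUS; i++)
                    pthread_join(tid[i], NULL);
            return 0;
    }

In the kernel version the "threads" are the per-CPU NMI handlers, so the requester cannot block; busy-waiting with a compiler barrier is the only option.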
- * The location is detected during setup_arch(), - * in arch/i386/kernel/acpi/boot.c */ -u32 pmtmr_ioport __read_mostly; - -/* value of the Power timer at last timer interrupt */ -static u32 offset_delay; -static u32 last_pmtmr_tick; - #define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */ static inline u32 cyc2us(u32 cycles) @@ -48,38 +39,6 @@ static inline u32 cyc2us(u32 cycles) return (cycles >> 10); } -int pmtimer_mark_offset(void) -{ - static int first_run = 1; - unsigned long tsc; - u32 lost; - - u32 tick = inl(pmtmr_ioport); - u32 delta; - - delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK); - - last_pmtmr_tick = tick; - monotonic_base += delta * NSEC_PER_USEC; - - delta += offset_delay; - - lost = delta / (USEC_PER_SEC / HZ); - offset_delay = delta % (USEC_PER_SEC / HZ); - - rdtscll(tsc); - vxtime.last_tsc = tsc - offset_delay * (u64)cpu_khz / 1000; - - /* don't calculate delay for first run, - or if we've got less then a tick */ - if (first_run || (lost < 1)) { - first_run = 0; - offset_delay = 0; - } - - return lost - 1; -} - static unsigned pmtimer_wait_tick(void) { u32 a, b; @@ -101,23 +60,6 @@ void pmtimer_wait(unsigned us) } while (cyc2us(b - a) < us); } -void pmtimer_resume(void) -{ - last_pmtmr_tick = inl(pmtmr_ioport); -} - -unsigned int do_gettimeoffset_pm(void) -{ - u32 now, offset, delta = 0; - - offset = last_pmtmr_tick; - now = inl(pmtmr_ioport); - delta = (now - offset) & ACPI_PM_MASK; - - return offset_delay + cyc2us(delta); -} - - static int __init nopmtimer_setup(char *s) { pmtmr_ioport = 0; Index: linux/arch/x86_64/kernel/process.c =================================================================== --- linux.orig/arch/x86_64/kernel/process.c +++ linux/arch/x86_64/kernel/process.c @@ -113,11 +113,15 @@ static void default_idle(void) current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { local_irq_disable(); - if (!need_resched()) - safe_halt(); - else + if (!need_resched() && !need_resched_delayed()) { + if (!hrtimer_stop_sched_tick()) + safe_halt(); + else + local_irq_enable(); + hrtimer_restart_sched_tick(); + } else local_irq_enable(); } current_thread_info()->status |= TS_POLLING; @@ -131,6 +135,14 @@ static void default_idle(void) static void poll_idle (void) { local_irq_enable(); + while (!need_resched() && !need_resched_delayed()) { + hrtimer_stop_sched_tick(); + local_irq_enable(); + while (!need_resched() && !need_resched_delayed() && !rcu_pending(smp_processor_id()) && !local_softirq_pending()) + rep_nop(); + hrtimer_restart_sched_tick(); + local_irq_enable(); + } asm volatile( "2:" @@ -206,7 +218,9 @@ void cpu_idle (void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - while (!need_resched()) { + BUG_ON(irqs_disabled()); + + while (!need_resched() && !need_resched_delayed()) { void (*idle)(void); if (__get_cpu_var(cpu_idle_state)) @@ -218,14 +232,16 @@ void cpu_idle (void) idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + stop_critical_timing(); enter_idle(); idle(); __exit_idle(); } - - preempt_enable_no_resched(); - schedule(); + local_irq_disable(); + __preempt_enable_no_resched(); + __schedule(); preempt_disable(); + local_irq_enable(); } } @@ -240,13 +256,16 @@ static void mwait_idle(void) { local_irq_enable(); - while (!need_resched()) { + while (!need_resched() && !need_resched_delayed()) { + if (hrtimer_stop_sched_tick()) + break; __monitor((void 
*)&current_thread_info()->flags, 0, 0); + smp_mb(); - if (need_resched()) + if (need_resched() && !need_resched_delayed()) break; __mwait(0, 0); } + hrtimer_restart_sched_tick(); } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) @@ -346,13 +365,14 @@ void exit_thread(void) struct thread_struct *t = &me->thread; if (me->thread.io_bitmap_ptr) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss; kfree(t->io_bitmap_ptr); t->io_bitmap_ptr = NULL; /* * Careful, clear this in the TSS too: */ + tss = &per_cpu(init_tss, get_cpu()); memset(tss->io_bitmap, 0xff, t->io_bitmap_max); t->io_bitmap_max = 0; put_cpu(); } Index: linux/arch/x86_64/kernel/setup64.c =================================================================== --- linux.orig/arch/x86_64/kernel/setup64.c +++ linux/arch/x86_64/kernel/setup64.c @@ -116,7 +116,7 @@ void __init setup_per_cpu_areas(void) } } -void pda_init(int cpu) +void notrace pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -185,7 +185,7 @@ void __cpuinit check_efer(void) * 'CPU state barrier', nothing should get across. * A lot of state is already set up in PDA init. */ -void __cpuinit cpu_init (void) +void __cpuinit notrace cpu_init (void) { int cpu = stack_smp_processor_id(); struct tss_struct *t = &per_cpu(init_tss, cpu); Index: linux/arch/x86_64/kernel/signal.c =================================================================== --- linux.orig/arch/x86_64/kernel/signal.c +++ linux/arch/x86_64/kernel/signal.c @@ -431,6 +431,13 @@ int do_signal(struct pt_regs *regs, sigs siginfo_t info; int signr; +#ifdef CONFIG_PREEMPT_RT + /* + * Fully-preemptible kernel does not need interrupts disabled: + */ + local_irq_enable(); + preempt_check_resched(); +#endif /* * We want the common case to go fast, which * is why we may in certain cases get here from Index: linux/arch/x86_64/kernel/smp.c =================================================================== --- linux.orig/arch/x86_64/kernel/smp.c +++ linux/arch/x86_64/kernel/smp.c @@ -57,7 +57,7 @@ union smp_flush_state { struct mm_struct *flush_mm; unsigned long flush_va; #define FLUSH_ALL -1ULL - spinlock_t tlbstate_lock; + raw_spinlock_t tlbstate_lock; }; char pad[SMP_CACHE_BYTES]; } ____cacheline_aligned; @@ -296,10 +296,20 @@ void smp_send_reschedule(int cpu) } /* + * this function sends a 'reschedule' IPI to all other CPUs. + * This is used when RT tasks are starving and other CPUs + * might be able to run them: + */ +void smp_send_reschedule_allbutself(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. */ -static DEFINE_SPINLOCK(call_lock); +static DEFINE_RAW_SPINLOCK(call_lock); struct call_data_struct { void (*func) (void *info); Index: linux/arch/x86_64/kernel/smpboot.c =================================================================== --- linux.orig/arch/x86_64/kernel/smpboot.c +++ linux/arch/x86_64/kernel/smpboot.c @@ -204,7 +204,7 @@ static void __cpuinit smp_store_cpu_info latency and low latency is the primary objective here.
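The idle-loop rework in the process.c hunks above follows one pattern in all three variants: with interrupts off, try to stop the periodic tick, sleep (hlt, mwait or poll) until something happens, then re-arm the tick. Condensed into a compilable sketch with stubbed primitives (the helper names are the patch's own; the bodies are stand-ins):

    #include <stdio.h>

    /* stand-ins for the kernel primitives used by default_idle() */
    static int need_resched(void)
    {
            static int calls;
            return ++calls > 2;     /* pretend work shows up after one idle pass */
    }
    static int  need_resched_delayed(void)       { return 0; }
    static int  hrtimer_stop_sched_tick(void)    { puts("tick stopped"); return 0; }
    static void hrtimer_restart_sched_tick(void) { puts("tick restarted"); }
    static void local_irq_disable(void)          { }
    static void local_irq_enable(void)           { }
    static void safe_halt(void)                  { puts("halt until next interrupt"); }

    static void default_idle_sketch(void)
    {
            while (!need_resched() && !need_resched_delayed()) {
                    local_irq_disable();
                    if (!need_resched() && !need_resched_delayed()) {
                            if (!hrtimer_stop_sched_tick())
                                    safe_halt();    /* re-enables irqs, halts */
                            else
                                    local_irq_enable();
                            hrtimer_restart_sched_tick();
                    } else {
                            local_irq_enable();
                    }
            }
    }

    int main(void) { default_idle_sketch(); return 0; }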
-AK */ #define no_cpu_relax() barrier() -static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock); +static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock); static volatile __cpuinitdata unsigned long go[SLAVE + 1]; static int notscsync __cpuinitdata; @@ -530,7 +530,7 @@ static inline void set_cpu_sibling_map(i /* * Setup code on secondary processor (after comming out of the trampoline) */ -void __cpuinit start_secondary(void) +void __cpuinit notrace start_secondary(void) { /* * Dont put anything before smp_callin(), SMP Index: linux/arch/x86_64/kernel/time.c =================================================================== --- linux.orig/arch/x86_64/kernel/time.c +++ linux/arch/x86_64/kernel/time.c @@ -39,149 +39,29 @@ #include #include #include +#include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #endif +#include -#ifdef CONFIG_CPU_FREQ -static void cpufreq_delayed_get(void); -#endif extern void i8254_timer_resume(void); extern int using_apic_timer; +extern struct clock_event pit_clockevent; -static char *time_init_gtod(void); DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); -DEFINE_SPINLOCK(i8253_lock); - -int nohpet __initdata = 0; -static int notsc __initdata = 0; +DEFINE_RAW_SPINLOCK(i8253_lock); #define USEC_PER_TICK (USEC_PER_SEC / HZ) #define NSEC_PER_TICK (NSEC_PER_SEC / HZ) -#define FSEC_PER_TICK (FSEC_PER_SEC / HZ) -#define NS_SCALE 10 /* 2^10, carefully chosen */ -#define US_SCALE 32 /* 2^32, arbitralrily chosen */ -unsigned int cpu_khz; /* TSC clocks / usec, not used here */ -EXPORT_SYMBOL(cpu_khz); -static unsigned long hpet_period; /* fsecs / HPET clock */ -unsigned long hpet_tick; /* HPET clocks / interrupt */ -int hpet_use_timer; /* Use counter of hpet for time keeping, otherwise PIT */ -unsigned long vxtime_hz = PIT_TICK_RATE; int report_lost_ticks; /* command line option */ -unsigned long long monotonic_base; - -struct vxtime_data __vxtime __section_vxtime; /* for vsyscalls */ - -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; -unsigned long __wall_jiffies __section_wall_jiffies = INITIAL_JIFFIES; -struct timespec __xtime __section_xtime; -struct timezone __sys_tz __section_sys_tz; - -/* - * do_gettimeoffset() returns microseconds since last timer interrupt was - * triggered by hardware. A memory read of HPET is slower than a register read - * of TSC, but much more reliable. It's also synchronized to the timer - * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a - * timer interrupt has happened already, but vxtime.trigger wasn't updated yet. - * This is not a problem, because jiffies hasn't updated either. They are bound - * together by xtime_lock. - */ - -static inline unsigned int do_gettimeoffset_tsc(void) -{ - unsigned long t; - unsigned long x; - t = get_cycles_sync(); - if (t < vxtime.last_tsc) - t = vxtime.last_tsc; /* hack */ - x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> US_SCALE; - return x; -} - -static inline unsigned int do_gettimeoffset_hpet(void) -{ - /* cap counter read to one tick to avoid inconsistencies */ - unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last; - return (min(counter,hpet_tick) * vxtime.quot) >> US_SCALE; -} - -unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc; - -/* - * This version of gettimeofday() has microsecond resolution and better than - * microsecond precision, as we're using at least a 10 MHz (usually 14.31818 - * MHz) HPET timer. 
- */ - -void do_gettimeofday(struct timeval *tv) -{ - unsigned long seq, t; - unsigned int sec, usec; - - do { - seq = read_seqbegin(&xtime_lock); - - sec = xtime.tv_sec; - usec = xtime.tv_nsec / NSEC_PER_USEC; - - /* i386 does some correction here to keep the clock - monotonous even when ntpd is fixing drift. - But they didn't work for me, there is a non monotonic - clock anyways with ntp. - I dropped all corrections now until a real solution can - be found. Note when you fix it here you need to do the same - in arch/x86_64/kernel/vsyscall.c and export all needed - variables in vmlinux.lds. -AK */ - - t = (jiffies - wall_jiffies) * USEC_PER_TICK + - do_gettimeoffset(); - usec += t; - - } while (read_seqretry(&xtime_lock, seq)); - - tv->tv_sec = sec + usec / USEC_PER_SEC; - tv->tv_usec = usec % USEC_PER_SEC; -} - -EXPORT_SYMBOL(do_gettimeofday); - -/* - * settimeofday() first undoes the correction that gettimeofday would do - * on the time, and then saves it. This is ugly, but has been like this for - * ages already. - */ - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - - nsec -= do_gettimeoffset() * NSEC_PER_USEC + - (jiffies - wall_jiffies) * NSEC_PER_TICK; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - return 0; -} -EXPORT_SYMBOL(do_settimeofday); +volatile unsigned long jiffies = INITIAL_JIFFIES; unsigned long profile_pc(struct pt_regs *regs) { @@ -277,84 +157,9 @@ static void set_rtc_mmss(unsigned long n } -/* monotonic_clock(): returns # of nanoseconds passed since time_init() - * Note: This function is required to return accurate - * time even in the absence of multiple timer ticks. - */ -unsigned long long monotonic_clock(void) -{ - unsigned long seq; - u32 last_offset, this_offset, offset; - unsigned long long base; - - if (vxtime.mode == VXTIME_HPET) { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last; - base = monotonic_base; - this_offset = hpet_readl(HPET_COUNTER); - } while (read_seqretry(&xtime_lock, seq)); - offset = (this_offset - last_offset); - offset *= NSEC_PER_TICK / hpet_tick; - } else { - do { - seq = read_seqbegin(&xtime_lock); - - last_offset = vxtime.last_tsc; - base = monotonic_base; - } while (read_seqretry(&xtime_lock, seq)); - this_offset = get_cycles_sync(); - /* FIXME: 1000 or 1000000? */ - offset = (this_offset - last_offset)*1000 / cpu_khz; - } - return base + offset; -} -EXPORT_SYMBOL(monotonic_clock); - -static noinline void handle_lost_ticks(int lost, struct pt_regs *regs) -{ - static long lost_count; - static int warned; - if (report_lost_ticks) { - printk(KERN_WARNING "time.c: Lost %d timer tick(s)! 
", lost); - print_symbol("rip %s)\n", regs->rip); - } - - if (lost_count == 1000 && !warned) { - printk(KERN_WARNING "warning: many lost ticks.\n" - KERN_WARNING "Your time source seems to be instable or " - "some driver is hogging interupts\n"); - print_symbol("rip %s\n", regs->rip); - if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) { - printk(KERN_WARNING "Falling back to HPET\n"); - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; - } - /* else should fall back to PIT, but code missing. */ - warned = 1; - } else - lost_count++; - -#ifdef CONFIG_CPU_FREQ - /* In some cases the CPU can change frequency without us noticing - Give cpufreq a change to catch up. */ - if ((lost_count+1) % 25 == 0) - cpufreq_delayed_get(); -#endif -} - void main_timer_handler(struct pt_regs *regs) { static unsigned long rtc_update = 0; - unsigned long tsc; - int delay = 0, offset = 0, lost = 0; - /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -362,92 +167,11 @@ void main_timer_handler(struct pt_regs * * variables, because both do_timer() and us change them -arca+vojtech */ - write_seqlock(&xtime_lock); - - if (vxtime.hpet_address) - offset = hpet_readl(HPET_COUNTER); - - if (hpet_use_timer) { - /* if we're using the hpet timer functionality, - * we can more accurately know the counter value - * when the timer interrupt occured. - */ - offset = hpet_readl(HPET_T0_CMP) - hpet_tick; - delay = hpet_readl(HPET_COUNTER) - offset; - } else if (!pmtmr_ioport) { - spin_lock(&i8253_lock); - outb_p(0x00, 0x43); - delay = inb_p(0x40); - delay |= inb(0x40) << 8; - spin_unlock(&i8253_lock); - delay = LATCH - 1 - delay; - } - - tsc = get_cycles_sync(); - - if (vxtime.mode == VXTIME_HPET) { - if (offset - vxtime.last > hpet_tick) { - lost = (offset - vxtime.last) / hpet_tick - 1; - } - - monotonic_base += - (offset - vxtime.last) * NSEC_PER_TICK / hpet_tick; - - vxtime.last = offset; -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - lost = pmtimer_mark_offset(); -#endif - } else { - offset = (((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) - USEC_PER_TICK; - - if (offset < 0) - offset = 0; - - if (offset > USEC_PER_TICK) { - lost = offset / USEC_PER_TICK; - offset %= USEC_PER_TICK; - } - - /* FIXME: 1000 or 1000000? */ - monotonic_base += (tsc - vxtime.last_tsc) * 1000000 / cpu_khz; - - vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot; - - if ((((tsc - vxtime.last_tsc) * - vxtime.tsc_quot) >> US_SCALE) < offset) - vxtime.last_tsc = tsc - - (((long) offset << US_SCALE) / vxtime.tsc_quot) - 1; - } - - if (lost > 0) { - handle_lost_ticks(lost, regs); - jiffies += lost; - } - /* * Do the timer stuff. */ - do_timer(regs); -#ifndef CONFIG_SMP - update_process_times(user_mode(regs)); -#endif - -/* - * In the SMP case we use the local APIC timer interrupt to do the profiling, - * except when we simulate SMP mode on a uniprocessor system, in that case we - * have to call the local interrupt handler. - */ - -#ifndef CONFIG_X86_LOCAL_APIC - profile_tick(CPU_PROFILING, regs); -#else - if (!using_apic_timer) - smp_local_timer_interrupt(regs); -#endif - + pit_clockevent.event_handler(regs); /* * If we have an externally synchronized Linux clock, then update CMOS clock * accordingly every ~11 minutes. 
set_rtc_mmss() will be called in the jiffy @@ -462,13 +186,10 @@ void main_timer_handler(struct pt_regs * rtc_update = xtime.tv_sec + 660; } - write_sequnlock(&xtime_lock); } static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) { - if (apic_runs_main_timer > 1) - return IRQ_HANDLED; main_timer_handler(regs); #ifdef CONFIG_X86_LOCAL_APIC if (using_apic_timer) @@ -477,39 +198,6 @@ static irqreturn_t timer_interrupt(int i return IRQ_HANDLED; } -static unsigned int cyc2ns_scale __read_mostly; - -static inline void set_cyc2ns_scale(unsigned long cpu_khz) -{ - cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / cpu_khz; -} - -static inline unsigned long long cycles_2_ns(unsigned long long cyc) -{ - return (cyc * cyc2ns_scale) >> NS_SCALE; -} - -unsigned long long sched_clock(void) -{ - unsigned long a = 0; - -#if 0 - /* Don't do a HPET read here. Using TSC always is much faster - and HPET may not be mapped yet when the scheduler first runs. - Disadvantage is a small drift between CPUs in some configurations, - but that should be tolerable. */ - if (__vxtime.mode == VXTIME_HPET) - return (hpet_readl(HPET_COUNTER) * vxtime.quot) >> US_SCALE; -#endif - - /* Could do CPU core sync here. Opteron can execute rdtsc speculatively, - which means it is not completely exact and may not be monotonous between - CPUs. But the errors should be too small to matter for scheduling - purposes. */ - - rdtscll(a); - return cycles_2_ns(a); -} static unsigned long get_cmos_time(void) { @@ -562,142 +250,6 @@ static unsigned long get_cmos_time(void) return mktime(year, mon, day, hour, min, sec); } -#ifdef CONFIG_CPU_FREQ - -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency - changes. - - RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - not that important because current Opteron setups do not support - scaling on SMP anyroads. - - Should fix up last_tsc too. Currently gettimeofday in the - first tick after the change will be slightly wrong. */ - -#include - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(void *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries - * to verify the CPU frequency the timing core thinks the CPU is running - * at is still correct. - */ -static void cpufreq_delayed_get(void) -{ - static int warned; - if (cpufreq_init && !cpufreq_delayed_issched) { - cpufreq_delayed_issched = 1; - if (!warned) { - warned = 1; - printk(KERN_DEBUG - "Losing some ticks... 
checking if CPU frequency changed.\n"); - } - schedule_work(&cpufreq_delayed_get_work); - } -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long cpu_khz_ref = 0; - -static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, - void *data) -{ - struct cpufreq_freqs *freq = data; - unsigned long *lpj, dummy; - - if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC)) - return 0; - - lpj = &dummy; - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) -#ifdef CONFIG_SMP - lpj = &cpu_data[freq->cpu].loops_per_jiffy; -#else - lpj = &boot_cpu_data.loops_per_jiffy; -#endif - - if (!ref_freq) { - ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; - cpu_khz_ref = cpu_khz; - } - if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || - (val == CPUFREQ_RESUMECHANGE)) { - *lpj = - cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); - - cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new); - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - } - - set_cyc2ns_scale(cpu_khz_ref); - - return 0; -} - -static struct notifier_block time_cpufreq_notifier_block = { - .notifier_call = time_cpufreq_notifier -}; - -static int __init cpufreq_tsc(void) -{ - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; - return 0; -} - -core_initcall(cpufreq_tsc); - -#endif - -/* - * calibrate_tsc() calibrates the processor TSC in a very simple way, comparing - * it to the HPET timer of known frequency. - */ - -#define TICK_COUNT 100000000 - -static unsigned int __init hpet_calibrate_tsc(void) -{ - int tsc_start, hpet_start; - int tsc_now, hpet_now; - unsigned long flags; - - local_irq_save(flags); - local_irq_disable(); - - hpet_start = hpet_readl(HPET_COUNTER); - rdtscl(tsc_start); - - do { - local_irq_disable(); - hpet_now = hpet_readl(HPET_COUNTER); - tsc_now = get_cycles_sync(); - local_irq_restore(flags); - } while ((tsc_now - tsc_start) < TICK_COUNT && - (hpet_now - hpet_start) < TICK_COUNT); - - return (tsc_now - tsc_start) * 1000000000L - / ((hpet_now - hpet_start) * hpet_period / 1000); -} - /* * pit_calibrate_tsc() uses the speaker output (channel 2) of @@ -728,137 +280,84 @@ static unsigned int __init pit_calibrate return (end - start) / 50; } -#ifdef CONFIG_HPET -static __init int late_hpet_init(void) -{ - struct hpet_data hd; - unsigned int ntimer; - - if (!vxtime.hpet_address) - return 0; - - memset(&hd, 0, sizeof (hd)); - - ntimer = hpet_readl(HPET_ID); - ntimer = (ntimer & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT; - ntimer++; - - /* - * Register with driver. - * Timer0 and Timer1 is used by platform. 
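pit_calibrate_tsc(), kept above while hpet_calibrate_tsc() is removed from this file (time_init() still calls it, so it presumably moves with the new hpet code), uses the standard recipe: run a reference timer for a known interval, count TSC cycles across it, divide. The same idea against a 50 ms sleep in user space (illustrative; assumes x86 and a GCC/Clang toolchain for __rdtsc()):

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>
    #include <x86intrin.h>

    int main(void)
    {
            struct timespec ts = { 0, 50 * 1000 * 1000 }; /* 50 ms, like the PIT run */

            uint64_t start = __rdtsc();
            nanosleep(&ts, NULL);
            uint64_t end = __rdtsc();

            /* cycles per millisecond == kHz, hence the divide by 50 */
            printf("cpu_khz ~= %llu\n", (unsigned long long)((end - start) / 50));
            return 0;
    }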
- */ - hd.hd_phys_address = vxtime.hpet_address; - hd.hd_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); - hd.hd_nirqs = ntimer; - hd.hd_flags = HPET_DATA_PLATFORM; - hpet_reserve_timer(&hd, 0); -#ifdef CONFIG_HPET_EMULATE_RTC - hpet_reserve_timer(&hd, 1); -#endif - hd.hd_irq[0] = HPET_LEGACY_8254; - hd.hd_irq[1] = HPET_LEGACY_RTC; - if (ntimer > 2) { - struct hpet *hpet; - struct hpet_timer *timer; - int i; - - hpet = (struct hpet *) fix_to_virt(FIX_HPET_BASE); - timer = &hpet->hpet_timers[2]; - for (i = 2; i < ntimer; timer++, i++) - hd.hd_irq[i] = (timer->hpet_config & - Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; +#define PIT_MODE 0x43 +#define PIT_CH0 0x40 - } +static void __init __pit_init(int val, u8 mode) +{ + unsigned long flags; - hpet_alloc(&hd); - return 0; + spin_lock_irqsave(&i8253_lock, flags); + outb_p(mode, PIT_MODE); + outb_p(val & 0xff, PIT_CH0); /* LSB */ + outb_p(val >> 8, PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -fs_initcall(late_hpet_init); -#endif -static int hpet_timer_stop_set_go(unsigned long tick) +static void init_pit_timer(int mode, struct clock_event *evt) { - unsigned int cfg; - -/* - * Stop the timers and reset the main counter. - */ + unsigned long flags; - cfg = hpet_readl(HPET_CFG); - cfg &= ~(HPET_CFG_ENABLE | HPET_CFG_LEGACY); - hpet_writel(cfg, HPET_CFG); - hpet_writel(0, HPET_COUNTER); - hpet_writel(0, HPET_COUNTER + 4); + spin_lock_irqsave(&i8253_lock, flags); -/* - * Set up timer 0, as periodic with first interrupt to happen at hpet_tick, - * and period also hpet_tick. - */ - if (hpet_use_timer) { - hpet_writel(HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | - HPET_TN_32BIT, HPET_T0_CFG); - hpet_writel(hpet_tick, HPET_T0_CMP); /* next interrupt */ - hpet_writel(hpet_tick, HPET_T0_CMP); /* period */ - cfg |= HPET_CFG_LEGACY; + switch(mode) { + case CLOCK_EVT_PERIODIC: + /* binary, mode 2, LSB/MSB, ch 0 */ + outb_p(0x34, PIT_MODE); + udelay(10); + outb_p(LATCH & 0xff , PIT_CH0); /* LSB */ + outb(LATCH >> 8 , PIT_CH0); /* MSB */ + break; + + case CLOCK_EVT_ONESHOT: + /* One shot setup */ + outb_p(0x38, PIT_MODE); + udelay(10); + break; + case CLOCK_EVT_SHUTDOWN: + outb_p(0x30, PIT_MODE); + outb_p(0, PIT_CH0); /* LSB */ + outb_p(0, PIT_CH0); /* MSB */ + disable_irq(0); + break; } -/* - * Go! - */ - - cfg |= HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); - - return 0; + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_init(void) +static void pit_next_event(unsigned long delta, struct clock_event *evt) { - unsigned int id; - - if (!vxtime.hpet_address) - return -1; - set_fixmap_nocache(FIX_HPET_BASE, vxtime.hpet_address); - __set_fixmap(VSYSCALL_HPET, vxtime.hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - -/* - * Read the period, compute tick and quotient. 
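The control bytes written to PIT_MODE in init_pit_timer() above are worth decoding once: bits 7:6 select the channel, bits 5:4 the access width (11b means LSB then MSB) and bits 3:1 the counting mode. So 0x34 is channel 0, mode 2 (rate generator) for periodic operation, 0x38 is mode 4 (software-triggered strobe) for one-shot, and 0x30 is mode 0 with a zeroed count for shutdown. A quick decoder:

    #include <stdio.h>

    static void decode_pit_cmd(unsigned char cmd)
    {
            printf("0x%02x: channel %d, access %d, mode %d, %s\n",
                   cmd, cmd >> 6, (cmd >> 4) & 3, (cmd >> 1) & 7,
                   (cmd & 1) ? "BCD" : "binary");
    }

    int main(void)
    {
            decode_pit_cmd(0x34);   /* CLOCK_EVT_PERIODIC: mode 2, rate generator  */
            decode_pit_cmd(0x38);   /* CLOCK_EVT_ONESHOT:  mode 4, software strobe */
            decode_pit_cmd(0x30);   /* CLOCK_EVT_SHUTDOWN: mode 0, terminal count  */
            return 0;
    }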
- */ - - id = hpet_readl(HPET_ID); - - if (!(id & HPET_ID_VENDOR) || !(id & HPET_ID_NUMBER)) - return -1; - - hpet_period = hpet_readl(HPET_PERIOD); - if (hpet_period < 100000 || hpet_period > 100000000) - return -1; - - hpet_tick = (FSEC_PER_TICK + hpet_period / 2) / hpet_period; - - hpet_use_timer = (id & HPET_ID_LEGSUP); + unsigned long flags; - return hpet_timer_stop_set_go(hpet_tick); + spin_lock_irqsave(&i8253_lock, flags); + outb_p(delta & 0xff , PIT_CH0); /* LSB */ + outb(delta >> 8 , PIT_CH0); /* MSB */ + spin_unlock_irqrestore(&i8253_lock, flags); } -static int hpet_reenable(void) +struct clock_event pit_clockevent = { + .name = "pit", + .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE +#ifndef CONFIG_SMP + | CLOCK_CAP_NEXTEVT +#endif + , + .set_mode = init_pit_timer, + .set_next_event = pit_next_event, + .shift = 32, +}; + +void setup_pit_timer(void) { - return hpet_timer_stop_set_go(hpet_tick); + pit_clockevent.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, 32); + pit_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFF, &pit_clockevent); + pit_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &pit_clockevent); + register_global_clockevent(&pit_clockevent); } -#define PIT_MODE 0x43 -#define PIT_CH0 0x40 -static void __init __pit_init(int val, u8 mode) -{ - unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); - outb_p(mode, PIT_MODE); - outb_p(val & 0xff, PIT_CH0); /* LSB */ - outb_p(val >> 8, PIT_CH0); /* MSB */ - spin_unlock_irqrestore(&i8253_lock, flags); -} void __init pit_init(void) { @@ -873,9 +372,9 @@ void __init pit_stop_interrupt(void) void __init stop_timer_interrupt(void) { char *name; - if (vxtime.hpet_address) { + if (hpet_address) { name = "HPET"; - hpet_timer_stop_set_go(0); + hpet_stop(); } else { name = "PIT"; pit_stop_interrupt(); @@ -890,119 +389,47 @@ int __init time_setup(char *str) } static struct irqaction irq0 = { - timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL + timer_interrupt, IRQF_DISABLED | IRQF_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL }; void __init time_init(void) { char *timename; - char *gtod; if (nohpet) - vxtime.hpet_address = 0; - + hpet_address = 0; xtime.tv_sec = get_cmos_time(); xtime.tv_nsec = 0; set_normalized_timespec(&wall_to_monotonic, -xtime.tv_sec, -xtime.tv_nsec); - if (!hpet_init()) - vxtime_hz = (FSEC_PER_SEC + hpet_period / 2) / hpet_period; - else - vxtime.hpet_address = 0; + if (hpet_arch_init()) + hpet_address = 0; + + setup_pit_timer(); if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ tick_nsec = TICK_NSEC_HPET; cpu_khz = hpet_calibrate_tsc(); timename = "HPET"; -#ifdef CONFIG_X86_PM_TIMER - } else if (pmtmr_ioport && !vxtime.hpet_address) { - vxtime_hz = PM_TIMER_FREQUENCY; - timename = "PM"; - pit_init(); - cpu_khz = pit_calibrate_tsc(); -#endif } else { pit_init(); cpu_khz = pit_calibrate_tsc(); timename = "PIT"; } - vxtime.mode = VXTIME_TSC; - gtod = time_init_gtod(); + if (unsynchronized_tsc()) + mark_tsc_unstable(); - printk(KERN_INFO "time.c: Using %ld.%06ld MHz WALL %s GTOD %s timer.\n", - vxtime_hz / 1000000, vxtime_hz % 1000000, timename, gtod); printk(KERN_INFO "time.c: Detected %d.%03d MHz processor.\n", cpu_khz / 1000, cpu_khz % 1000); - vxtime.quot = (USEC_PER_SEC << US_SCALE) / vxtime_hz; - vxtime.tsc_quot = (USEC_PER_MSEC << US_SCALE) / cpu_khz; - vxtime.last_tsc = get_cycles_sync(); setup_irq(0, &irq0); set_cyc2ns_scale(cpu_khz); } -/* - * Make an educated guess if the TSC is trustworthy and synchronized - * over all CPUs. 
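setup_pit_timer() above sizes the event device with the usual fixed-point helpers: div_sc() yields mult roughly equal to (CLOCK_TICK_RATE << 32) / NSEC_PER_SEC, and clockevent_delta2ns() inverts that. Plugging in the PIT's 1193182 Hz input clock, as a standalone arithmetic check (modulo the kernel's rounding):

    #include <stdio.h>
    #include <stdint.h>

    #define CLOCK_TICK_RATE 1193182ULL      /* PIT input clock in Hz */
    #define NSEC_PER_SEC    1000000000ULL
    #define SHIFT           32

    int main(void)
    {
            uint64_t mult = (CLOCK_TICK_RATE << SHIFT) / NSEC_PER_SEC; /* div_sc() */

            /* clockevent_delta2ns(): how far ahead the PIT can be programmed */
            uint64_t max_ns = ((uint64_t)0x7FFF << SHIFT) / mult;
            uint64_t min_ns = ((uint64_t)0xF << SHIFT) / mult;

            printf("mult=%llu, max_delta~=%llu ns, min_delta~=%llu ns\n",
                   (unsigned long long)mult, (unsigned long long)max_ns,
                   (unsigned long long)min_ns);
            return 0;
    }

That gives a maximum programmable delta of roughly 27 ms, which is why a pure PIT one-shot mode has to be reprogrammed at least about 36 times a second even on an idle system.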
- */ -__cpuinit int unsynchronized_tsc(void) -{ -#ifdef CONFIG_SMP - if (apic_is_clustered_box()) - return 1; -#endif - /* Most intel systems have synchronized TSCs except for - multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { -#ifdef CONFIG_ACPI - /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) - return 1; -#endif - return 0; - } - - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; -} - -/* - * Decide what mode gettimeofday should use. - */ -__init static char *time_init_gtod(void) -{ - char *timetype; - - if (unsynchronized_tsc()) - notsc = 1; - if (vxtime.hpet_address && notsc) { - timetype = hpet_use_timer ? "HPET" : "PIT/HPET"; - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); - vxtime.mode = VXTIME_HPET; - do_gettimeoffset = do_gettimeoffset_hpet; -#ifdef CONFIG_X86_PM_TIMER - /* Using PM for gettimeofday is quite slow, but we have no other - choice because the TSC is too unreliable on some systems. */ - } else if (pmtmr_ioport && !vxtime.hpet_address && notsc) { - timetype = "PM"; - do_gettimeoffset = do_gettimeoffset_pm; - vxtime.mode = VXTIME_PMTMR; - sysctl_vsyscall = 0; - printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n"); -#endif - } else { - timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC"; - vxtime.mode = VXTIME_TSC; - } - return timetype; -} __setup("report_lost_ticks", time_setup); @@ -1033,7 +460,7 @@ static int timer_resume(struct sys_devic unsigned long ctime = get_cmos_time(); unsigned long sleep_length = (ctime - sleep_start) * HZ; - if (vxtime.hpet_address) + if (hpet_address) hpet_reenable(); else i8254_timer_resume(); @@ -1042,21 +469,9 @@ static int timer_resume(struct sys_devic write_seqlock_irqsave(&xtime_lock,flags); xtime.tv_sec = sec; xtime.tv_nsec = 0; - if (vxtime.mode == VXTIME_HPET) { - if (hpet_use_timer) - vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick; - else - vxtime.last = hpet_readl(HPET_COUNTER); -#ifdef CONFIG_X86_PM_TIMER - } else if (vxtime.mode == VXTIME_PMTMR) { - pmtimer_resume(); -#endif - } else - vxtime.last_tsc = get_cycles_sync(); - write_sequnlock_irqrestore(&xtime_lock,flags); jiffies += sleep_length; wall_jiffies += sleep_length; - monotonic_base += sleep_length * (NSEC_PER_SEC/HZ); + write_sequnlock_irqrestore(&xtime_lock,flags); touch_softlockup_watchdog(); return 0; } @@ -1083,243 +498,3 @@ static int time_init_device(void) device_initcall(time_init_device); -#ifdef CONFIG_HPET_EMULATE_RTC -/* HPET in LegacyReplacement Mode eats up RTC interrupt line. When, HPET - * is enabled, we support RTC interrupt functionality in software. - * RTC has 3 kinds of interrupts: - * 1) Update Interrupt - generate an interrupt, every sec, when RTC clock - * is updated - * 2) Alarm Interrupt - generate an interrupt at a specific time of day - * 3) Periodic Interrupt - generate periodic interrupt, with frequencies - * 2Hz-8192Hz (2Hz-64Hz for non-root user) (all freqs in powers of 2) - * (1) and (2) above are implemented using polling at a frequency of - * 64 Hz. The exact frequency is a tradeoff between accuracy and interrupt - * overhead. (DEFAULT_RTC_INT_FREQ) - * For (3), we use interrupts at 64Hz or user specified periodic - * frequency, whichever is higher. 
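The RTC emulation whose removal starts here (it reappears verbatim in the hpet clocksource code earlier in this patch) divides one hardware interrupt stream down in software: timer 1 runs at max(64 Hz, PIE_freq), and RTC_PF is delivered every hpet_rtc_int_freq/PIE_freq ticks. Worked through for an assumed 16 Hz consumer:

    #include <stdio.h>

    int main(void)
    {
            unsigned long hpet_rtc_int_freq = 64;   /* DEFAULT_RTC_INT_FREQ */
            unsigned long PIE_freq = 16;            /* assumed user setting */
            unsigned long PIE_count = 0, delivered = 0, tick;

            /* one simulated second of timer-1 interrupts */
            for (tick = 0; tick < hpet_rtc_int_freq; tick++) {
                    PIE_count++;
                    if (PIE_count >= hpet_rtc_int_freq / PIE_freq) {
                            delivered++;            /* rtc_int_flag |= RTC_PF */
                            PIE_count = 0;
                    }
            }
            printf("%lu RTC_PF interrupts in one second at PIE_freq=%lu\n",
                   delivered, PIE_freq);            /* prints 16 */
            return 0;
    }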
- */ -#include - -#define DEFAULT_RTC_INT_FREQ 64 -#define RTC_NUM_INTS 1 - -static unsigned long UIE_on; -static unsigned long prev_update_sec; - -static unsigned long AIE_on; -static struct rtc_time alarm_time; - -static unsigned long PIE_on; -static unsigned long PIE_freq = DEFAULT_RTC_INT_FREQ; -static unsigned long PIE_count; - -static unsigned long hpet_rtc_int_freq; /* RTC interrupt frequency */ -static unsigned int hpet_t1_cmp; /* cached comparator register */ - -int is_hpet_enabled(void) -{ - return vxtime.hpet_address != 0; -} - -/* - * Timer 1 for RTC, we do not use periodic interrupt feature, - * even if HPET supports periodic interrupts on Timer 1. - * The reason being, to set up a periodic interrupt in HPET, we need to - * stop the main counter. And if we do that everytime someone diables/enables - * RTC, we will have adverse effect on main kernel timer running on Timer 0. - * So, for the time being, simulate the periodic interrupt in software. - * - * hpet_rtc_timer_init() is called for the first time and during subsequent - * interuppts reinit happens through hpet_rtc_timer_reinit(). - */ -int hpet_rtc_timer_init(void) -{ - unsigned int cfg, cnt; - unsigned long flags; - - if (!is_hpet_enabled()) - return 0; - /* - * Set the counter 1 and enable the interrupts. - */ - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - local_irq_save(flags); - cnt = hpet_readl(HPET_COUNTER); - cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq); - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; - local_irq_restore(flags); - - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_PERIODIC; - cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T1_CFG); - - return 1; -} - -static void hpet_rtc_timer_reinit(void) -{ - unsigned int cfg, cnt; - - if (unlikely(!(PIE_on | AIE_on | UIE_on))) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } - - if (PIE_on && (PIE_freq > DEFAULT_RTC_INT_FREQ)) - hpet_rtc_int_freq = PIE_freq; - else - hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ; - - /* It is more accurate to use the comparator value than current count.*/ - cnt = hpet_t1_cmp; - cnt += hpet_tick*HZ/hpet_rtc_int_freq; - hpet_writel(cnt, HPET_T1_CMP); - hpet_t1_cmp = cnt; -} - -/* - * The functions below are called from rtc driver. - * Return 0 if HPET is not being used. - * Otherwise do the necessary changes and return 1. 
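
The periodic-interrupt emulation deleted above hinges on one piece of arithmetic: each interrupt re-arms HPET timer 1 by advancing the previous comparator value by hpet_tick*HZ/freq counter ticks. Advancing from the cached comparator rather than from the live counter is what the "more accurate to use the comparator value than current count" remark is getting at: periodic error never accumulates. Reduced to its essentials (a stand-alone illustration, not code from this patch):

    #include <stdint.h>

    /* hpet_tick: HPET counter ticks per kernel tick (1/HZ);
     * freq:      desired software-RTC interrupt frequency in Hz.
     * Returns the next comparator value for a free-running 32-bit
     * counter; unsigned wraparound keeps this correct at rollover. */
    static uint32_t next_comparator(uint32_t prev_cmp, uint32_t hpet_tick,
                                    unsigned int hz, unsigned int freq)
    {
        return prev_cmp + hpet_tick * hz / freq;
    }
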
- */ -int hpet_mask_rtc_irq_bit(unsigned long bit_mask) -{ - if (!is_hpet_enabled()) - return 0; - - if (bit_mask & RTC_UIE) - UIE_on = 0; - if (bit_mask & RTC_PIE) - PIE_on = 0; - if (bit_mask & RTC_AIE) - AIE_on = 0; - - return 1; -} - -int hpet_set_rtc_irq_bit(unsigned long bit_mask) -{ - int timer_init_reqd = 0; - - if (!is_hpet_enabled()) - return 0; - - if (!(PIE_on | AIE_on | UIE_on)) - timer_init_reqd = 1; - - if (bit_mask & RTC_UIE) { - UIE_on = 1; - } - if (bit_mask & RTC_PIE) { - PIE_on = 1; - PIE_count = 0; - } - if (bit_mask & RTC_AIE) { - AIE_on = 1; - } - - if (timer_init_reqd) - hpet_rtc_timer_init(); - - return 1; -} - -int hpet_set_alarm_time(unsigned char hrs, unsigned char min, unsigned char sec) -{ - if (!is_hpet_enabled()) - return 0; - - alarm_time.tm_hour = hrs; - alarm_time.tm_min = min; - alarm_time.tm_sec = sec; - - return 1; -} - -int hpet_set_periodic_freq(unsigned long freq) -{ - if (!is_hpet_enabled()) - return 0; - - PIE_freq = freq; - PIE_count = 0; - - return 1; -} - -int hpet_rtc_dropped_irq(void) -{ - if (!is_hpet_enabled()) - return 0; - - return 1; -} - -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) -{ - struct rtc_time curr_time; - unsigned long rtc_int_flag = 0; - int call_rtc_interrupt = 0; - - hpet_rtc_timer_reinit(); - - if (UIE_on | AIE_on) { - rtc_get_rtc_time(&curr_time); - } - if (UIE_on) { - if (curr_time.tm_sec != prev_update_sec) { - /* Set update int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag = RTC_UF; - prev_update_sec = curr_time.tm_sec; - } - } - if (PIE_on) { - PIE_count++; - if (PIE_count >= hpet_rtc_int_freq/PIE_freq) { - /* Set periodic int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_PF; - PIE_count = 0; - } - } - if (AIE_on) { - if ((curr_time.tm_sec == alarm_time.tm_sec) && - (curr_time.tm_min == alarm_time.tm_min) && - (curr_time.tm_hour == alarm_time.tm_hour)) { - /* Set alarm int info, call real rtc int routine */ - call_rtc_interrupt = 1; - rtc_int_flag |= RTC_AF; - } - } - if (call_rtc_interrupt) { - rtc_int_flag |= (RTC_IRQF | (RTC_NUM_INTS << 8)); - rtc_interrupt(rtc_int_flag, dev_id, regs); - } - return IRQ_HANDLED; -} -#endif - -static int __init nohpet_setup(char *s) -{ - nohpet = 1; - return 1; -} - -__setup("nohpet", nohpet_setup); - -int __init notsc_setup(char *s) -{ - notsc = 1; - return 1; -} - -__setup("notsc", notsc_setup); Index: linux/arch/x86_64/kernel/traps.c =================================================================== --- linux.orig/arch/x86_64/kernel/traps.c +++ linux/arch/x86_64/kernel/traps.c @@ -368,6 +368,7 @@ void show_trace(struct task_struct *tsk, #undef HANDLE_STACK printk("\n"); + print_traces(tsk); } static void _show_stack(struct task_struct *tsk, struct pt_regs *regs, unsigned long * rsp) @@ -497,7 +498,7 @@ void out_of_line_bug(void) EXPORT_SYMBOL(out_of_line_bug); #endif -static DEFINE_SPINLOCK(die_lock); +static DEFINE_RAW_SPINLOCK(die_lock); static int die_owner = -1; static unsigned int die_nest_count; @@ -773,9 +774,6 @@ asmlinkage __kprobes void default_do_nmi reason = get_nmi_reason(); if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; #ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, @@ -786,6 +784,9 @@ asmlinkage __kprobes void default_do_nmi return; } #endif + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; unknown_nmi_error(reason, 
regs);
 return;
 }
Index: linux/arch/x86_64/kernel/tsc.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/kernel/tsc.c
@@ -0,0 +1,229 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#define NS_SCALE 10 /* 2^10, carefully chosen */
+#define US_SCALE 32 /* 2^32, arbitrarily chosen */
+
+static int notsc __initdata = 0;
+
+unsigned int cpu_khz; /* TSC clocks / usec, not used here */
+EXPORT_SYMBOL(cpu_khz);
+
+static unsigned int cyc2ns_scale __read_mostly;
+
+void set_cyc2ns_scale(unsigned long khz)
+{
+ cyc2ns_scale = (NSEC_PER_MSEC << NS_SCALE) / khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+ return (cyc * cyc2ns_scale) >> NS_SCALE;
+}
+
+unsigned long long sched_clock(void)
+{
+ unsigned long a = 0;
+
+ /* Could do CPU core sync here. Opteron can execute rdtsc speculatively,
+ which means it is not completely exact and may not be monotonic between
+ CPUs. But the errors should be too small to matter for scheduling
+ purposes. */
+
+ rdtscll(a);
+ return cycles_2_ns(a);
+}
+
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+ return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+ tsc_unstable = 1;
+}
+EXPORT_SYMBOL_GPL(mark_tsc_unstable);
+
+#ifdef CONFIG_CPU_FREQ
+
+/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
+ changes.
+
+ RED-PEN: On SMP we assume all CPUs run with the same frequency. It's
+ not that important because current Opteron setups do not support
+ scaling on SMP anyway.
+
+ Should fix up last_tsc too. Currently gettimeofday in the
+ first tick after the change will be slightly wrong. */
+
+#include
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+ unsigned int cpu;
+ for_each_online_cpu(cpu) {
+ cpufreq_get(cpu);
+ }
+ cpufreq_delayed_issched = 0;
+}
+
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+static unsigned long cpu_khz_ref = 0;
+
+static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
+ void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ unsigned long *lpj, dummy;
+
+ if (cpu_has(&cpu_data[freq->cpu], X86_FEATURE_CONSTANT_TSC))
+ return 0;
+
+ lpj = &dummy;
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+#ifdef CONFIG_SMP
+ lpj = &cpu_data[freq->cpu].loops_per_jiffy;
+#else
+ lpj = &boot_cpu_data.loops_per_jiffy;
+#endif
+
+ if (!ref_freq) {
+ ref_freq = freq->old;
+ loops_per_jiffy_ref = *lpj;
+ cpu_khz_ref = cpu_khz;
+ }
+ if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) ||
+ (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+ (val == CPUFREQ_RESUMECHANGE)) {
+ *lpj =
+ cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+ cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
+ if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+ mark_tsc_unstable();
+ }
+
+ set_cyc2ns_scale(cpu_khz_ref);
+
+ return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+ .notifier_call = time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+ INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+ if (!cpufreq_register_notifier(&time_cpufreq_notifier_block,
+ CPUFREQ_TRANSITION_NOTIFIER))
+ cpufreq_init = 1;
+ return 0;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+/*
+ * Make an educated guess if the TSC
is trustworthy and synchronized + * over all CPUs. + */ +__cpuinit int unsynchronized_tsc(void) +{ +#ifdef CONFIG_SMP + if (apic_is_clustered_box()) + return 1; +#endif + /* Most intel systems have synchronized TSCs except for + multi node systems */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { +#ifdef CONFIG_ACPI + /* But TSC doesn't tick in C3 so don't use it there */ + if (acpi_fadt.length > 0 && acpi_fadt.plvl3_lat < 100) + return 1; +#endif + return 0; + } + + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; +} + +int __init notsc_setup(char *s) +{ + notsc = 1; + return 1; +} + +__setup("notsc", notsc_setup); + + +/* clock source code: */ + +static int tsc_update_callback(void); + +static cycle_t read_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret = (cycle_t)get_cycles_sync(); + return ret; +} + +static struct clocksource clocksource_tsc = { + .name = "tsc", + .rating = 300, + .read = read_tsc, + .mask = (cycle_t)-1, + .mult = 0, /* to be set */ + .shift = 22, + .update_callback = tsc_update_callback, + .is_continuous = 1, + .vread = vread_tsc, +}; + +static int tsc_update_callback(void) +{ + int change = 0; + + /* check to see if we should switch to the safe clocksource: */ + if (clocksource_tsc.rating != 50 && check_tsc_unstable()) { + clocksource_tsc.rating = 50; + clocksource_reselect(); + change = 1; + } + return change; +} + +static int __init init_tsc_clocksource(void) +{ + if (!notsc) { + clocksource_tsc.mult = clocksource_khz2mult(cpu_khz, + clocksource_tsc.shift); + return clocksource_register(&clocksource_tsc); + } + return 0; +} + +module_init(init_tsc_clocksource); Index: linux/arch/x86_64/kernel/vmlinux.lds.S =================================================================== --- linux.orig/arch/x86_64/kernel/vmlinux.lds.S +++ linux/arch/x86_64/kernel/vmlinux.lds.S @@ -93,27 +93,11 @@ SECTIONS __vsyscall_0 = VSYSCALL_VIRT_ADDR; . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .xtime_lock : AT(VLOAD(.xtime_lock)) { *(.xtime_lock) } - xtime_lock = VVIRT(.xtime_lock); - - .vxtime : AT(VLOAD(.vxtime)) { *(.vxtime) } - vxtime = VVIRT(.vxtime); - - .wall_jiffies : AT(VLOAD(.wall_jiffies)) { *(.wall_jiffies) } - wall_jiffies = VVIRT(.wall_jiffies); - - .sys_tz : AT(VLOAD(.sys_tz)) { *(.sys_tz) } - sys_tz = VVIRT(.sys_tz); - - .sysctl_vsyscall : AT(VLOAD(.sysctl_vsyscall)) { *(.sysctl_vsyscall) } - sysctl_vsyscall = VVIRT(.sysctl_vsyscall); - - .xtime : AT(VLOAD(.xtime)) { *(.xtime) } - xtime = VVIRT(.xtime); - + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } . 
= ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) } Index: linux/arch/x86_64/kernel/vsyscall.c =================================================================== --- linux.orig/arch/x86_64/kernel/vsyscall.c +++ linux/arch/x86_64/kernel/vsyscall.c @@ -26,65 +26,50 @@ #include #include #include +#include #include #include #include +#include #include #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) - -int __sysctl_vsyscall __section_sysctl_vsyscall = 1; -seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; +#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace -#include - -static __always_inline void timeval_normalize(struct timeval * tv) -{ - time_t __sec; +struct vsyscall_gtod_data_t { + raw_seqlock_t lock; + int sysctl_enabled; + struct timeval wall_time_tv; + struct timezone sys_tz; + cycle_t offset_base; + struct clocksource clock; +}; - __sec = tv->tv_usec / 1000000; - if (__sec) { - tv->tv_usec %= 1000000; - tv->tv_sec += __sec; - } -} +struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = { + .lock = __RAW_SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), + .sysctl_enabled = 1, +}; -static __always_inline void do_vgettimeofday(struct timeval * tv) +void update_vsyscall(struct timespec* wall_time, struct clocksource* clock) { - long sequence, t; - unsigned long sec, usec; + unsigned long flags; - do { - sequence = read_seqbegin(&__xtime_lock); - - sec = __xtime.tv_sec; - usec = (__xtime.tv_nsec / 1000) + - (__jiffies - __wall_jiffies) * (1000000 / HZ); - - if (__vxtime.mode != VXTIME_HPET) { - t = get_cycles_sync(); - if (t < __vxtime.last_tsc) - t = __vxtime.last_tsc; - usec += ((t - __vxtime.last_tsc) * - __vxtime.tsc_quot) >> 32; - /* See comment in x86_64 do_gettimeofday. */ - } else { - usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) - - __vxtime.last) * __vxtime.quot) >> 32; - } - } while (read_seqretry(&__xtime_lock, sequence)); + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ + vsyscall_gtod_data.clock = *clock; + vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; + vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; + vsyscall_gtod_data.sys_tz = sys_tz; - tv->tv_sec = sec + usec / 1000000; - tv->tv_usec = usec % 1000000; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } /* RED-PEN may want to readd seq locking, but then the variable should be write-once. 
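
The replacement do_vgettimeofday() added a little further down in this file is the classic seqlock read loop: copy every input (cycle_last, mask, mult, shift, the wall time) inside read_seqbegin()/read_seqretry(), then do the mult/shift conversion on the consistent snapshot outside the loop. The stand-alone sketch below mirrors that shape with an invented snapshot structure and GCC barriers; it is illustrative only, not the kernel's types or code:

    #include <stdint.h>

    struct snapshot {
        volatile uint32_t seq;  /* odd while the writer is mid-update */
        uint64_t cycle_last;    /* counter value at last kernel update */
        uint64_t wall_usec;     /* wall time at cycle_last, in usec */
        uint32_t mult, shift;   /* cycles -> nsec fixed-point pair */
    };

    static uint64_t read_wall_usec(struct snapshot *s,
                                   uint64_t (*rdclk)(void))
    {
        uint32_t seq, mult, shift;
        uint64_t now, base, usec;

        do {
            seq = s->seq;
            __sync_synchronize();   /* pairs with the writer's barriers */
            now = rdclk();
            base = s->cycle_last;
            usec = s->wall_usec;
            mult = s->mult;
            shift = s->shift;
            __sync_synchronize();
        } while ((seq & 1) || s->seq != seq);

        /* delta since the last kernel update, converted to usec */
        return usec + (((now - base) * mult) >> shift) / 1000;
    }
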
 */
static __always_inline void do_get_tz(struct timezone * tz)
{
- *tz = __sys_tz;
+ *tz = __vsyscall_gtod_data.sys_tz;
}
static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -105,10 +90,44 @@ static __always_inline long time_syscall
 return secs;
 }
+static __always_inline void do_vgettimeofday(struct timeval * tv)
+{
+ cycle_t now, base, mask, cycle_delta;
+ unsigned long seq, mult, shift, nsec_delta;
+ cycle_t (*vread)(void);
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ vread = __vsyscall_gtod_data.clock.vread;
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) {
+ gettimeofday(tv,0);
+ return;
+ }
+ now = vread();
+ base = __vsyscall_gtod_data.clock.cycle_last;
+ mask = __vsyscall_gtod_data.clock.mask;
+ mult = __vsyscall_gtod_data.clock.mult;
+ shift = __vsyscall_gtod_data.clock.shift;
+
+ *tv = __vsyscall_gtod_data.wall_time_tv;
+
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
+ /* calculate interval: */
+ cycle_delta = (now - base) & mask;
+ /* convert to nsecs: */
+ nsec_delta = (cycle_delta * mult) >> shift;
+
+ /* convert to usecs and add to timespec: */
+ tv->tv_usec += nsec_delta / NSEC_PER_USEC;
+ while (tv->tv_usec >= USEC_PER_SEC) {
+ tv->tv_sec += 1;
+ tv->tv_usec -= USEC_PER_SEC;
+ }
+}
+
 int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
 {
- if (!__sysctl_vsyscall)
- return gettimeofday(tv,tz);
 if (tv)
 do_vgettimeofday(tv);
 if (tz)
@@ -120,11 +139,11 @@ int __vsyscall(0) vgettimeofday(struct t
 * unlikely */
 time_t __vsyscall(1) vtime(time_t *t)
 {
- if (!__sysctl_vsyscall)
+ if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
 return time_syscall(t);
 else if (t)
- *t = __xtime.tv_sec;
- return __xtime.tv_sec;
+ *t = __vsyscall_gtod_data.wall_time_tv.tv_sec;
+ return __vsyscall_gtod_data.wall_time_tv.tv_sec;
 }
 
 long __vsyscall(2) venosys_0(void)
@@ -163,7 +182,7 @@ static int vsyscall_sysctl_change(ctl_ta
 ret = -ENOMEM;
 goto out;
 }
- if (!sysctl_vsyscall) {
+ if (!vsyscall_gtod_data.sysctl_enabled) {
 *map1 = SYSCALL;
 *map2 = SYSCALL;
 } else {
@@ -186,7 +205,7 @@ static int vsyscall_sysctl_nostrat(ctl_t
 static ctl_table kernel_table2[] = {
 { .ctl_name = 99, .procname = "vsyscall64",
- .data = &sysctl_vsyscall, .maxlen = sizeof(int), .mode = 0644,
+ .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644,
 .strategy = vsyscall_sysctl_nostrat,
 .proc_handler = vsyscall_sysctl_change },
 { 0, }
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -12,10 +12,12 @@
 
 EXPORT_SYMBOL(kernel_thread);
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 
 EXPORT_SYMBOL(__get_user_1);
 EXPORT_SYMBOL(__get_user_2);
Index: linux/arch/x86_64/lib/thunk.S
===================================================================
--- linux.orig/arch/x86_64/lib/thunk.S
+++ linux/arch/x86_64/lib/thunk.S
@@ -42,11 +42,13 @@
 thunk rwsem_wake_thunk,rwsem_wake
 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif
-
- thunk __down_failed,__down
- thunk_retrax __down_failed_interruptible,__down_interruptible
- thunk_retrax
__down_failed_trylock,__down_trylock - thunk __up_wakeup,__up + +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK + thunk __compat_down_failed,__compat_down + thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible + thunk_retrax __compat_down_failed_trylock,__compat_down_trylock + thunk __compat_up_wakeup,__compat_up +#endif #ifdef CONFIG_TRACE_IRQFLAGS thunk trace_hardirqs_on_thunk,trace_hardirqs_on Index: linux/arch/x86_64/mm/fault.c =================================================================== --- linux.orig/arch/x86_64/mm/fault.c +++ linux/arch/x86_64/mm/fault.c @@ -79,6 +79,7 @@ void bust_spinlocks(int yes) { int loglevel_save = console_loglevel; if (yes) { + stop_trace(); oops_in_progress = 1; } else { #ifdef CONFIG_VT Index: linux/arch/x86_64/mm/init.c =================================================================== --- linux.orig/arch/x86_64/mm/init.c +++ linux/arch/x86_64/mm/init.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(dma_ops); static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); +DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers); /* * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the Index: linux/block/cfq-iosched.c =================================================================== --- linux.orig/block/cfq-iosched.c +++ linux/block/cfq-iosched.c @@ -1283,7 +1283,7 @@ static void cfq_exit_single_io_context(s q = cfqd->queue; - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); spin_lock(q->queue_lock); Index: linux/block/ll_rw_blk.c =================================================================== --- linux.orig/block/ll_rw_blk.c +++ linux/block/ll_rw_blk.c @@ -1547,7 +1547,7 @@ static int ll_merge_requests_fn(request_ */ void blk_plug_device(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); /* * don't plug a stopped queue, it must be paired with blk_start_queue() @@ -1570,7 +1570,7 @@ EXPORT_SYMBOL(blk_plug_device); */ int blk_remove_plug(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) return 0; @@ -1662,7 +1662,7 @@ static void blk_unplug_timeout(unsigned **/ void blk_start_queue(request_queue_t *q) { - WARN_ON(!irqs_disabled()); + WARN_ON_NONRT(!irqs_disabled()); clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); @@ -3584,13 +3584,15 @@ void exit_io_context(void) struct io_context *ioc; struct cfq_io_context *cic; - local_irq_save(flags); + // FIXME: unsafe upstream too? 
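
The local_irq_save_nort()/local_irq_restore_nort() pair used in exit_io_context() above (and throughout the IDE changes later in this series), like WARN_ON_NONRT(), comes from elsewhere in the -rt patch set. The idea: on a stock kernel they are the ordinary operations; on PREEMPT_RT, where the locks involved are sleepable, they degrade to near no-ops. Roughly like this; this is the assumed shape for illustration, not the actual -rt definitions:

    #ifdef CONFIG_PREEMPT_RT
    /* RT: most spinlocks can sleep, so hard IRQ-disabling around them
     * is neither needed nor wanted; keep the flags variable alive. */
    # define local_irq_save_nort(flags)    local_save_flags(flags)
    # define local_irq_restore_nort(flags) ((void)(flags))
    # define WARN_ON_NONRT(condition)      do { } while (0)
    #else
    # define local_irq_save_nort(flags)    local_irq_save(flags)
    # define local_irq_restore_nort(flags) local_irq_restore(flags)
    # define WARN_ON_NONRT(condition)      WARN_ON(condition)
    #endif
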
+ + local_irq_save_nort(flags); task_lock(current); ioc = current->io_context; current->io_context = NULL; ioc->task = NULL; task_unlock(current); - local_irq_restore(flags); + local_irq_restore_nort(flags); if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); Index: linux/drivers/acpi/executer/exmutex.c =================================================================== --- linux.orig/drivers/acpi/executer/exmutex.c +++ linux/drivers/acpi/executer/exmutex.c @@ -267,9 +267,9 @@ acpi_ex_release_mutex(union acpi_operand && (obj_desc->mutex.os_mutex != ACPI_GLOBAL_LOCK)) { ACPI_ERROR((AE_INFO, "Thread %X cannot release Mutex [%4.4s] acquired by thread %X", - (u32) walk_state->thread->thread_id, + (u32)(long) walk_state->thread->thread_id, acpi_ut_get_node_name(obj_desc->mutex.node), - (u32) obj_desc->mutex.owner_thread->thread_id)); + (u32)(long) obj_desc->mutex.owner_thread->thread_id)); return_ACPI_STATUS(AE_AML_NOT_OWNER); } Index: linux/drivers/acpi/osl.c =================================================================== --- linux.orig/drivers/acpi/osl.c +++ linux/drivers/acpi/osl.c @@ -676,13 +676,13 @@ void acpi_os_delete_lock(acpi_spinlock h acpi_status acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle) { - struct semaphore *sem = NULL; + struct compat_semaphore *sem = NULL; - sem = acpi_os_allocate(sizeof(struct semaphore)); + sem = acpi_os_allocate(sizeof(struct compat_semaphore)); if (!sem) return AE_NO_MEMORY; - memset(sem, 0, sizeof(struct semaphore)); + memset(sem, 0, sizeof(struct compat_semaphore)); sema_init(sem, initial_units); @@ -705,7 +705,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore); acpi_status acpi_os_delete_semaphore(acpi_handle handle) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem) @@ -733,7 +733,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore); acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout) { acpi_status status = AE_OK; - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; int ret = 0; @@ -820,7 +820,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore); */ acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units) { - struct semaphore *sem = (struct semaphore *)handle; + struct compat_semaphore *sem = (struct compat_semaphore *)handle; if (!sem || (units < 1)) Index: linux/drivers/acpi/processor_idle.c =================================================================== --- linux.orig/drivers/acpi/processor_idle.c +++ linux/drivers/acpi/processor_idle.c @@ -38,9 +38,11 @@ #include #include #include /* need_resched() */ +#include #include #include +#include #include #include @@ -368,10 +370,12 @@ static void acpi_processor_idle(void) /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -412,10 +416,12 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } +#ifndef CONFIG_IA64 #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ mark_tsc_unstable(); #endif +#endif /* Re-enable interrupts */ local_irq_enable(); current_thread_info()->status |= TS_POLLING; @@ -453,7 +459,8 @@ static void acpi_processor_idle(void) */ if (cx->promotion.state && ((cx->promotion.state - pr->power.states) <= max_cstate)) { - if 
(sleep_ticks > cx->promotion.threshold.ticks) { + if (sleep_ticks > cx->promotion.threshold.ticks && + cx->promotion.state->latency <= system_latency_constraint()) { cx->promotion.count++; cx->demotion.count = 0; if (cx->promotion.count >= @@ -494,8 +501,10 @@ static void acpi_processor_idle(void) end: /* * Demote if current state exceeds max_cstate + * or if the latency of the current state is unacceptable */ - if ((pr->power.state - pr->power.states) > max_cstate) { + if ((pr->power.state - pr->power.states) > max_cstate || + pr->power.state->latency > system_latency_constraint()) { if (cx->demotion.state) next_state = cx->demotion.state; } @@ -1009,9 +1018,10 @@ static int acpi_processor_power_seq_show seq_printf(seq, "active state: C%zd\n" "max_cstate: C%d\n" - "bus master activity: %08x\n", + "bus master activity: %08x\n" + "maximum allowed latency: %d usec\n", pr->power.state ? pr->power.state - pr->power.states : 0, - max_cstate, (unsigned)pr->power.bm_activity); + max_cstate, (unsigned)pr->power.bm_activity, system_latency_constraint()); seq_puts(seq, "states:\n"); @@ -1077,6 +1087,29 @@ static const struct file_operations acpi .release = single_release, }; + +static void smp_callback(void *v) +{ + /* we already woke the CPU up, nothing more to do */ +} + +/* + * This function gets called when a part of the kernel has a new latency requirement. + * This means we need to get all processors out of their C-state, and then recalculate + * a new suitable C-state. Just do a cross-cpu IPI; that wakes them all right up. + */ +static int acpi_processor_latency_notify(struct notifier_block *b, + unsigned long l, void *v) +{ + smp_call_function(smp_callback, NULL, 0, 1); + return NOTIFY_OK; +} + +static struct notifier_block acpi_processor_latency_notifier = { + .notifier_call = acpi_processor_latency_notify, +}; + + int acpi_processor_power_init(struct acpi_processor *pr, struct acpi_device *device) { @@ -1093,6 +1126,7 @@ int acpi_processor_power_init(struct acp "ACPI: processor limited to max C-state %d\n", max_cstate); first_run++; + register_latency_notifier(&acpi_processor_latency_notifier); } if (!pr) @@ -1164,6 +1198,7 @@ int acpi_processor_power_exit(struct acp * copies of pm_idle before proceeding. 
 */
 cpu_idle_wait();
+ unregister_latency_notifier(&acpi_processor_latency_notifier);
 }
 return 0;
Index: linux/drivers/acpi/tables/tbget.c
===================================================================
--- linux.orig/drivers/acpi/tables/tbget.c
+++ linux/drivers/acpi/tables/tbget.c
@@ -325,7 +325,7 @@ acpi_tb_get_this_table(struct acpi_point
 if (header->length < sizeof(struct acpi_table_header)) {
 ACPI_ERROR((AE_INFO,
 "Table length (%X) is smaller than minimum (%X)",
- header->length, sizeof(struct acpi_table_header)));
+ header->length, (int)sizeof(struct acpi_table_header)));
 
 return_ACPI_STATUS(AE_INVALID_TABLE_LENGTH);
 }
Index: linux/drivers/acpi/tables/tbrsdt.c
===================================================================
--- linux.orig/drivers/acpi/tables/tbrsdt.c
+++ linux/drivers/acpi/tables/tbrsdt.c
@@ -189,7 +189,7 @@ acpi_status acpi_tb_validate_rsdt(struct
 ACPI_ERROR((AE_INFO,
 "RSDT/XSDT length (%X) is smaller than minimum (%X)",
 table_ptr->length,
- sizeof(struct acpi_table_header)));
+ (int)sizeof(struct acpi_table_header)));
 
 return (AE_INVALID_TABLE_LENGTH);
 }
Index: linux/drivers/acpi/utilities/utmutex.c
===================================================================
--- linux.orig/drivers/acpi/utilities/utmutex.c
+++ linux/drivers/acpi/utilities/utmutex.c
@@ -259,7 +259,7 @@ acpi_status acpi_ut_acquire_mutex(acpi_m
 } else {
 ACPI_EXCEPTION((AE_INFO, status,
 "Thread %X could not acquire Mutex [%X]",
- (u32) this_thread_id, mutex_id));
+ (u32)(long) this_thread_id, mutex_id));
 }
 
 return (status);
Index: linux/drivers/block/paride/pseudo.h
===================================================================
--- linux.orig/drivers/block/paride/pseudo.h
+++ linux/drivers/block/paride/pseudo.h
@@ -43,7 +43,7 @@ static unsigned long ps_timeout;
 static int ps_tq_active = 0;
 static int ps_nice = 0;
 
-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock);
 
 static DECLARE_WORK(ps_tq, ps_tq_int, NULL);
Index: linux/drivers/char/Kconfig
===================================================================
--- linux.orig/drivers/char/Kconfig
+++ linux/drivers/char/Kconfig
@@ -741,6 +741,46 @@ config RTC
 To compile this driver as a module, choose M here: the
 module will be called rtc.
 
+config RTC_HISTOGRAM
+ bool "Real Time Clock Histogram Support"
+ default n
+ depends on RTC
+ ---help---
+ If you say Y here then the kernel will track the delivery and
+ wakeup latency of /dev/rtc using tasks and will report a
+ histogram to the kernel log when the application closes /dev/rtc.
+
+config BLOCKER
+ tristate "Priority Inheritance Debugging (Blocker) Device Support"
+ depends on X86
+ default y
+ ---help---
+ If you say Y here then a device will be created that the userspace
+ pi_test suite uses to test and measure kernel locking primitives.
+
+config LPPTEST
+ tristate "Parallel Port Based Latency Measurement Device"
+ depends on !PARPORT && X86
+ default y
+ ---help---
+ If you say Y here then a device will be created that the userspace
+ testlpp utility uses to measure IRQ latencies of a target system
+ from an independent measurement system.
+
+ NOTE: this code assumes x86 PCs and that the parallel port is
+ bidirectional and is on IRQ 7.
+
+ To use the device, both the target and the source system need
+ to run a kernel with CONFIG_LPPTEST enabled.
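
On the source system, one measurement boils down to a single ioctl per sample. A minimal caller could look like the sketch below; it is illustrative only, assumes a device node created by hand with "mknod /dev/lpptest c 245 0", and reuses the ioctl encoding from drivers/char/lpptest.c later in this patch:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>

    #define LPPTEST_CHAR_MAJOR 245
    #define LPPTEST_TEST _IOR(LPPTEST_CHAR_MAJOR, 1, unsigned long long)

    int main(void)
    {
        unsigned long long cycles;
        int fd = open("/dev/lpptest", O_RDWR);

        if (fd < 0) {
            perror("/dev/lpptest");
            return 1;
        }
        /* one round trip: pulse the data lines, wait for the target's
         * IRQ handler to flip them back, read the elapsed cycles */
        if (ioctl(fd, LPPTEST_TEST, &cycles) == 0)
            printf("response: %llu cycles\n", cycles);
        close(fd);
        return 0;
    }
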
To measure latencies, + use the scripts/testlpp utility in your kernel source directory, + and run it (as root) on the source system - it will start printing + out the latencies it took to get a response from the target system: + + Latency of response: 12.2 usecs (121265 cycles) + + then generate various workloads on the target system to see how + (worst-case-) latencies are impacted. + config SGI_DS1286 tristate "SGI DS1286 RTC support" depends on SGI_IP22 Index: linux/drivers/char/Makefile =================================================================== --- linux.orig/drivers/char/Makefile +++ linux/drivers/char/Makefile @@ -89,6 +89,9 @@ obj-$(CONFIG_GPIO_VR41XX) += vr41xx_giu. obj-$(CONFIG_TANBAC_TB0219) += tb0219.o obj-$(CONFIG_TELCLOCK) += tlclk.o +obj-$(CONFIG_BLOCKER) += blocker.o +obj-$(CONFIG_LPPTEST) += lpptest.o + obj-$(CONFIG_WATCHDOG) += watchdog/ obj-$(CONFIG_MWAVE) += mwave/ obj-$(CONFIG_AGP) += agp/ Index: linux/drivers/char/blocker.c =================================================================== --- /dev/null +++ linux/drivers/char/blocker.c @@ -0,0 +1,107 @@ +/* + * priority inheritance testing device + */ + +#include +#include + +#define BLOCKER_MINOR 221 + +#define BLOCK_IOCTL 4245 +#define BLOCK_SET_DEPTH 4246 + +#define BLOCKER_MAX_LOCK_DEPTH 10 + +void loop(int loops) +{ + int i; + + for (i = 0; i < loops; i++) + get_cycles(); +} + +static spinlock_t blocker_lock[BLOCKER_MAX_LOCK_DEPTH]; + +static unsigned int lock_depth = 1; + +void do_the_lock_and_loop(unsigned int args) +{ + int i, max; + + if (rt_task(current)) + max = lock_depth; + else if (lock_depth > 1) + max = (current->pid % lock_depth) + 1; + else + max = 1; + + /* Always lock from the top down */ + for (i = max-1; i >= 0; i--) + spin_lock(&blocker_lock[i]); + loop(args); + for (i = 0; i < max; i++) + spin_unlock(&blocker_lock[i]); +} + +static int blocker_open(struct inode *in, struct file *file) +{ + printk(KERN_INFO "blocker_open called\n"); + + return 0; +} + +static long blocker_ioctl(struct file *file, + unsigned int cmd, unsigned long args) +{ + switch(cmd) { + case BLOCK_IOCTL: + do_the_lock_and_loop(args); + return 0; + case BLOCK_SET_DEPTH: + if (args >= BLOCKER_MAX_LOCK_DEPTH) + return -EINVAL; + lock_depth = args; + return 0; + default: + return -EINVAL; + } +} + +static struct file_operations blocker_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .unlocked_ioctl = blocker_ioctl, + .open = blocker_open, +}; + +static struct miscdevice blocker_dev = +{ + BLOCKER_MINOR, + "blocker", + &blocker_fops +}; + +static int __init blocker_init(void) +{ + int i; + + if (misc_register(&blocker_dev)) + return -ENODEV; + + for (i = 0; i < BLOCKER_MAX_LOCK_DEPTH; i++) + spin_lock_init(blocker_lock + i); + + return 0; +} + +void __exit blocker_exit(void) +{ + printk(KERN_INFO "blocker device uninstalled\n"); + misc_deregister(&blocker_dev); +} + +module_init(blocker_init); +module_exit(blocker_exit); + +MODULE_LICENSE("GPL"); + Index: linux/drivers/char/hangcheck-timer.c =================================================================== --- linux.orig/drivers/char/hangcheck-timer.c +++ linux/drivers/char/hangcheck-timer.c @@ -117,7 +117,7 @@ __setup("hcheck_reboot", hangcheck_parse __setup("hcheck_dump_tasks", hangcheck_parse_dump_tasks); #endif /* not MODULE */ -#if defined(CONFIG_X86_64) || defined(CONFIG_S390) +#ifdef CONFIG_S390 # define HAVE_MONOTONIC # define TIMER_FREQ 1000000000ULL #elif defined(CONFIG_IA64) Index: linux/drivers/char/hpet.c 
===================================================================
--- linux.orig/drivers/char/hpet.c
+++ linux/drivers/char/hpet.c
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -50,8 +51,34 @@
 #define HPET_RANGE_SIZE 1024 /* from HPET spec */
 
+#if BITS_PER_LONG == 64
+#define write_counter(V, MC) writeq(V, MC)
+#define read_counter(MC) readq(MC)
+#else
+#define write_counter(V, MC) writel(V, MC)
+#define read_counter(MC) readl(MC)
+#endif
+
 static u32 hpet_nhpet, hpet_max_freq = HPET_USER_FREQ;
 
+static void __iomem *hpet_mc_ptr;
+
+static cycle_t read_hpet(void)
+{
+ return (cycle_t)read_counter((void __iomem *)hpet_mc_ptr);
+}
+
+static struct clocksource clocksource_hpet = {
+ .name = "hpet",
+ .rating = 300,
+ .read = read_hpet,
+ .mask = 0xffffffffffffffffLL,
+ .mult = 0, /* to be calculated */
+ .shift = 10,
+ .is_continuous = 1,
+};
+static struct clocksource *hpet_clocksource_p;
+
 /* A lock for concurrent access by app and isr hpet activity. */
 static DEFINE_SPINLOCK(hpet_lock);
 /* A lock for concurrent intermodule access to hpet and isr hpet activity. */
@@ -78,7 +105,7 @@ struct hpets {
 struct hpets *hp_next;
 struct hpet __iomem *hp_hpet;
 unsigned long hp_hpet_phys;
- struct time_interpolator *hp_interpolator;
+ struct clocksource *hp_clocksource;
 unsigned long long hp_tick_freq;
 unsigned long hp_delta;
 unsigned int hp_ntimer;
@@ -93,13 +120,6 @@ static struct hpets *hpets;
 #define HPET_PERIODIC 0x0004
 #define HPET_SHARED_IRQ 0x0008
 
-#if BITS_PER_LONG == 64
-#define write_counter(V, MC) writeq(V, MC)
-#define read_counter(MC) readq(MC)
-#else
-#define write_counter(V, MC) writel(V, MC)
-#define read_counter(MC) readl(MC)
-#endif
 
 #ifndef readq
 static inline unsigned long long readq(void __iomem *addr)
@@ -736,27 +756,6 @@ static ctl_table dev_root[] = {
 
 static struct ctl_table_header *sysctl_header;
 
-static void hpet_register_interpolator(struct hpets *hpetp)
-{
-#ifdef CONFIG_TIME_INTERPOLATION
- struct time_interpolator *ti;
-
- ti = kzalloc(sizeof(*ti), GFP_KERNEL);
- if (!ti)
- return;
-
- ti->source = TIME_SOURCE_MMIO64;
- ti->shift = 10;
- ti->addr = &hpetp->hp_hpet->hpet_mc;
- ti->frequency = hpetp->hp_tick_freq;
- ti->drift = HPET_DRIFT;
- ti->mask = -1;
-
- hpetp->hp_interpolator = ti;
- register_time_interpolator(ti);
-#endif
-}
-
 /*
 * Adjustment for when arming the timer with
 * initial conditions.
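
For the clocksource registered here, clocksource_hz2mult() fills in mult so that ns = (cycles * mult) >> shift; with the shift of 10 used by clocksource_hpet above it is effectively (NSEC_PER_SEC << 10) / hz. Worked through for the canonical 14.31818 MHz HPET (an illustrative calculation, not patch code):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t hz = 14318180;      /* typical HPET tick rate */
        unsigned int shift = 10;     /* as in clocksource_hpet */
        uint64_t mult = (1000000000ULL << shift) / hz;
        uint64_t cycles = 14318;     /* roughly 1 ms of HPET ticks */

        /* the core clocksource conversion: ns = (cycles * mult) >> shift */
        printf("mult=%llu, %llu cycles -> %llu ns\n",
               (unsigned long long)mult, (unsigned long long)cycles,
               (unsigned long long)((cycles * mult) >> shift));
        return 0;
    }
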
That is, main counter @@ -908,7 +907,16 @@ int hpet_alloc(struct hpet_data *hdp) } hpetp->hp_delta = hpet_calibrate(hpetp); - hpet_register_interpolator(hpetp); + + if (!hpet_clocksource_p) { +#ifdef CONFIG_IA64 + clocksource_hpet.fsys_mmio_ptr = hpet_mc_ptr = &hpetp->hp_hpet->hpet_mc; +#endif + clocksource_hpet.mult = clocksource_hz2mult(hpetp->hp_tick_freq, + clocksource_hpet.shift); + clocksource_register(&clocksource_hpet); + hpet_clocksource_p = hpetp->hp_clocksource = &clocksource_hpet; + } return 0; } @@ -994,7 +1002,7 @@ static int hpet_acpi_add(struct acpi_dev static int hpet_acpi_remove(struct acpi_device *device, int type) { - /* XXX need to unregister interpolator, dealloc mem, etc */ + /* XXX need to unregister clocksource, dealloc mem, etc */ return -EINVAL; } Index: linux/drivers/char/lpptest.c =================================================================== --- /dev/null +++ linux/drivers/char/lpptest.c @@ -0,0 +1,179 @@ +/* + * /dev/lpptest device: test IRQ handling latencies over parallel port + * + * Copyright (C) 2005 Thomas Gleixner, Ingo Molnar + * + * licensed under the GPL + * + * You need to have CONFIG_PARPORT disabled for this device, it is a + * completely self-contained device that assumes sole ownership of the + * parallel port. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * API wrappers so that the code can be shared with the -rt tree: + */ +#ifndef local_irq_disable +# define local_irq_disable local_irq_disable +# define local_irq_enable local_irq_enable +#endif + +#ifndef IRQ_NODELAY +# define IRQ_NODELAY 0 +# define IRQF_NODELAY 0 +#endif + +/* + * Driver: + */ +#define LPPTEST_CHAR_MAJOR 245 +#define LPPTEST_DEVICE_NAME "lpptest" + +#define LPPTEST_IRQ 7 + +#define LPPTEST_TEST _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long) +#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long) +#define LPPTEST_ENABLE _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long) + +static char dev_id[] = "lpptest"; + +#define INIT_PORT() outb(0x04, 0x37a) +#define ENABLE_IRQ() outb(0x10, 0x37a) +#define DISABLE_IRQ() outb(0, 0x37a) + +static unsigned char out = 0x5a; + +/** + * Interrupt handler. Flip a bit in the reply. 
+ */ +static int lpptest_irq (int irq, void *dev_id, struct pt_regs *regs) +{ + out ^= 0xff; + outb(out, 0x378); + + return IRQ_HANDLED; +} + +static cycles_t test_response(void) +{ + cycles_t now, end; + unsigned char in; + int timeout = 0; + + local_irq_disable(); + in = inb(0x379); + inb(0x378); + outb(0x08, 0x378); + now = get_cycles(); + while(1) { + if (inb(0x379) != in) + break; + if (timeout++ > 1000000) { + outb(0x00, 0x378); + local_irq_enable(); + + return 0; + } + } + end = get_cycles(); + outb(0x00, 0x378); + local_irq_enable(); + + return end - now; +} + +static int lpptest_open(struct inode *inode, struct file *file) +{ + return 0; +} + +static int lpptest_close(struct inode *inode, struct file *file) +{ + return 0; +} + +int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param) +{ + int retval = 0; + + switch (ioctl_num) { + + case LPPTEST_DISABLE: + DISABLE_IRQ(); + break; + + case LPPTEST_ENABLE: + ENABLE_IRQ(); + break; + + case LPPTEST_TEST: { + + cycles_t diff = test_response(); + if (copy_to_user((void *)ioctl_param, (void*) &diff, sizeof(diff))) + goto errcpy; + break; + } + default: retval = -EINVAL; + } + + return retval; + + errcpy: + return -EFAULT; +} + +static struct file_operations lpptest_dev_fops = { + .ioctl = lpptest_ioctl, + .open = lpptest_open, + .release = lpptest_close, +}; + +static int __init lpptest_init (void) +{ + if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops)) + { + printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n", + LPPTEST_CHAR_MAJOR); + return -EAGAIN; + } + + if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) { + printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); + return -EAGAIN; + } + irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY; + irq_desc[LPPTEST_IRQ].action->flags |= IRQF_NODELAY | IRQF_DISABLED; + + INIT_PORT(); + ENABLE_IRQ(); + + return 0; +} +module_init (lpptest_init); + +static void __exit lpptest_exit (void) +{ + DISABLE_IRQ(); + + free_irq(LPPTEST_IRQ, dev_id); + unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME); +} +module_exit (lpptest_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("lpp test module"); + Index: linux/drivers/char/random.c =================================================================== --- linux.orig/drivers/char/random.c +++ linux/drivers/char/random.c @@ -580,8 +580,11 @@ static void add_timer_randomness(struct preempt_disable(); /* if over the trickle threshold, use only 1 in 4096 samples */ if (input_pool.entropy_count > trickle_thresh && - (__get_cpu_var(trickle_count)++ & 0xfff)) - goto out; + (__get_cpu_var(trickle_count)++ & 0xfff)) { + preempt_enable(); + return; + } + preempt_enable(); sample.jiffies = jiffies; sample.cycles = get_cycles(); @@ -626,9 +629,6 @@ static void add_timer_randomness(struct if(input_pool.entropy_count >= random_read_wakeup_thresh) wake_up_interruptible(&random_read_wait); - -out: - preempt_enable(); } void add_input_randomness(unsigned int type, unsigned int code, Index: linux/drivers/char/rtc.c =================================================================== --- linux.orig/drivers/char/rtc.c +++ linux/drivers/char/rtc.c @@ -82,10 +82,36 @@ #include #include +#ifdef CONFIG_MIPS +# include +#endif + #if defined(__i386__) #include #endif +#ifdef CONFIG_RTC_HISTOGRAM + +static cycles_t last_interrupt_time; + +#include + +#define CPU_MHZ (cpu_khz / 
1000) + +#define HISTSIZE 10000 +static int histogram[HISTSIZE]; + +static int rtc_state; + +enum rtc_states { + S_STARTUP, /* First round - let the application start */ + S_IDLE, /* Waiting for an interrupt */ + S_WAITING_FOR_READ, /* Signal delivered. waiting for rtc_read() */ + S_READ_MISSED, /* Signal delivered, read() deadline missed */ +}; + +#endif + #ifdef __sparc__ #include #include @@ -217,7 +243,146 @@ static inline unsigned char rtc_is_updat return uip; } +#ifndef RTC_IRQ +# undef CONFIG_RTC_HISTOGRAM +#endif + +static inline void rtc_open_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i; + + last_interrupt_time = 0; + rtc_state = S_STARTUP; + rtc_irq_data = 0; + + for (i = 0; i < HISTSIZE; i++) + histogram[i] = 0; +#endif +} + +static inline void rtc_wake_event(void) +{ +#ifndef CONFIG_RTC_HISTOGRAM + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); +#else + if (!(rtc_status & RTC_IS_OPEN)) + return; + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + break; + /* Waiting for an interrupt */ + case S_IDLE: + kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); + last_interrupt_time = get_cycles(); + rtc_state = S_WAITING_FOR_READ; + break; + + /* Signal has been delivered. waiting for rtc_read() */ + case S_WAITING_FOR_READ: + /* + * Well foo. The usermode application didn't + * schedule and read in time. + */ + last_interrupt_time = get_cycles(); + rtc_state = S_READ_MISSED; + printk("Read missed before next interrupt\n"); + break; + /* Signal has been delivered, read() deadline was missed */ + case S_READ_MISSED: + /* + * Not much we can do here. We're waiting for the usermode + * application to read the rtc + */ + last_interrupt_time = get_cycles(); + break; + } +#endif +} + +static inline void rtc_read_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + cycles_t now = get_cycles(); + + switch (rtc_state) { + /* Startup */ + case S_STARTUP: + rtc_state = S_IDLE; + break; + + /* Waiting for an interrupt */ + case S_IDLE: + printk("bug in rtc_read(): called in state S_IDLE!\n"); + break; + case S_WAITING_FOR_READ: /* + * Signal has been delivered. + * waiting for rtc_read() + */ + /* + * Well done + */ + case S_READ_MISSED: /* + * Signal has been delivered, read() + * deadline was missed + */ + /* + * So, you finally got here. + */ + if (!last_interrupt_time) + printk("bug in rtc_read(): last_interrupt_time = 0\n"); + rtc_state = S_IDLE; + { + cycles_t latency = now - last_interrupt_time; + unsigned long delta; /* Microseconds */ + + delta = latency; + delta /= CPU_MHZ; + + if (delta > 1000 * 1000) { + printk("rtc: eek\n"); + } else { + unsigned long slot = delta; + if (slot >= HISTSIZE) + slot = HISTSIZE - 1; + histogram[slot]++; + if (delta > 2000) + printk("wow! That was a " + "%ld millisec bump\n", + delta / 1000); + } + } + rtc_state = S_IDLE; + break; + } +#endif +} + +static inline void rtc_close_event(void) +{ +#ifdef CONFIG_RTC_HISTOGRAM + int i = 0; + unsigned long total = 0; + + for (i = 0; i < HISTSIZE; i++) + total += histogram[i]; + if (!total) + return; + + printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n", + current->comm, current->pid, total); + for (i = 0; i < HISTSIZE; i++) { + if (histogram[i]) + printk("%d %d\n", i, histogram[i]); + } +#endif +} + #ifdef RTC_IRQ + /* * A very tiny interrupt handler. 
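
Stripped of the state machine, the accounting rtc_read_event() above performs per sample is simple: divide the cycle delta by the CPU clock in MHz to get microseconds, then clamp into a fixed-size, one-bucket-per-microsecond histogram. As an isolated sketch (illustrative only):

    #include <stdint.h>

    #define HISTSIZE 10000

    /* cycles:  get_cycles() delta between interrupt and read();
     * cpu_mhz: cpu_khz / 1000, as in the CPU_MHZ macro above. */
    static void account_sample(int histogram[HISTSIZE],
                               uint64_t cycles, unsigned int cpu_mhz)
    {
        uint64_t usec = cycles / cpu_mhz;
        unsigned int slot = usec < HISTSIZE ? (unsigned int)usec
                                            : HISTSIZE - 1;

        histogram[slot]++;
    }
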
It runs with IRQF_DISABLED set, * but there is possibility of conflicting with the set_rtc_mmss() @@ -261,7 +426,7 @@ irqreturn_t rtc_interrupt(int irq, void if (rtc_callback) rtc_callback->func(rtc_callback->private_data); spin_unlock(&rtc_task_lock); - wake_up_interruptible(&rtc_wait); + wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); @@ -375,6 +540,8 @@ static ssize_t rtc_read(struct file *fil schedule(); } while (1); + rtc_read_event(); + if (count == sizeof(unsigned int)) retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); else @@ -607,6 +774,11 @@ static int rtc_do_ioctl(unsigned int cmd save_freq_select = CMOS_READ(RTC_FREQ_SELECT); CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + /* + * Make CMOS date writes nonpreemptible even on PREEMPT_RT. + * There's a limit to everything! =B-) + */ + preempt_disable(); #ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR); #endif @@ -616,6 +788,7 @@ static int rtc_do_ioctl(unsigned int cmd CMOS_WRITE(hrs, RTC_HOURS); CMOS_WRITE(min, RTC_MINUTES); CMOS_WRITE(sec, RTC_SECONDS); + preempt_enable(); CMOS_WRITE(save_control, RTC_CONTROL); CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); @@ -714,6 +887,7 @@ static int rtc_open(struct inode *inode, if(rtc_status & RTC_IS_OPEN) goto out_busy; + rtc_open_event(); rtc_status |= RTC_IS_OPEN; rtc_irq_data = 0; @@ -769,6 +943,7 @@ no_irq: rtc_irq_data = 0; rtc_status &= ~RTC_IS_OPEN; spin_unlock_irq (&rtc_lock); + rtc_close_event(); return 0; } @@ -1152,6 +1327,7 @@ static void rtc_dropped_irq(unsigned lon printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq); /* Now we have new data */ + rtc_wake_event(); wake_up_interruptible(&rtc_wait); kill_fasync (&rtc_async_queue, SIGIO, POLL_IN); Index: linux/drivers/char/sysrq.c =================================================================== --- linux.orig/drivers/char/sysrq.c +++ linux/drivers/char/sysrq.c @@ -176,6 +176,23 @@ static struct sysrq_key_op sysrq_showreg .enable_mask = SYSRQ_ENABLE_DUMP, }; +#if defined(__i386__) + +static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs, + struct tty_struct *tty) +{ + nmi_show_all_regs(); +} + +static struct sysrq_key_op sysrq_showallregs_op = { + .handler = sysrq_handle_showallregs, + .help_msg = "showalLcpupc", + .action_msg = "Show Regs On All CPUs", +}; +#else +#define sysrq_showallregs_op (*(struct sysrq_key_op *)0) +#endif + static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs, struct tty_struct *tty) { @@ -301,7 +318,7 @@ static struct sysrq_key_op *sysrq_key_ta &sysrq_kill_op, /* i */ NULL, /* j */ &sysrq_SAK_op, /* k */ - NULL, /* l */ + &sysrq_showallregs_op, /* l */ &sysrq_showmem_op, /* m */ &sysrq_unrt_op, /* n */ /* This will often be registered as 'Off' at init time */ Index: linux/drivers/char/tty_io.c =================================================================== --- linux.orig/drivers/char/tty_io.c +++ linux/drivers/char/tty_io.c @@ -254,6 +254,7 @@ static int check_tty_count(struct tty_st printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) " "!= #fd's(%d) in %s\n", tty->name, tty->count, count, routine); + dump_stack(); return count; } #endif Index: linux/drivers/ide/ide-floppy.c =================================================================== --- linux.orig/drivers/ide/ide-floppy.c +++ linux/drivers/ide/ide-floppy.c @@ -1666,9 +1666,9 @@ static int idefloppy_get_format_progress atapi_status_t status; unsigned long flags; - local_irq_save(flags); + 
local_irq_save_nort(flags); status.all = HWIF(drive)->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); progress_indication = !status.b.dsc ? 0 : 0x10000; } Index: linux/drivers/ide/ide-io.c =================================================================== --- linux.orig/drivers/ide/ide-io.c +++ linux/drivers/ide/ide-io.c @@ -1173,7 +1173,7 @@ static void ide_do_request (ide_hwgroup_ ide_get_lock(ide_intr, hwgroup); /* caller must own ide_lock */ - BUG_ON(!irqs_disabled()); + BUG_ON_NONRT(!irqs_disabled()); while (!hwgroup->busy) { hwgroup->busy = 1; @@ -1434,7 +1434,7 @@ void ide_timer_expiry (unsigned long dat #endif /* DISABLE_IRQ_NOSYNC */ /* local CPU only, * as if we were handling an interrupt */ - local_irq_disable(); + local_irq_disable_nort(); if (hwgroup->polling) { startstop = handler(drive); } else if (drive_is_ready(drive)) { Index: linux/drivers/ide/ide-iops.c =================================================================== --- linux.orig/drivers/ide/ide-iops.c +++ linux/drivers/ide/ide-iops.c @@ -244,10 +244,10 @@ static void ata_input_data(ide_drive_t * if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->INSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { @@ -266,10 +266,10 @@ static void ata_output_data(ide_drive_t if (io_32bit) { if (io_32bit & 2) { unsigned long flags; - local_irq_save(flags); + local_irq_save_nort(flags); ata_vlb_sync(drive, IDE_NSECTOR_REG); hwif->OUTSL(IDE_DATA_REG, buffer, wcount); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { @@ -564,12 +564,12 @@ int ide_wait_stat (ide_startstop_t *star if (!(stat & BUSY_STAT)) break; - local_irq_restore(flags); + local_irq_restore_nort(flags); *startstop = ide_error(drive, "status timeout", stat); return 1; } } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* * Allow status to settle, then read it again. 
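
The ide_driveid_update() hunk just below shows a small pattern worth calling out: the GFP_ATOMIC allocation is hoisted in front of the IRQs-off region, so the failure path no longer has to unwind any IRQ state. In outline (a stand-alone sketch with stand-in helpers in place of local_irq_save_nort()/local_irq_restore_nort(), not the driver code):

    #include <stdlib.h>

    static void irq_save(unsigned long *flags)   { *flags = 0; /* stand-in */ }
    static void irq_restore(unsigned long flags) { (void)flags; /* stand-in */ }

    static void *read_identify_page(void (*pio_read)(void *))
    {
        unsigned long flags;
        void *id = malloc(512);   /* may fail: do it before irq_save() */

        if (!id)
            return NULL;          /* nothing to undo on this path */
        irq_save(&flags);
        pio_read(id);             /* the critical section proper */
        irq_restore(flags);
        return id;                /* caller frees */
    }
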
@@ -731,17 +731,15 @@ int ide_driveid_update (ide_drive_t *dri printk("%s: CHECK for good STATUS\n", drive->name); return 0; } - local_irq_save(flags); - SELECT_MASK(drive, 0); id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC); - if (!id) { - local_irq_restore(flags); + if (!id) return 0; - } + local_irq_save_nort(flags); + SELECT_MASK(drive, 0); ata_input_data(drive, id, SECTOR_WORDS); (void) hwif->INB(IDE_STATUS_REG); /* clear drive IRQ */ - local_irq_enable(); - local_irq_restore(flags); + local_irq_enable_nort(); + local_irq_restore_nort(flags); ide_fix_driveid(id); if (id) { drive->id->dma_ultra = id->dma_ultra; @@ -821,7 +819,7 @@ int ide_config_drive_speed (ide_drive_t if (time_after(jiffies, timeout)) break; } - local_irq_restore(flags); + local_irq_restore_nort(flags); } /* Index: linux/drivers/ide/ide-lib.c =================================================================== --- linux.orig/drivers/ide/ide-lib.c +++ linux/drivers/ide/ide-lib.c @@ -445,15 +445,16 @@ EXPORT_SYMBOL_GPL(ide_set_xfer_rate); static void ide_dump_opcode(ide_drive_t *drive) { + unsigned long flags; struct request *rq; u8 opcode = 0; int found = 0; - spin_lock(&ide_lock); + spin_lock_irqsave(&ide_lock, flags); rq = NULL; if (HWGROUP(drive)) rq = HWGROUP(drive)->rq; - spin_unlock(&ide_lock); + spin_unlock_irqrestore(&ide_lock, flags); if (!rq) return; if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) { @@ -481,10 +482,8 @@ static void ide_dump_opcode(ide_drive_t static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat) { ide_hwif_t *hwif = HWIF(drive); - unsigned long flags; u8 err = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (stat & BUSY_STAT) printk("Busy "); @@ -544,7 +543,7 @@ static u8 ide_dump_ata_status(ide_drive_ printk("\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return err; } @@ -559,14 +558,11 @@ static u8 ide_dump_ata_status(ide_drive_ static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat) { - unsigned long flags; - atapi_status_t status; atapi_error_t error; status.all = stat; error.all = 0; - local_irq_save(flags); printk("%s: %s: status=0x%02x { ", drive->name, msg, stat); if (status.b.bsy) printk("Busy "); @@ -592,7 +588,7 @@ static u8 ide_dump_atapi_status(ide_driv printk("}\n"); } ide_dump_opcode(drive); - local_irq_restore(flags); + return error.all; } Index: linux/drivers/ide/ide-probe.c =================================================================== --- linux.orig/drivers/ide/ide-probe.c +++ linux/drivers/ide/ide-probe.c @@ -143,7 +143,7 @@ static inline void do_identify (ide_driv hwif->ata_input_data(drive, id, SECTOR_WORDS); drive->id_read = 1; - local_irq_enable(); + local_irq_enable_nort(); ide_fix_driveid(id); #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA) @@ -325,14 +325,14 @@ static int actual_try_to_identify (ide_d unsigned long flags; /* local CPU only; some systems need this */ - local_irq_save(flags); + local_irq_save_nort(flags); /* drive returned ID */ do_identify(drive, cmd); /* drive responded with ID */ rc = 0; /* clear drive IRQ */ (void) hwif->INB(IDE_STATUS_REG); - local_irq_restore(flags); + local_irq_restore_nort(flags); } else { /* drive refused ID */ rc = 2; @@ -804,7 +804,7 @@ static void probe_hwif(ide_hwif_t *hwif) } while ((stat & BUSY_STAT) && time_after(timeout, jiffies)); } - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * Use cached IRQ number. It might be (and is...) 
changed by probe * code above Index: linux/drivers/ide/ide-taskfile.c =================================================================== --- linux.orig/drivers/ide/ide-taskfile.c +++ linux/drivers/ide/ide-taskfile.c @@ -274,7 +274,7 @@ static void ide_pio_sector(ide_drive_t * offset %= PAGE_SIZE; #ifdef CONFIG_HIGHMEM - local_irq_save(flags); + local_irq_save_nort(flags); #endif buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset; @@ -294,7 +294,7 @@ static void ide_pio_sector(ide_drive_t * kunmap_atomic(buf, KM_BIO_SRC_IRQ); #ifdef CONFIG_HIGHMEM - local_irq_restore(flags); + local_irq_restore_nort(flags); #endif } @@ -460,7 +460,7 @@ ide_startstop_t pre_task_out_intr (ide_d } if (!drive->unmask) - local_irq_disable(); + local_irq_disable_nort(); ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL); ide_pio_datablock(drive, rq, 1); Index: linux/drivers/ide/pci/alim15x3.c =================================================================== --- linux.orig/drivers/ide/pci/alim15x3.c +++ linux/drivers/ide/pci/alim15x3.c @@ -322,7 +322,7 @@ static void ali15x3_tune_drive (ide_driv if (r_clc >= 16) r_clc = 0; } - local_irq_save(flags); + local_irq_save_nort(flags); /* * PIO mode => ATA FIFO on, ATAPI FIFO off @@ -344,7 +344,7 @@ static void ali15x3_tune_drive (ide_driv pci_write_config_byte(dev, port, s_clc); pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc); - local_irq_restore(flags); + local_irq_restore_nort(flags); /* * setup active rec @@ -600,7 +600,7 @@ static unsigned int __devinit init_chips } #endif /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */ - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision < 0xC2) { /* @@ -613,7 +613,7 @@ static unsigned int __devinit init_chips * clear bit 7 */ pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F); - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -638,7 +638,7 @@ static unsigned int __devinit init_chips * 0:0.0 so if we didn't find one we know what is cooking. */ if (north && north->vendor != PCI_VENDOR_ID_AL) { - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -661,7 +661,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02); } } - local_irq_restore(flags); + local_irq_restore_nort(flags); return 0; } @@ -685,7 +685,7 @@ static unsigned int __devinit ata66_ali1 unsigned long flags; u8 tmpbyte; - local_irq_save(flags); + local_irq_save_nort(flags); if (m5229_revision >= 0xC2) { /* @@ -737,7 +737,7 @@ static unsigned int __devinit ata66_ali1 pci_write_config_byte(dev, 0x53, tmpbyte); - local_irq_restore(flags); + local_irq_restore_nort(flags); return(ata66); } Index: linux/drivers/ide/pci/cs5530.c =================================================================== --- linux.orig/drivers/ide/pci/cs5530.c +++ linux/drivers/ide/pci/cs5530.c @@ -241,8 +241,8 @@ static unsigned int __devinit init_chips return 0; } - spin_lock_irqsave(&ide_lock, flags); - /* all CPUs (there should only be one CPU with this chipset) */ + /* Local CPU. ide_lock is acquired in do_ide_setup_pci_device. 
*/ + local_irq_save(flags); /* * Enable BusMaster and MemoryWriteAndInvalidate for the cs5530: @@ -294,7 +294,7 @@ static unsigned int __devinit init_chips pci_write_config_byte(master_0, 0x42, 0x00); pci_write_config_byte(master_0, 0x43, 0xc1); - spin_unlock_irqrestore(&ide_lock, flags); + local_irq_restore(flags); return 0; } Index: linux/drivers/ide/pci/hpt366.c =================================================================== --- linux.orig/drivers/ide/pci/hpt366.c +++ linux/drivers/ide/pci/hpt366.c @@ -1496,7 +1496,7 @@ static void __devinit init_dma_hpt366(id dma_old = hwif->INB(dmabase+2); - local_irq_save(flags); + local_irq_save_nort(flags); dma_new = dma_old; pci_read_config_byte(hwif->pci_dev, primary, &masterdma); @@ -1507,7 +1507,7 @@ static void __devinit init_dma_hpt366(id if (dma_new != dma_old) hwif->OUTB(dma_new, dmabase+2); - local_irq_restore(flags); + local_irq_restore_nort(flags); ide_setup_dma(hwif, dmabase, 8); } Index: linux/drivers/ieee1394/ieee1394_types.h =================================================================== --- linux.orig/drivers/ieee1394/ieee1394_types.h +++ linux/drivers/ieee1394/ieee1394_types.h @@ -19,7 +19,7 @@ struct hpsb_tlabel_pool { spinlock_t lock; u8 next; u32 allocations; - struct semaphore count; + struct compat_semaphore count; }; #define HPSB_TPOOL_INIT(_tp) \ Index: linux/drivers/ieee1394/nodemgr.c =================================================================== --- linux.orig/drivers/ieee1394/nodemgr.c +++ linux/drivers/ieee1394/nodemgr.c @@ -167,7 +167,7 @@ struct host_info { struct hpsb_host *host; struct list_head list; struct completion exited; - struct semaphore reset_sem; + struct compat_semaphore reset_sem; int pid; char daemon_name[15]; int kill_me; Index: linux/drivers/ieee1394/raw1394-private.h =================================================================== --- linux.orig/drivers/ieee1394/raw1394-private.h +++ linux/drivers/ieee1394/raw1394-private.h @@ -29,7 +29,7 @@ struct file_info { struct list_head req_pending; struct list_head req_complete; - struct semaphore complete_sem; + struct compat_semaphore complete_sem; spinlock_t reqlists_lock; wait_queue_head_t poll_wait_complete; Index: linux/drivers/input/gameport/gameport.c =================================================================== --- linux.orig/drivers/input/gameport/gameport.c +++ linux/drivers/input/gameport/gameport.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* HZ */ #include @@ -101,12 +102,12 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); GET_TIME(t1); for (t = 0; t < 50; t++) gameport_read(gameport); GET_TIME(t2); GET_TIME(t3); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t; } @@ -125,11 +126,11 @@ static int gameport_measure_speed(struct tx = 1 << 30; for(i = 0; i < 50; i++) { - local_irq_save(flags); + local_irq_save_nort(flags); rdtscl(t1); for (t = 0; t < 50; t++) gameport_read(gameport); rdtscl(t2); - local_irq_restore(flags); + local_irq_restore_nort(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; } Index: linux/drivers/input/serio/i8042.c =================================================================== --- linux.orig/drivers/input/serio/i8042.c +++ linux/drivers/input/serio/i8042.c @@ -1084,7 +1084,7 @@ static int __devinit i8042_probe(struct goto err_controller_cleanup; } - mod_timer(&i8042_timer, jiffies + I8042_POLL_PERIOD); + 
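
The struct semaphore -> struct compat_semaphore conversions in the ieee1394 hunks above (and in many driver hunks below) follow one rule: in the -rt tree a plain semaphore is treated as mutex-like and becomes a priority-inheriting sleeping lock, so any semaphore actually used as a completion or event flag -- down() in one task, up() from another task or from interrupt context -- must keep classic counting-semaphore behaviour, which compat_semaphore preserves. A sketch under those assumptions (hypothetical device, 2.6.18 handler signature; initialisation elided):

#include <linux/interrupt.h>
#include <asm/semaphore.h>

struct demo_dev {
	struct compat_semaphore dead_sem;	/* event, not a mutex */
};

static irqreturn_t demo_irq(int irq, void *dev_id, struct pt_regs *regs)
{
	struct demo_dev *dev = dev_id;

	up(&dev->dead_sem);	/* fine from hardirq: no owner semantics */
	return IRQ_HANDLED;
}

static void demo_wait_for_death(struct demo_dev *dev)
{
	down(&dev->dead_sem);	/* blocks until the event fires */
}
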
mod_timer(&i8042_timer, jiffies + 2); //I8042_POLL_PERIOD); return 0; err_unregister_ports: Index: linux/drivers/input/serio/i8042.h =================================================================== --- linux.orig/drivers/input/serio/i8042.h +++ linux/drivers/input/serio/i8042.h @@ -43,7 +43,7 @@ * polling. */ -#define I8042_POLL_PERIOD HZ/20 +#define I8042_POLL_PERIOD (10*HZ) /* * Status register bits. Index: linux/drivers/macintosh/adb.c =================================================================== --- linux.orig/drivers/macintosh/adb.c +++ linux/drivers/macintosh/adb.c @@ -256,6 +256,8 @@ adb_probe_task(void *x) sigprocmask(SIG_BLOCK, &blocked, NULL); flush_signals(current); + down(&adb_probe_mutex); + printk(KERN_INFO "adb: starting probe task...\n"); do_adb_reset_bus(); printk(KERN_INFO "adb: finished probe task...\n"); @@ -282,7 +284,9 @@ adb_reset_bus(void) return 0; } - down(&adb_probe_mutex); + if (adb_got_sleep) + return 0; + schedule_work(&adb_reset_work); return 0; } @@ -347,23 +351,21 @@ adb_notify_sleep(struct pmu_sleep_notifi switch (when) { case PBOOK_SLEEP_REQUEST: + /* Signal to discontinue probing */ adb_got_sleep = 1; - /* We need to get a lock on the probe thread */ - down(&adb_probe_mutex); /* Stop autopoll */ if (adb_controller->autopoll) adb_controller->autopoll(0); ret = blocking_notifier_call_chain(&adb_client_list, ADB_MSG_POWERDOWN, NULL); if (ret & NOTIFY_STOP_MASK) { - up(&adb_probe_mutex); + adb_got_sleep = 0; return PBOOK_SLEEP_REFUSE; } break; case PBOOK_SLEEP_REJECT: if (adb_got_sleep) { adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); } break; @@ -372,7 +374,6 @@ adb_notify_sleep(struct pmu_sleep_notifi break; case PBOOK_WAKE: adb_got_sleep = 0; - up(&adb_probe_mutex); adb_reset_bus(); break; } Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.c =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.c +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.c @@ -97,7 +97,7 @@ struct dvb_frontend_private { struct dvb_device *dvbdev; struct dvb_frontend_parameters parameters; struct dvb_fe_events events; - struct semaphore sem; + struct compat_semaphore sem; struct list_head list_head; wait_queue_head_t wait_queue; pid_t thread_pid; Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.h =================================================================== --- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.h +++ linux/drivers/media/dvb/dvb-core/dvb_frontend.h @@ -138,7 +138,7 @@ struct dvb_fe_events { int eventr; int overflow; wait_queue_head_t wait_queue; - struct semaphore sem; + struct compat_semaphore sem; }; struct dvb_frontend { Index: linux/drivers/net/3c527.c =================================================================== --- linux.orig/drivers/net/3c527.c +++ linux/drivers/net/3c527.c @@ -182,7 +182,7 @@ struct mc32_local u16 rx_ring_tail; /* index to rx de-queue end */ - struct semaphore cmd_mutex; /* Serialises issuing of execute commands */ + struct compat_semaphore cmd_mutex; /* Serialises issuing of execute commands */ struct completion execution_cmd; /* Card has completed an execute command */ struct completion xceiver_cmd; /* Card has completed a tx or rx command */ }; Index: linux/drivers/net/3c59x.c =================================================================== --- linux.orig/drivers/net/3c59x.c +++ linux/drivers/net/3c59x.c @@ -793,9 +793,9 @@ static void poll_vortex(struct net_devic struct vortex_private *vp = netdev_priv(dev); unsigned long
flags; local_save_flags(flags); - local_irq_disable(); + local_irq_disable_nort(); (vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL); - local_irq_restore(flags); + local_irq_restore_nort(flags); } #endif @@ -1724,6 +1724,7 @@ vortex_timer(unsigned long data) int next_tick = 60*HZ; int ok = 0; int media_status, old_window; + unsigned long flags; if (vortex_debug > 2) { printk(KERN_DEBUG "%s: Media selection timer tick happened, %s.\n", @@ -1731,7 +1732,7 @@ vortex_timer(unsigned long data) printk(KERN_DEBUG "dev->watchdog_timeo=%d\n", dev->watchdog_timeo); } - disable_irq_lockdep(dev->irq); + spin_lock_irqsave(&vp->lock, flags); old_window = ioread16(ioaddr + EL3_CMD) >> 13; EL3WINDOW(4); media_status = ioread16(ioaddr + Wn4_Media); @@ -1754,9 +1755,7 @@ vortex_timer(unsigned long data) case XCVR_MII: case XCVR_NWAY: { ok = 1; - spin_lock_bh(&vp->lock); vortex_check_media(dev, 0); - spin_unlock_bh(&vp->lock); } break; default: /* Other media types handled by Tx timeouts. */ @@ -1812,7 +1811,7 @@ leave_media_alone: dev->name, media_tbl[dev->if_port].name); EL3WINDOW(old_window); - enable_irq_lockdep(dev->irq); + spin_unlock_irqrestore(&vp->lock, flags); mod_timer(&vp->timer, RUN_AT(next_tick)); if (vp->deferred) iowrite16(FakeIntr, ioaddr + EL3_CMD); @@ -1845,13 +1844,17 @@ static void vortex_tx_timeout(struct net /* * Block interrupts because vortex_interrupt does a bare spin_lock() */ +#ifndef CONFIG_PREEMPT_RT unsigned long flags; local_irq_save(flags); +#endif if (vp->full_bus_master_tx) boomerang_interrupt(dev->irq, dev, NULL); else vortex_interrupt(dev->irq, dev, NULL); +#ifndef CONFIG_PREEMPT_RT local_irq_restore(flags); +#endif } } Index: linux/drivers/net/e1000/e1000_main.c =================================================================== --- linux.orig/drivers/net/e1000/e1000_main.c +++ linux/drivers/net/e1000/e1000_main.c @@ -2965,10 +2965,8 @@ e1000_xmit_frame(struct sk_buff *skb, st (adapter->hw.mac_type == e1000_82573)) e1000_transfer_dhcp_info(adapter, skb); - local_irq_save(flags); - if (!spin_trylock(&tx_ring->tx_lock)) { + if (!spin_trylock_irqsave(&tx_ring->tx_lock, flags)) { /* Collision - tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } Index: linux/drivers/net/hamradio/6pack.c =================================================================== --- linux.orig/drivers/net/hamradio/6pack.c +++ linux/drivers/net/hamradio/6pack.c @@ -123,7 +123,7 @@ struct sixpack { struct timer_list tx_t; struct timer_list resync_t; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; spinlock_t lock; }; Index: linux/drivers/net/hamradio/mkiss.c =================================================================== --- linux.orig/drivers/net/hamradio/mkiss.c +++ linux/drivers/net/hamradio/mkiss.c @@ -84,7 +84,7 @@ struct mkiss { #define CRC_MODE_SMACK_TEST 4 atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; }; /*---------------------------------------------------------------------------*/ Index: linux/drivers/net/ibm_emac/ibm_emac_core.c =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.c +++ linux/drivers/net/ibm_emac/ibm_emac_core.c @@ -1061,6 +1061,8 @@ static inline int emac_xmit_finish(struc ++dev->stats.tx_packets; dev->stats.tx_bytes += len; + spin_unlock(&dev->tx_lock); + return 0; } @@ -1074,6 +1076,7 @@ static int emac_start_xmit(struct sk_buf u16 ctrl = EMAC_TX_CTRL_GFCS | 
EMAC_TX_CTRL_GP | MAL_TX_CTRL_READY | MAL_TX_CTRL_LAST | emac_tx_csum(dev, skb); + spin_lock(&dev->tx_lock); slot = dev->tx_slot++; if (dev->tx_slot == NUM_TX_BUFF) { dev->tx_slot = 0; @@ -1243,6 +1246,7 @@ static void emac_poll_tx(void *param) DBG2("%d: poll_tx, %d %d" NL, dev->def->index, dev->tx_cnt, dev->ack_slot); + spin_lock(&dev->tx_lock); if (dev->tx_cnt) { u16 ctrl; int slot = dev->ack_slot, n = 0; @@ -1252,6 +1256,7 @@ static void emac_poll_tx(void *param) struct sk_buff *skb = dev->tx_skb[slot]; ++n; + spin_unlock(&dev->tx_lock); if (skb) { dev_kfree_skb(skb); dev->tx_skb[slot] = NULL; @@ -1261,6 +1266,7 @@ static void emac_poll_tx(void *param) if (unlikely(EMAC_IS_BAD_TX(ctrl))) emac_parse_tx_error(dev, ctrl); + spin_lock(&dev->tx_lock); if (--dev->tx_cnt) goto again; } @@ -1273,6 +1279,7 @@ static void emac_poll_tx(void *param) DBG2("%d: tx %d pkts" NL, dev->def->index, n); } } + spin_unlock(&dev->tx_lock); } static inline void emac_recycle_rx_skb(struct ocp_enet_private *dev, int slot, @@ -1966,6 +1973,7 @@ static int __init emac_probe(struct ocp_ dev->ldev = &ocpdev->dev; dev->def = ocpdev->def; SET_MODULE_OWNER(ndev); + spin_lock_init(&dev->tx_lock); /* Find MAL device we are connected to */ maldev = Index: linux/drivers/net/ibm_emac/ibm_emac_core.h =================================================================== --- linux.orig/drivers/net/ibm_emac/ibm_emac_core.h +++ linux/drivers/net/ibm_emac/ibm_emac_core.h @@ -193,6 +193,8 @@ struct ocp_enet_private { struct ibm_emac_error_stats estats; struct net_device_stats nstats; + spinlock_t tx_lock; + struct device* ldev; }; Index: linux/drivers/net/netconsole.c =================================================================== --- linux.orig/drivers/net/netconsole.c +++ linux/drivers/net/netconsole.c @@ -74,16 +74,22 @@ static void write_msg(struct console *co if (!np.dev) return; - local_irq_save(flags); + /* + * A bit hairy. Netconsole uses mutexes (indirectly) and + * thus must have interrupts enabled: + */ + local_irq_save_nort(flags); for(left = len; left; ) { frag = min(left, MAX_PRINT_CHUNK); + WARN_ON_RT(irqs_disabled()); netpoll_send_udp(&np, msg, frag); + WARN_ON_RT(irqs_disabled()); msg += frag; left -= frag; } - local_irq_restore_nort(flags); + local_irq_restore_nort(flags); } static struct console netconsole = { Index: linux/drivers/net/plip.c =================================================================== --- linux.orig/drivers/net/plip.c +++ linux/drivers/net/plip.c @@ -227,7 +227,10 @@ struct net_local { struct hh_cache *hh); spinlock_t lock; atomic_t kill_timer; - struct semaphore killed_timer_sem; + /* + * PREEMPT_RT: this isn't a mutex, it should be struct completion.
+ */ + struct compat_semaphore killed_timer_sem; }; static inline void enable_parport_interrupts (struct net_device *dev) Index: linux/drivers/net/ppp_async.c =================================================================== --- linux.orig/drivers/net/ppp_async.c +++ linux/drivers/net/ppp_async.c @@ -67,7 +67,7 @@ struct asyncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ unsigned char obuf[OBUFSIZE]; }; Index: linux/drivers/net/ppp_synctty.c =================================================================== --- linux.orig/drivers/net/ppp_synctty.c +++ linux/drivers/net/ppp_synctty.c @@ -70,7 +70,7 @@ struct syncppp { struct tasklet_struct tsk; atomic_t refcnt; - struct semaphore dead_sem; + struct compat_semaphore dead_sem; struct ppp_channel chan; /* interface to generic ppp layer */ }; Index: linux/drivers/net/sungem.c =================================================================== --- linux.orig/drivers/net/sungem.c +++ linux/drivers/net/sungem.c @@ -1037,10 +1037,8 @@ static int gem_start_xmit(struct sk_buff (csum_stuff_off << 21)); } - local_irq_save(flags); - if (!spin_trylock(&gp->tx_lock)) { + if (!spin_trylock_irqsave(&gp->tx_lock, flags)) { /* Tell upper layer to requeue */ - local_irq_restore(flags); return NETDEV_TX_LOCKED; } /* We raced with gem_do_stop() */ Index: linux/drivers/net/tulip/tulip_core.c =================================================================== --- linux.orig/drivers/net/tulip/tulip_core.c +++ linux/drivers/net/tulip/tulip_core.c @@ -1804,6 +1804,7 @@ static void __devexit tulip_remove_one ( pci_iounmap(pdev, tp->base_addr); free_netdev (dev); pci_release_regions (pdev); + pci_disable_device (pdev); pci_set_drvdata (pdev, NULL); /* pci_power_off (pdev, -1); */ Index: linux/drivers/net/wireless/ipw2100.c =================================================================== --- linux.orig/drivers/net/wireless/ipw2100.c +++ linux/drivers/net/wireless/ipw2100.c @@ -163,6 +163,7 @@ that only one external action is invoked #include #include #include +#include #include "ipw2100.h" @@ -1697,6 +1698,11 @@ static int ipw2100_up(struct ipw2100_pri return 0; } + /* the ipw2100 hardware really doesn't want power management delays + * longer than 175usec + */ + modify_acceptable_latency("ipw2100", 175); + /* If the interrupt is enabled, turn it off... 
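
The e1000 hunk earlier and the sungem hunk above replace the open-coded local_irq_save() + spin_trylock() pair with spin_trylock_irqsave(), which this tree provides. Apart from being shorter, the combined primitive never leaves interrupts disabled on the failure path, and on PREEMPT_RT it maps onto the sleeping-lock trylock with no hard interrupt disabling at all. The pattern, for a hypothetical driver's hard_start_xmit:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/spinlock.h>

struct demo_priv {
	spinlock_t tx_lock;
};

static int demo_start_xmit(struct sk_buff *skb, struct net_device *ndev)
{
	struct demo_priv *priv = netdev_priv(ndev);
	unsigned long flags;

	if (!spin_trylock_irqsave(&priv->tx_lock, flags))
		return NETDEV_TX_LOCKED;	/* collision: core will requeue */

	/* ... hand the skb to the hardware ... */

	spin_unlock_irqrestore(&priv->tx_lock, flags);
	return NETDEV_TX_OK;
}
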
*/ spin_lock_irqsave(&priv->low_lock, flags); ipw2100_disable_interrupts(priv); @@ -1849,6 +1855,8 @@ static void ipw2100_down(struct ipw2100_ ipw2100_disable_interrupts(priv); spin_unlock_irqrestore(&priv->low_lock, flags); + modify_acceptable_latency("ipw2100", INFINITE_LATENCY); + #ifdef ACPI_CSTATE_LIMIT_DEFINED if (priv->config & CFG_C3_DISABLED) { IPW_DEBUG_INFO(": Resetting C3 transitions.\n"); @@ -6533,6 +6541,7 @@ static int __init ipw2100_init(void) ret = pci_module_init(&ipw2100_pci_driver); + set_acceptable_latency("ipw2100", INFINITE_LATENCY); #ifdef CONFIG_IPW2100_DEBUG ipw2100_debug_level = debug; driver_create_file(&ipw2100_pci_driver.driver, @@ -6553,6 +6562,7 @@ static void __exit ipw2100_exit(void) &driver_attr_debug_level); #endif pci_unregister_driver(&ipw2100_pci_driver); + remove_acceptable_latency("ipw2100"); } module_init(ipw2100_init); Index: linux/drivers/oprofile/oprofilefs.c =================================================================== --- linux.orig/drivers/oprofile/oprofilefs.c +++ linux/drivers/oprofile/oprofilefs.c @@ -21,7 +21,7 @@ #define OPROFILEFS_MAGIC 0x6f70726f -DEFINE_SPINLOCK(oprofilefs_lock); +DEFINE_RAW_SPINLOCK(oprofilefs_lock); static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode) { Index: linux/drivers/pci/Makefile =================================================================== --- linux.orig/drivers/pci/Makefile +++ linux/drivers/pci/Makefile @@ -27,7 +27,8 @@ obj-$(CONFIG_PPC64) += setup-bus.o obj-$(CONFIG_MIPS) += setup-bus.o setup-irq.o obj-$(CONFIG_X86_VISWS) += setup-irq.o -msiobj-y := msi.o msi-apic.o +msiobj-y := msi.o +msiobj-$(CONFIG_IA64) += msi-apic.o msiobj-$(CONFIG_IA64_GENERIC) += msi-altix.o msiobj-$(CONFIG_IA64_SGI_SN2) += msi-altix.o obj-$(CONFIG_PCI_MSI) += $(msiobj-y) Index: linux/drivers/pci/hotplug/cpci_hotplug_core.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpci_hotplug_core.c +++ linux/drivers/pci/hotplug/cpci_hotplug_core.c @@ -59,8 +59,8 @@ static int slots; static atomic_t extracting; int cpci_debug; static struct cpci_hp_controller *controller; -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore thread_exit; /* guard ensure thread has exited before calling it quits */ static int thread_finished = 1; static int enable_slot(struct hotplug_slot *slot); Index: linux/drivers/pci/hotplug/cpqphp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/cpqphp_ctrl.c +++ linux/drivers/pci/hotplug/cpqphp_ctrl.c @@ -44,8 +44,8 @@ static int configure_new_function(struct u8 behind_bridge, struct resource_lists *resources); static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ Index: 
linux/drivers/pci/hotplug/ibmphp_hpc.c =================================================================== --- linux.orig/drivers/pci/hotplug/ibmphp_hpc.c +++ linux/drivers/pci/hotplug/ibmphp_hpc.c @@ -106,7 +106,7 @@ static int tid_poll; static struct mutex sem_hpcaccess; // lock access to HPC static struct semaphore semOperations; // lock all operations and // access to data structures -static struct semaphore sem_exit; // make sure polling thread goes away +static struct compat_semaphore sem_exit; // make sure polling thread goes away //---------------------------------------------------------------------------- // local function prototypes //---------------------------------------------------------------------------- Index: linux/drivers/pci/hotplug/pciehp_ctrl.c =================================================================== --- linux.orig/drivers/pci/hotplug/pciehp_ctrl.c +++ linux/drivers/pci/hotplug/pciehp_ctrl.c @@ -37,8 +37,8 @@ static void interrupt_event_handler(struct controller *ctrl); -static struct semaphore event_semaphore; /* mutex for process loop (up if something to process) */ -static struct semaphore event_exit; /* guard ensure thread has exited before calling it quits */ +static struct compat_semaphore event_semaphore; /* mutex for process loop (up if something to process) */ +static struct compat_semaphore event_exit; /* guard ensure thread has exited before calling it quits */ static int event_finished; static unsigned long pushbutton_pending; /* = 0 */ static unsigned long surprise_rm_pending; /* = 0 */ Index: linux/drivers/pci/msi-altix.c =================================================================== --- linux.orig/drivers/pci/msi-altix.c +++ linux/drivers/pci/msi-altix.c @@ -26,7 +26,7 @@ struct sn_msi_info { static struct sn_msi_info *sn_msi_info; static void -sn_msi_teardown(unsigned int vector) +sn_msi_teardown(unsigned int irq) { nasid_t nasid; int widget; @@ -36,7 +36,7 @@ sn_msi_teardown(unsigned int vector) struct pcibus_bussoft *bussoft; struct sn_pcibus_provider *provider; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -45,9 +45,9 @@ sn_msi_teardown(unsigned int vector) provider = SN_PCIDEV_BUSPROVIDER(pdev); (*provider->dma_unmap)(pdev, - sn_msi_info[vector].pci_addr, + sn_msi_info[irq].pci_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; bussoft = SN_PCIDEV_BUSSOFT(pdev); nasid = NASID_GET(bussoft->bs_base); @@ -56,14 +56,13 @@ sn_msi_teardown(unsigned int vector) SWIN_WIDGETNUM(bussoft->bs_base); sn_intr_free(nasid, widget, sn_irq_info); - sn_msi_info[vector].sn_irq_info = NULL; + sn_msi_info[irq].sn_irq_info = NULL; return; } int -sn_msi_setup(struct pci_dev *pdev, unsigned int vector, - u32 *addr_hi, u32 *addr_lo, u32 *data) +sn_msi_setup(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { int widget; int status; @@ -93,7 +92,7 @@ sn_msi_setup(struct pci_dev *pdev, unsig if (! 
sn_irq_info) return -ENOMEM; - status = sn_intr_alloc(nasid, widget, sn_irq_info, vector, -1, -1); + status = sn_intr_alloc(nasid, widget, sn_irq_info, irq, -1, -1); if (status) { kfree(sn_irq_info); return -ENOMEM; @@ -119,28 +118,27 @@ sn_msi_setup(struct pci_dev *pdev, unsig return -ENOMEM; } - sn_msi_info[vector].sn_irq_info = sn_irq_info; - sn_msi_info[vector].pci_addr = bus_addr; + sn_msi_info[irq].sn_irq_info = sn_irq_info; + sn_msi_info[irq].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); /* * In the SN platform, bit 16 is a "send vector" bit which * must be present in order to move the vector through the system. */ - *data = 0x100 + (unsigned int)vector; + msg->data = 0x100 + irq; #ifdef CONFIG_SMP - set_irq_affinity_info((vector & 0xff), sn_irq_info->irq_cpuid, 0); + set_irq_affinity_info(irq, sn_irq_info->irq_cpuid, 0); #endif return 0; } static void -sn_msi_target(unsigned int vector, unsigned int cpu, - u32 *addr_hi, u32 *addr_lo) +sn_msi_target(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { int slice; nasid_t nasid; @@ -150,8 +148,10 @@ sn_msi_target(unsigned int vector, unsig struct sn_irq_info *sn_irq_info; struct sn_irq_info *new_irq_info; struct sn_pcibus_provider *provider; + unsigned int cpu; - sn_irq_info = sn_msi_info[vector].sn_irq_info; + cpu = first_cpu(cpu_mask); + sn_irq_info = sn_msi_info[irq].sn_irq_info; if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0) return; @@ -163,15 +163,15 @@ sn_msi_target(unsigned int vector, unsig pdev = sn_pdev->pdi_linux_pcidev; provider = SN_PCIDEV_BUSPROVIDER(pdev); - bus_addr = (u64)(*addr_hi) << 32 | (u64)(*addr_lo); + bus_addr = (u64)(msg->address_hi) << 32 | (u64)(msg->address_lo); (*provider->dma_unmap)(pdev, bus_addr, PCI_DMA_FROMDEVICE); - sn_msi_info[vector].pci_addr = 0; + sn_msi_info[irq].pci_addr = 0; nasid = cpuid_to_nasid(cpu); slice = cpuid_to_slice(cpu); new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice); - sn_msi_info[vector].sn_irq_info = new_irq_info; + sn_msi_info[irq].sn_irq_info = new_irq_info; if (new_irq_info == NULL) return; @@ -184,12 +184,13 @@ sn_msi_target(unsigned int vector, unsig sizeof(new_irq_info->irq_xtalkaddr), SN_DMA_MSI|SN_DMA_ADDR_XIO); - sn_msi_info[vector].pci_addr = bus_addr; - *addr_hi = (u32)(bus_addr >> 32); - *addr_lo = (u32)(bus_addr & 0x00000000ffffffff); + sn_msi_info[irq].pci_addr = bus_addr; + msg->address_hi = (u32)(bus_addr >> 32); + msg->address_lo = (u32)(bus_addr & 0x00000000ffffffff); } struct msi_ops sn_msi_ops = { + .needs_64bit_address = 1, .setup = sn_msi_setup, .teardown = sn_msi_teardown, #ifdef CONFIG_SMP @@ -201,7 +202,7 @@ int sn_msi_init(void) { sn_msi_info = - kzalloc(sizeof(struct sn_msi_info) * NR_VECTORS, GFP_KERNEL); + kzalloc(sizeof(struct sn_msi_info) * NR_IRQS, GFP_KERNEL); if (! 
sn_msi_info) return -ENOMEM; Index: linux/drivers/pci/msi-apic.c =================================================================== --- linux.orig/drivers/pci/msi-apic.c +++ linux/drivers/pci/msi-apic.c @@ -46,37 +46,36 @@ static void -msi_target_apic(unsigned int vector, - unsigned int dest_cpu, - u32 *address_hi, /* in/out */ - u32 *address_lo) /* in/out */ +msi_target_apic(unsigned int irq, cpumask_t cpu_mask, struct msi_msg *msg) { - u32 addr = *address_lo; + u32 addr = msg->address_lo; addr &= MSI_ADDR_DESTID_MASK; - addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(dest_cpu)); + addr |= MSI_ADDR_DESTID_CPU(cpu_physical_id(first_cpu(cpu_mask))); - *address_lo = addr; + msg->address_lo = addr; } static int msi_setup_apic(struct pci_dev *pdev, /* unused in generic */ - unsigned int vector, - u32 *address_hi, - u32 *address_lo, - u32 *data) + unsigned int irq, + struct msi_msg *msg) { unsigned long dest_phys_id; + unsigned int vector; dest_phys_id = cpu_physical_id(first_cpu(cpu_online_map)); + vector = irq; - *address_hi = 0; - *address_lo = MSI_ADDR_HEADER | - MSI_ADDR_DESTMODE_PHYS | - MSI_ADDR_REDIRECTION_CPU | - MSI_ADDR_DESTID_CPU(dest_phys_id); + msg->address_hi = 0; + msg->address_lo = + MSI_ADDR_HEADER | + MSI_ADDR_DESTMODE_PHYS | + MSI_ADDR_REDIRECTION_CPU | + MSI_ADDR_DESTID_CPU(dest_phys_id); - *data = MSI_DATA_TRIGGER_EDGE | + msg->data = + MSI_DATA_TRIGGER_EDGE | MSI_DATA_LEVEL_ASSERT | MSI_DATA_DELIVERY_FIXED | MSI_DATA_VECTOR(vector); @@ -85,7 +84,7 @@ msi_setup_apic(struct pci_dev *pdev, /* } static void -msi_teardown_apic(unsigned int vector) +msi_teardown_apic(unsigned int irq) { return; /* no-op */ } @@ -95,6 +94,7 @@ msi_teardown_apic(unsigned int vector) */ struct msi_ops msi_apic_ops = { + .needs_64bit_address = 0, .setup = msi_setup_apic, .teardown = msi_teardown_apic, .target = msi_target_apic, Index: linux/drivers/pci/msi.c =================================================================== --- linux.orig/drivers/pci/msi.c +++ linux/drivers/pci/msi.c @@ -6,6 +6,7 @@ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com) */ +#include #include #include #include @@ -22,19 +23,11 @@ #include "pci.h" #include "msi.h" -static DEFINE_SPINLOCK(msi_lock); +static DEFINE_RAW_SPINLOCK(msi_lock); static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL }; static kmem_cache_t* msi_cachep; static int pci_msi_enable = 1; -static int last_alloc_vector; -static int nr_released_vectors; -static int nr_reserved_vectors = NR_HP_RESERVED_VECTORS; -static int nr_msix_devices; - -#ifndef CONFIG_X86_IO_APIC -int vector_irq[NR_VECTORS] = { [0 ... 
NR_VECTORS - 1] = -1}; -#endif static struct msi_ops *msi_ops; @@ -61,11 +54,11 @@ static int msi_cache_init(void) return 0; } -static void msi_set_mask_bit(unsigned int vector, int flag) +static void msi_set_mask_bit(unsigned int irq, int flag) { struct msi_desc *entry; - entry = (struct msi_desc *)msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev || !entry->mask_base) return; switch (entry->msi_attrib.type) { @@ -93,84 +86,119 @@ static void msi_set_mask_bit(unsigned in } } -#ifdef CONFIG_SMP -static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask) +static void read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { - struct msi_desc *entry; - u32 address_hi, address_lo; - unsigned int irq = vector; - unsigned int dest_cpu = first_cpu(cpu_mask); + switch(entry->msi_attrib.type) { + case PCI_CAP_ID_MSI: + { + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; + u16 data; + + pci_read_config_dword(dev, msi_lower_address_reg(pos), + &msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_read_config_dword(dev, msi_upper_address_reg(pos), + &msg->address_hi); + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } else { + msg->address_hi = 0; + pci_read_config_word(dev, msi_data_reg(pos, 1), &data); + } + msg->data = data; + break; + } + case PCI_CAP_ID_MSIX: + { + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; - entry = (struct msi_desc *)msi_desc[vector]; - if (!entry || !entry->dev) - return; + msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + msg->data = readl(base + PCI_MSIX_ENTRY_DATA_OFFSET); + break; + } + default: + BUG(); + } +} +static void write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ switch (entry->msi_attrib.type) { case PCI_CAP_ID_MSI: { - int pos = pci_find_capability(entry->dev, PCI_CAP_ID_MSI); + struct pci_dev *dev = entry->dev; + int pos = entry->msi_attrib.pos; - if (!pos) - return; - - pci_read_config_dword(entry->dev, msi_upper_address_reg(pos), - &address_hi); - pci_read_config_dword(entry->dev, msi_lower_address_reg(pos), - &address_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - pci_write_config_dword(entry->dev, msi_upper_address_reg(pos), - address_hi); - pci_write_config_dword(entry->dev, msi_lower_address_reg(pos), - address_lo); - set_native_irq_info(irq, cpu_mask); + pci_write_config_dword(dev, msi_lower_address_reg(pos), + msg->address_lo); + if (entry->msi_attrib.is_64) { + pci_write_config_dword(dev, msi_upper_address_reg(pos), + msg->address_hi); + pci_write_config_word(dev, msi_data_reg(pos, 1), + msg->data); + } else { + pci_write_config_word(dev, msi_data_reg(pos, 0), + msg->data); + } break; } case PCI_CAP_ID_MSIX: { - int offset_hi = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET; - int offset_lo = - entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET; - - address_hi = readl(entry->mask_base + offset_hi); - address_lo = readl(entry->mask_base + offset_lo); - - msi_ops->target(vector, dest_cpu, &address_hi, &address_lo); - - writel(address_hi, entry->mask_base + offset_hi); - writel(address_lo, entry->mask_base + offset_lo); - set_native_irq_info(irq, cpu_mask); + void __iomem *base; + base = entry->mask_base + + entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE; + + writel(msg->address_lo, + base + PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); + 
writel(msg->address_hi, + base + PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); + writel(msg->data, base + PCI_MSIX_ENTRY_DATA_OFFSET); break; } default: - break; + BUG(); } } + +#ifdef CONFIG_SMP +static void set_msi_affinity(unsigned int irq, cpumask_t cpu_mask) +{ + struct msi_desc *entry; + struct msi_msg msg; + + entry = msi_desc[irq]; + if (!entry || !entry->dev) + return; + + read_msi_msg(entry, &msg); + msi_ops->target(irq, cpu_mask, &msg); + write_msi_msg(entry, &msg); + set_native_irq_info(irq, cpu_mask); +} #else #define set_msi_affinity NULL #endif /* CONFIG_SMP */ -static void mask_MSI_irq(unsigned int vector) +static void mask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 1); + msi_set_mask_bit(irq, 1); } -static void unmask_MSI_irq(unsigned int vector) +static void unmask_MSI_irq(unsigned int irq) { - msi_set_mask_bit(vector, 0); + msi_set_mask_bit(irq, 0); } -static unsigned int startup_msi_irq_wo_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_wo_maskbit(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || !entry->dev) { spin_unlock_irqrestore(&msi_lock, flags); return 0; @@ -181,39 +209,39 @@ static unsigned int startup_msi_irq_wo_m return 0; /* never anything pending */ } -static unsigned int startup_msi_irq_w_maskbit(unsigned int vector) +static unsigned int startup_msi_irq_w_maskbit(unsigned int irq) { - startup_msi_irq_wo_maskbit(vector); - unmask_MSI_irq(vector); + startup_msi_irq_wo_maskbit(irq); + unmask_MSI_irq(irq); return 0; /* never anything pending */ } -static void shutdown_msi_irq(unsigned int vector) +static void shutdown_msi_irq(unsigned int irq) { struct msi_desc *entry; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (entry && entry->dev) entry->msi_attrib.state = 0; /* Mark it not active */ spin_unlock_irqrestore(&msi_lock, flags); } -static void end_msi_irq_wo_maskbit(unsigned int vector) +static void end_msi_irq_wo_maskbit(unsigned int irq) { - move_native_irq(vector); + move_native_irq(irq); ack_APIC_irq(); } -static void end_msi_irq_w_maskbit(unsigned int vector) +static void end_msi_irq_w_maskbit(unsigned int irq) { - move_native_irq(vector); - unmask_MSI_irq(vector); + move_native_irq(irq); + unmask_MSI_irq(irq); ack_APIC_irq(); } -static void do_nothing(unsigned int vector) +static void do_nothing(unsigned int irq) { } @@ -264,86 +292,7 @@ static struct hw_interrupt_type msi_irq_ .set_affinity = set_msi_affinity }; -static int msi_free_vector(struct pci_dev* dev, int vector, int reassign); -static int assign_msi_vector(void) -{ - static int new_vector_avail = 1; - int vector; - unsigned long flags; - - /* - * msi_lock is provided to ensure that successful allocation of MSI - * vector is assigned unique among drivers. - */ - spin_lock_irqsave(&msi_lock, flags); - - if (!new_vector_avail) { - int free_vector = 0; - - /* - * vector_irq[] = -1 indicates that this specific vector is: - * - assigned for MSI (since MSI have no associated IRQ) or - * - assigned for legacy if less than 16, or - * - having no corresponding 1:1 vector-to-IOxAPIC IRQ mapping - * vector_irq[] = 0 indicates that this vector, previously - * assigned for MSI, is freed by hotplug removed operations. - * This vector will be reused for any subsequent hotplug added - * operations. - * vector_irq[] > 0 indicates that this vector is assigned for - * IOxAPIC IRQs. 
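
The msi_lock used throughout this file (like oprofilefs_lock earlier) is declared with DEFINE_RAW_SPINLOCK. In the -rt tree a plain spinlock_t turns into a preemptible, priority-inheriting sleeping lock, while a raw spinlock keeps the vanilla spin-with-interrupts-off behaviour; the lock API picks the implementation by type, which is why the spin_lock_irqsave() calls in these hunks keep working unchanged. Sketch of the distinction (hypothetical locks):

static DEFINE_RAW_SPINLOCK(demo_raw_lock);	/* truly disables irqs, even on RT */
static DEFINE_SPINLOCK(demo_lock);		/* on RT: sleeping, PI-aware */

static void demo(void)
{
	unsigned long flags;

	spin_lock_irqsave(&demo_raw_lock, flags);
	/* may be contended by non-threaded (hard) irq context */
	spin_unlock_irqrestore(&demo_raw_lock, flags);

	spin_lock(&demo_lock);
	/* may sleep on PREEMPT_RT: never take from hardirq context */
	spin_unlock(&demo_lock);
}
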
This vector and its value provides a 1-to-1 - * vector-to-IOxAPIC IRQ mapping. - */ - for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) { - if (vector_irq[vector] != 0) - continue; - free_vector = vector; - if (!msi_desc[vector]) - break; - else - continue; - } - if (!free_vector) { - spin_unlock_irqrestore(&msi_lock, flags); - return -EBUSY; - } - vector_irq[free_vector] = -1; - nr_released_vectors--; - spin_unlock_irqrestore(&msi_lock, flags); - if (msi_desc[free_vector] != NULL) { - struct pci_dev *dev; - int tail; - - /* free all linked vectors before re-assign */ - do { - spin_lock_irqsave(&msi_lock, flags); - dev = msi_desc[free_vector]->dev; - tail = msi_desc[free_vector]->link.tail; - spin_unlock_irqrestore(&msi_lock, flags); - msi_free_vector(dev, tail, 1); - } while (free_vector != tail); - } - - return free_vector; - } - vector = assign_irq_vector(AUTO_ASSIGN); - last_alloc_vector = vector; - if (vector == LAST_DEVICE_VECTOR) - new_vector_avail = 0; - - spin_unlock_irqrestore(&msi_lock, flags); - return vector; -} - -static int get_new_vector(void) -{ - int vector = assign_msi_vector(); - - if (vector > 0) - set_intr_gate(vector, interrupt[vector]); - - return vector; -} - +static int msi_free_irq(struct pci_dev* dev, int irq); static int msi_init(void) { static int status = -ENOMEM; @@ -367,13 +316,13 @@ static int msi_init(void) } if (! msi_ops) { + pci_msi_enable = 0; printk(KERN_WARNING "PCI: MSI ops not registered. MSI disabled.\n"); status = -EINVAL; return status; } - last_alloc_vector = assign_irq_vector(AUTO_ASSIGN); status = msi_cache_init(); if (status < 0) { pci_msi_enable = 0; @@ -381,23 +330,9 @@ static int msi_init(void) return status; } - if (last_alloc_vector < 0) { - pci_msi_enable = 0; - printk(KERN_WARNING "PCI: No interrupt vectors available for MSI\n"); - status = -EBUSY; - return status; - } - vector_irq[last_alloc_vector] = 0; - nr_released_vectors++; - return status; } -static int get_msi_vector(struct pci_dev *dev) -{ - return get_new_vector(); -} - static struct msi_desc* alloc_msi_entry(void) { struct msi_desc *entry; @@ -413,29 +348,45 @@ static struct msi_desc* alloc_msi_entry( return entry; } -static void attach_msi_entry(struct msi_desc *entry, int vector) +static void attach_msi_entry(struct msi_desc *entry, int irq) { unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - msi_desc[vector] = entry; + msi_desc[irq] = entry; spin_unlock_irqrestore(&msi_lock, flags); } -static void irq_handler_init(int cap_id, int pos, int mask) +static int create_msi_irq(struct hw_interrupt_type *handler) { - unsigned long flags; + struct msi_desc *entry; + int irq; + + entry = alloc_msi_entry(); + if (!entry) + return -ENOMEM; - spin_lock_irqsave(&irq_desc[pos].lock, flags); - if (cap_id == PCI_CAP_ID_MSIX) - irq_desc[pos].chip = &msix_irq_type; - else { - if (!mask) - irq_desc[pos].chip = &msi_irq_wo_maskbit_type; - else - irq_desc[pos].chip = &msi_irq_w_maskbit_type; + irq = create_irq(); + if (irq < 0) { + kmem_cache_free(msi_cachep, entry); + return -EBUSY; } - spin_unlock_irqrestore(&irq_desc[pos].lock, flags); + + set_irq_chip(irq, handler); + set_irq_data(irq, entry); + + return irq; +} + +static void destroy_msi_irq(unsigned int irq) +{ + struct msi_desc *entry; + + entry = get_irq_data(irq); + set_irq_chip(irq, NULL); + set_irq_data(irq, NULL); + destroy_irq(irq); + kmem_cache_free(msi_cachep, entry); } static void enable_msi_mode(struct pci_dev *dev, int pos, int type) @@ -480,21 +431,21 @@ void disable_msi_mode(struct pci_dev *de } } 
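
create_msi_irq()/destroy_msi_irq() above retire the private vector bookkeeping in favour of the genirq dynamic-irq interface: create_irq() hands out an unused irq number, set_irq_chip() binds the flow handler and set_irq_data() attaches per-irq state. Their allocate/bind/release life cycle, condensed (sketch using the same 2.6.18-era calls as the hunk):

static int demo_alloc_irq(struct hw_interrupt_type *chip, void *cookie)
{
	int irq = create_irq();		/* pick a free irq number */

	if (irq < 0)
		return irq;
	set_irq_chip(irq, chip);	/* route flow control to the MSI chip */
	set_irq_data(irq, cookie);	/* stash per-irq driver state */
	return irq;
}

static void demo_release_irq(unsigned int irq)
{
	set_irq_chip(irq, NULL);
	set_irq_data(irq, NULL);
	destroy_irq(irq);		/* return the number to the pool */
}
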
-static int msi_lookup_vector(struct pci_dev *dev, int type) +static int msi_lookup_irq(struct pci_dev *dev, int type) { - int vector; + int irq; unsigned long flags; spin_lock_irqsave(&msi_lock, flags); - for (vector = FIRST_DEVICE_VECTOR; vector < NR_IRQS; vector++) { - if (!msi_desc[vector] || msi_desc[vector]->dev != dev || - msi_desc[vector]->msi_attrib.type != type || - msi_desc[vector]->msi_attrib.default_vector != dev->irq) + for (irq = 0; irq < NR_IRQS; irq++) { + if (!msi_desc[irq] || msi_desc[irq]->dev != dev || + msi_desc[irq]->msi_attrib.type != type || + msi_desc[irq]->msi_attrib.default_irq != dev->irq) continue; spin_unlock_irqrestore(&msi_lock, flags); - /* This pre-assigned MSI vector for this device - already exits. Override dev->irq with this vector */ - dev->irq = vector; + /* This pre-assigned MSI irq for this device + already exists. Override dev->irq with this irq */ + dev->irq = irq; return 0; } spin_unlock_irqrestore(&msi_lock, flags); @@ -506,11 +457,6 @@ void pci_scan_msi_device(struct pci_dev { if (!dev) return; - - if (pci_find_capability(dev, PCI_CAP_ID_MSIX) > 0) - nr_msix_devices++; - else if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0) - nr_reserved_vectors++; } #ifdef CONFIG_PM @@ -584,7 +530,7 @@ int pci_save_msix_state(struct pci_dev * { int pos; int temp; - int vector, head, tail = 0; + int irq, head, tail = 0; u16 control; struct pci_cap_saved_state *save_state; @@ -606,33 +552,20 @@ int pci_save_msix_state(struct pci_dev * /* save the table */ temp = dev->irq; - if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { + if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { kfree(save_state); return -EINVAL; } - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { - int j; - void __iomem *base; struct msi_desc *entry; - entry = msi_desc[vector]; - base = entry->mask_base; - j = entry->msi_attrib.entry_nr; - - entry->address_lo_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - entry->address_hi_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - entry->data_save = - readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); + entry = msi_desc[irq]; + read_msi_msg(entry, &entry->msg_save); - tail = msi_desc[vector]->link.tail; - vector = tail; + tail = msi_desc[irq]->link.tail; + irq = tail; } dev->irq = temp; @@ -645,9 +578,7 @@ void pci_restore_msix_state(struct pci_d { u16 save; int pos; - int vector, head, tail = 0; - void __iomem *base; - int j; + int irq, head, tail = 0; struct msi_desc *entry; int temp; struct pci_cap_saved_state *save_state; @@ -665,26 +596,15 @@ void pci_restore_msix_state(struct pci_d /* route the table */ temp = dev->irq; - if (msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) + if (msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) return; - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { - entry = msi_desc[vector]; - base = entry->mask_base; - j = entry->msi_attrib.entry_nr; - - writel(entry->address_lo_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(entry->address_hi_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(entry->data_save, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); + entry = msi_desc[irq]; + write_msi_msg(entry, &entry->msg_save); - tail = msi_desc[vector]->link.tail; - vector = tail; + tail = msi_desc[irq]->link.tail; + irq = tail; } dev->irq = temp; @@ -696,29 +616,19 @@ void pci_restore_msix_state(struct pci_d static int
msi_register_init(struct pci_dev *dev, struct msi_desc *entry) { int status; - u32 address_hi; - u32 address_lo; - u32 data; - int pos, vector = dev->irq; + struct msi_msg msg; + int pos; u16 control; - pos = pci_find_capability(dev, PCI_CAP_ID_MSI); + pos = entry->msi_attrib.pos; pci_read_config_word(dev, msi_control_reg(pos), &control); /* Configure MSI capability structure */ - status = msi_ops->setup(dev, vector, &address_hi, &address_lo, &data); + status = msi_ops->setup(dev, dev->irq, &msg); if (status < 0) return status; - pci_write_config_dword(dev, msi_lower_address_reg(pos), address_lo); - if (is_64bit_address(control)) { - pci_write_config_dword(dev, - msi_upper_address_reg(pos), address_hi); - pci_write_config_word(dev, - msi_data_reg(pos, 1), data); - } else - pci_write_config_word(dev, - msi_data_reg(pos, 0), data); + write_msi_msg(entry, &msg); if (entry->msi_attrib.maskbit) { unsigned int maskbits, temp; /* All MSIs are unmasked by default, Mask them all */ @@ -741,53 +651,54 @@ static int msi_register_init(struct pci_ * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with a single - * MSI vector, regardless of device function is capable of handling + * MSI irq, regardless of device function is capable of handling * multiple messages. A return of zero indicates the successful setup - * of an entry zero with the new MSI vector or non-zero for otherwise. + * of an entry zero with the new MSI irq or non-zero for otherwise. **/ static int msi_capability_init(struct pci_dev *dev) { int status; struct msi_desc *entry; - int pos, vector; + int pos, irq; u16 control; + struct hw_interrupt_type *handler; pos = pci_find_capability(dev, PCI_CAP_ID_MSI); pci_read_config_word(dev, msi_control_reg(pos), &control); /* MSI Entry Initialization */ - entry = alloc_msi_entry(); - if (!entry) - return -ENOMEM; - - vector = get_msi_vector(dev); - if (vector < 0) { - kmem_cache_free(msi_cachep, entry); - return -EBUSY; - } - entry->link.head = vector; - entry->link.tail = vector; + handler = &msi_irq_wo_maskbit_type; + if (is_mask_bit_support(control)) + handler = &msi_irq_w_maskbit_type; + + irq = create_msi_irq(handler); + if (irq < 0) + return irq; + + entry = get_irq_data(irq); + entry->link.head = irq; + entry->link.tail = irq; entry->msi_attrib.type = PCI_CAP_ID_MSI; entry->msi_attrib.state = 0; /* Mark it not active */ + entry->msi_attrib.is_64 = is_64bit_address(control); entry->msi_attrib.entry_nr = 0; entry->msi_attrib.maskbit = is_mask_bit_support(control); - entry->msi_attrib.default_vector = dev->irq; /* Save IOAPIC IRQ */ - dev->irq = vector; + entry->msi_attrib.default_irq = dev->irq; /* Save IOAPIC IRQ */ + entry->msi_attrib.pos = pos; + dev->irq = irq; entry->dev = dev; if (is_mask_bit_support(control)) { entry->mask_base = (void __iomem *)(long)msi_mask_bits_reg(pos, is_64bit_address(control)); } - /* Replace with MSI handler */ - irq_handler_init(PCI_CAP_ID_MSI, vector, entry->msi_attrib.maskbit); /* Configure MSI capability structure */ status = msi_register_init(dev, entry); if (status != 0) { - dev->irq = entry->msi_attrib.default_vector; - kmem_cache_free(msi_cachep, entry); + dev->irq = entry->msi_attrib.default_irq; + destroy_msi_irq(irq); return status; } - attach_msi_entry(entry, vector); + attach_msi_entry(entry, irq); /* Set MSI enabled bits */ enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); @@ -801,18 +712,16 @@ static int msi_capability_init(struct pc * @nvec: number of @entries * * Setup the 
MSI-X capability structure of device function with a - * single MSI-X vector. A return of zero indicates the successful setup of - * requested MSI-X entries with allocated vectors or non-zero for otherwise. + * single MSI-X irq. A return of zero indicates the successful setup of + * requested MSI-X entries with allocated irqs or non-zero for otherwise. **/ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec) { struct msi_desc *head = NULL, *tail = NULL, *entry = NULL; - u32 address_hi; - u32 address_lo; - u32 data; + struct msi_msg msg; int status; - int vector, pos, i, j, nr_entries, temp = 0; + int irq, pos, i, j, nr_entries, temp = 0; unsigned long phys_addr; u32 table_offset; u16 control; @@ -834,65 +743,58 @@ static int msix_capability_init(struct p /* MSI-X Table Initialization */ for (i = 0; i < nvec; i++) { - entry = alloc_msi_entry(); - if (!entry) - break; - vector = get_msi_vector(dev); - if (vector < 0) { - kmem_cache_free(msi_cachep, entry); + irq = create_msi_irq(&msix_irq_type); + if (irq < 0) break; - } + entry = get_irq_data(irq); j = entries[i].entry; - entries[i].vector = vector; + entries[i].vector = irq; entry->msi_attrib.type = PCI_CAP_ID_MSIX; entry->msi_attrib.state = 0; /* Mark it not active */ + entry->msi_attrib.is_64 = 1; entry->msi_attrib.entry_nr = j; entry->msi_attrib.maskbit = 1; - entry->msi_attrib.default_vector = dev->irq; + entry->msi_attrib.default_irq = dev->irq; + entry->msi_attrib.pos = pos; entry->dev = dev; entry->mask_base = base; if (!head) { - entry->link.head = vector; - entry->link.tail = vector; + entry->link.head = irq; + entry->link.tail = irq; head = entry; } else { entry->link.head = temp; entry->link.tail = tail->link.tail; - tail->link.tail = vector; - head->link.head = vector; + tail->link.tail = irq; + head->link.head = irq; } - temp = vector; + temp = irq; tail = entry; - /* Replace with MSI-X handler */ - irq_handler_init(PCI_CAP_ID_MSIX, vector, 1); /* Configure MSI-X capability structure */ - status = msi_ops->setup(dev, vector, - &address_hi, - &address_lo, - &data); - if (status < 0) + status = msi_ops->setup(dev, irq, &msg); + if (status < 0) { + destroy_msi_irq(irq); break; + } - writel(address_lo, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel(address_hi, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel(data, - base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); - attach_msi_entry(entry, vector); + write_msi_msg(entry, &msg); + attach_msi_entry(entry, irq); } if (i != nvec) { + int avail = i - 1; i--; for (; i >= 0; i--) { - vector = (entries + i)->vector; - msi_free_vector(dev, vector, 0); + irq = (entries + i)->vector; + msi_free_irq(dev, irq); (entries + i)->vector = 0; } - return -EBUSY; + /* If we had some success report the number of irqs + * we succeeded in setting up. + */ + if (avail <= 0) + avail = -EBUSY; + return avail; } /* Set MSI-X enabled bits */ enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); @@ -905,10 +807,10 @@ static int msix_capability_init(struct p * @dev: pointer to the pci_dev data structure of MSI device function * * Setup the MSI capability structure of device function with - * a single MSI vector upon its software driver call to request for + * a single MSI irq upon its software driver call to request for * MSI mode enabled on its hardware device function. A return of zero * indicates the successful setup of an entry zero with the new MSI - * vector or non-zero for otherwise. 
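
Note the reworked failure path in msix_capability_init() above: instead of a flat -EBUSY, it now reports how many irqs it did manage to set up, and pci_enable_msix() forwards that as a positive return value. A driver is expected to treat a positive return as "retry with this smaller count", e.g. (hypothetical caller):

/* Request up to 8 MSI-X irqs, settling for however many are available. */
static int demo_setup_msix(struct pci_dev *pdev, struct msix_entry *entries)
{
	int i, nvec = 8, rc;

	for (i = 0; i < nvec; i++)
		entries[i].entry = i;

	for (;;) {
		rc = pci_enable_msix(pdev, entries, nvec);
		if (rc == 0)
			return nvec;	/* got everything we asked for */
		if (rc < 0)
			return rc;	/* hard failure */
		nvec = rc;		/* fewer available: retry with that */
	}
}
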
+ * irq or non-zero for otherwise. **/ int pci_enable_msi(struct pci_dev* dev) { @@ -936,52 +838,29 @@ int pci_enable_msi(struct pci_dev* dev) if (!pos) return -EINVAL; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { - /* Lookup Sucess */ - unsigned long flags; + pci_read_config_word(dev, msi_control_reg(pos), &control); + if (!is_64bit_address(control) && msi_ops->needs_64bit_address) + return -EINVAL; - pci_read_config_word(dev, msi_control_reg(pos), &control); - if (control & PCI_MSI_FLAGS_ENABLE) - return 0; /* Already in MSI mode */ - spin_lock_irqsave(&msi_lock, flags); - if (!vector_irq[dev->irq]) { - msi_desc[dev->irq]->msi_attrib.state = 0; - vector_irq[dev->irq] = -1; - nr_released_vectors--; - spin_unlock_irqrestore(&msi_lock, flags); - status = msi_register_init(dev, msi_desc[dev->irq]); - if (status == 0) - enable_msi_mode(dev, pos, PCI_CAP_ID_MSI); - return status; - } - spin_unlock_irqrestore(&msi_lock, flags); - dev->irq = temp; - } - /* Check whether driver already requested for MSI-X vectors */ + WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSI)); + + /* Check whether driver already requested for MSI-X irqs */ pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { printk(KERN_INFO "PCI: %s: Can't enable MSI. " - "Device already has MSI-X vectors assigned\n", + "Device already has MSI-X irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; } status = msi_capability_init(dev); - if (!status) { - if (!pos) - nr_reserved_vectors--; /* Only MSI capable */ - else if (nr_msix_devices > 0) - nr_msix_devices--; /* Both MSI and MSI-X capable, - but choose enabling MSI */ - } - return status; } void pci_disable_msi(struct pci_dev* dev) { struct msi_desc *entry; - int pos, default_vector; + int pos, default_irq; u16 control; unsigned long flags; @@ -998,6 +877,8 @@ void pci_disable_msi(struct pci_dev* dev if (!(control & PCI_MSI_FLAGS_ENABLE)) return; + disable_msi_mode(dev, pos, PCI_CAP_ID_MSI); + spin_lock_irqsave(&msi_lock, flags); entry = msi_desc[dev->irq]; if (!entry || !entry->dev || entry->msi_attrib.type != PCI_CAP_ID_MSI) { @@ -1007,32 +888,30 @@ void pci_disable_msi(struct pci_dev* dev if (entry->msi_attrib.state) { spin_unlock_irqrestore(&msi_lock, flags); printk(KERN_WARNING "PCI: %s: pci_disable_msi() called without " - "free_irq() on MSI vector %d\n", + "free_irq() on MSI irq %d\n", pci_name(dev), dev->irq); BUG_ON(entry->msi_attrib.state > 0); } else { - vector_irq[dev->irq] = 0; /* free it */ - nr_released_vectors++; - default_vector = entry->msi_attrib.default_vector; + default_irq = entry->msi_attrib.default_irq; spin_unlock_irqrestore(&msi_lock, flags); - /* Restore dev->irq to its default pin-assertion vector */ - dev->irq = default_vector; - disable_msi_mode(dev, pci_find_capability(dev, PCI_CAP_ID_MSI), - PCI_CAP_ID_MSI); + msi_free_irq(dev, dev->irq); + + /* Restore dev->irq to its default pin-assertion irq */ + dev->irq = default_irq; } } -static int msi_free_vector(struct pci_dev* dev, int vector, int reassign) +static int msi_free_irq(struct pci_dev* dev, int irq) { struct msi_desc *entry; int head, entry_nr, type; void __iomem *base; unsigned long flags; - msi_ops->teardown(vector); + msi_ops->teardown(irq); spin_lock_irqsave(&msi_lock, flags); - entry = msi_desc[vector]; + entry = msi_desc[irq]; if (!entry || entry->dev != dev) { spin_unlock_irqrestore(&msi_lock, flags); return -EINVAL; @@ -1044,101 +923,43 @@ static int 
msi_free_vector(struct pci_de msi_desc[entry->link.head]->link.tail = entry->link.tail; msi_desc[entry->link.tail]->link.head = entry->link.head; entry->dev = NULL; - if (!reassign) { - vector_irq[vector] = 0; - nr_released_vectors++; - } - msi_desc[vector] = NULL; + msi_desc[irq] = NULL; spin_unlock_irqrestore(&msi_lock, flags); - kmem_cache_free(msi_cachep, entry); + destroy_msi_irq(irq); if (type == PCI_CAP_ID_MSIX) { - if (!reassign) - writel(1, base + - entry_nr * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); + writel(1, base + entry_nr * PCI_MSIX_ENTRY_SIZE + + PCI_MSIX_ENTRY_VECTOR_CTRL_OFFSET); - if (head == vector) + if (head == irq) iounmap(base); } return 0; } -static int reroute_msix_table(int head, struct msix_entry *entries, int *nvec) -{ - int vector = head, tail = 0; - int i, j = 0, nr_entries = 0; - void __iomem *base; - unsigned long flags; - - spin_lock_irqsave(&msi_lock, flags); - while (head != tail) { - nr_entries++; - tail = msi_desc[vector]->link.tail; - if (entries[0].entry == msi_desc[vector]->msi_attrib.entry_nr) - j = vector; - vector = tail; - } - if (*nvec > nr_entries) { - spin_unlock_irqrestore(&msi_lock, flags); - *nvec = nr_entries; - return -EINVAL; - } - vector = ((j > 0) ? j : head); - for (i = 0; i < *nvec; i++) { - j = msi_desc[vector]->msi_attrib.entry_nr; - msi_desc[vector]->msi_attrib.state = 0; /* Mark it not active */ - vector_irq[vector] = -1; /* Mark it busy */ - nr_released_vectors--; - entries[i].vector = vector; - if (j != (entries + i)->entry) { - base = msi_desc[vector]->mask_base; - msi_desc[vector]->msi_attrib.entry_nr = - (entries + i)->entry; - writel( readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET), base + - (entries + i)->entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET); - writel( readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET), base + - (entries + i)->entry * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_UPPER_ADDR_OFFSET); - writel( (readl(base + j * PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET) & 0xff00) | vector, - base + (entries+i)->entry*PCI_MSIX_ENTRY_SIZE + - PCI_MSIX_ENTRY_DATA_OFFSET); - } - vector = msi_desc[vector]->link.tail; - } - spin_unlock_irqrestore(&msi_lock, flags); - - return 0; -} - /** * pci_enable_msix - configure device's MSI-X capability structure * @dev: pointer to the pci_dev data structure of MSI-X device function * @entries: pointer to an array of MSI-X entries - * @nvec: number of MSI-X vectors requested for allocation by device driver + * @nvec: number of MSI-X irqs requested for allocation by device driver * * Setup the MSI-X capability structure of device function with the number - * of requested vectors upon its software driver call to request for + * of requested irqs upon its software driver call to request for * MSI-X mode enabled on its hardware device function. A return of zero * indicates the successful configuration of MSI-X capability structure - * with new allocated MSI-X vectors. A return of < 0 indicates a failure. + * with new allocated MSI-X irqs. A return of < 0 indicates a failure. * Or a return of > 0 indicates that driver request is exceeding the number - * of vectors available. Driver should use the returned value to re-send + * of irqs available. Driver should use the returned value to re-send * its request. 
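
All MSI address/data programming now funnels through struct msi_msg and the read_msi_msg()/write_msi_msg() helpers introduced earlier, so initial setup, affinity changes and MSI-X suspend/resume (via the msg_save copy) share a single encode/decode path instead of three open-coded register sequences. The message layout these helpers assume:

struct msi_msg {
	u32	address_lo;	/* low 32 bits of the MSI target address */
	u32	address_hi;	/* upper 32 bits; 0 for 32-bit-only devices */
	u32	data;		/* message payload, carries the vector */
};

read_msi_msg() decodes the current programming from MSI config space or the MSI-X table, the arch target() op rewrites the destination, and write_msi_msg() programs the result back.
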
**/ int pci_enable_msix(struct pci_dev* dev, struct msix_entry *entries, int nvec) { struct pci_bus *bus; - int status, pos, nr_entries, free_vectors; + int status, pos, nr_entries; int i, j, temp; u16 control; - unsigned long flags; if (!pci_msi_enable || !dev || !entries) return -EINVAL; @@ -1159,9 +980,6 @@ int pci_enable_msix(struct pci_dev* dev, return -EINVAL; pci_read_config_word(dev, msi_control_reg(pos), &control); - if (control & PCI_MSIX_FLAGS_ENABLE) - return -EINVAL; /* Already in MSI-X mode */ - nr_entries = multi_msix_capable(control); if (nvec > nr_entries) return -EINVAL; @@ -1176,56 +994,18 @@ int pci_enable_msix(struct pci_dev* dev, } } temp = dev->irq; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - /* Lookup Sucess */ - nr_entries = nvec; - /* Reroute MSI-X table */ - if (reroute_msix_table(dev->irq, entries, &nr_entries)) { - /* #requested > #previous-assigned */ - dev->irq = temp; - return nr_entries; - } - dev->irq = temp; - enable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); - return 0; - } - /* Check whether driver already requested for MSI vector */ + WARN_ON(!msi_lookup_irq(dev, PCI_CAP_ID_MSIX)); + + /* Check whether driver already requested for MSI irq */ if (pci_find_capability(dev, PCI_CAP_ID_MSI) > 0 && - !msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { + !msi_lookup_irq(dev, PCI_CAP_ID_MSI)) { printk(KERN_INFO "PCI: %s: Can't enable MSI-X. " - "Device already has an MSI vector assigned\n", + "Device already has an MSI irq assigned\n", pci_name(dev)); dev->irq = temp; return -EINVAL; } - - spin_lock_irqsave(&msi_lock, flags); - /* - * msi_lock is provided to ensure that enough vectors resources are - * available before granting. - */ - free_vectors = pci_vector_resources(last_alloc_vector, - nr_released_vectors); - /* Ensure that each MSI/MSI-X device has one vector reserved by - default to avoid any MSI-X driver to take all available - resources */ - free_vectors -= nr_reserved_vectors; - /* Find the average of free vectors among MSI-X devices */ - if (nr_msix_devices > 0) - free_vectors /= nr_msix_devices; - spin_unlock_irqrestore(&msi_lock, flags); - - if (nvec > free_vectors) { - if (free_vectors > 0) - return free_vectors; - else - return -EBUSY; - } - status = msix_capability_init(dev, entries, nvec); - if (!status && nr_msix_devices > 0) - nr_msix_devices--; - return status; } @@ -1247,47 +1027,42 @@ void pci_disable_msix(struct pci_dev* de if (!(control & PCI_MSIX_FLAGS_ENABLE)) return; + disable_msi_mode(dev, pos, PCI_CAP_ID_MSIX); + temp = dev->irq; - if (!msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - int state, vector, head, tail = 0, warning = 0; + if (!msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { + int state, irq, head, tail = 0, warning = 0; unsigned long flags; - vector = head = dev->irq; - spin_lock_irqsave(&msi_lock, flags); + irq = head = dev->irq; + dev->irq = temp; /* Restore pin IRQ */ while (head != tail) { - state = msi_desc[vector]->msi_attrib.state; + spin_lock_irqsave(&msi_lock, flags); + state = msi_desc[irq]->msi_attrib.state; + tail = msi_desc[irq]->link.tail; + spin_unlock_irqrestore(&msi_lock, flags); if (state) warning = 1; - else { - vector_irq[vector] = 0; /* free it */ - nr_released_vectors++; - } - tail = msi_desc[vector]->link.tail; - vector = tail; + else if (irq != head) /* Release MSI-X irq */ + msi_free_irq(dev, irq); + irq = tail; } - spin_unlock_irqrestore(&msi_lock, flags); + msi_free_irq(dev, irq); if (warning) { - dev->irq = temp; printk(KERN_WARNING "PCI: %s: pci_disable_msix() called without " - "free_irq() on all MSI-X 
vectors\n", + "free_irq() on all MSI-X irqs\n", pci_name(dev)); BUG_ON(warning > 0); - } else { - dev->irq = temp; - disable_msi_mode(dev, - pci_find_capability(dev, PCI_CAP_ID_MSIX), - PCI_CAP_ID_MSIX); - } } } /** - * msi_remove_pci_irq_vectors - reclaim MSI(X) vectors to unused state + * msi_remove_pci_irq_vectors - reclaim MSI(X) irqs to unused state * @dev: pointer to the pci_dev data structure of MSI(X) device function * * Being called during hotplug remove, from which the device function - * is hot-removed. All previous assigned MSI/MSI-X vectors, if + * is hot-removed. All previous assigned MSI/MSI-X irqs, if * allocated for this device function, are reclaimed to unused state, * which may be used later on. **/ @@ -1301,42 +1076,42 @@ void msi_remove_pci_irq_vectors(struct p temp = dev->irq; /* Save IOAPIC IRQ */ pos = pci_find_capability(dev, PCI_CAP_ID_MSI); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSI)) { + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSI)) { spin_lock_irqsave(&msi_lock, flags); state = msi_desc[dev->irq]->msi_attrib.state; spin_unlock_irqrestore(&msi_lock, flags); if (state) { printk(KERN_WARNING "PCI: %s: msi_remove_pci_irq_vectors() " - "called without free_irq() on MSI vector %d\n", + "called without free_irq() on MSI irq %d\n", pci_name(dev), dev->irq); BUG_ON(state > 0); - } else /* Release MSI vector assigned to this device */ - msi_free_vector(dev, dev->irq, 0); + } else /* Release MSI irq assigned to this device */ + msi_free_irq(dev, dev->irq); dev->irq = temp; /* Restore IOAPIC IRQ */ } pos = pci_find_capability(dev, PCI_CAP_ID_MSIX); - if (pos > 0 && !msi_lookup_vector(dev, PCI_CAP_ID_MSIX)) { - int vector, head, tail = 0, warning = 0; + if (pos > 0 && !msi_lookup_irq(dev, PCI_CAP_ID_MSIX)) { + int irq, head, tail = 0, warning = 0; void __iomem *base = NULL; - vector = head = dev->irq; + irq = head = dev->irq; while (head != tail) { spin_lock_irqsave(&msi_lock, flags); - state = msi_desc[vector]->msi_attrib.state; - tail = msi_desc[vector]->link.tail; - base = msi_desc[vector]->mask_base; + state = msi_desc[irq]->msi_attrib.state; + tail = msi_desc[irq]->link.tail; + base = msi_desc[irq]->mask_base; spin_unlock_irqrestore(&msi_lock, flags); if (state) warning = 1; - else if (vector != head) /* Release MSI-X vector */ - msi_free_vector(dev, vector, 0); - vector = tail; + else if (irq != head) /* Release MSI-X irq */ + msi_free_irq(dev, irq); + irq = tail; } - msi_free_vector(dev, vector, 0); + msi_free_irq(dev, irq); if (warning) { iounmap(base); printk(KERN_WARNING "PCI: %s: msi_remove_pci_irq_vectors() " - "called without free_irq() on all MSI-X vectors\n", + "called without free_irq() on all MSI-X irqs\n", pci_name(dev)); BUG_ON(warning > 0); } Index: linux/drivers/pci/msi.h =================================================================== --- linux.orig/drivers/pci/msi.h +++ linux/drivers/pci/msi.h @@ -6,85 +6,9 @@ #ifndef MSI_H #define MSI_H -/* - * MSI operation vector. Used by the msi core code (drivers/pci/msi.c) - * to abstract platform-specific tasks relating to MSI address generation - * and resource management. 
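- * (Editor's note: this whole ops vector is removed by this patch;
- * judging by the PM save-area hunk further down, the address/data
- * pair now travels as a "struct msi_msg" instead of three separate
- * u32 save words.)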
- */ -struct msi_ops { - /** - * setup - generate an MSI bus address and data for a given vector - * @pdev: PCI device context (in) - * @vector: vector allocated by the msi core (in) - * @addr_hi: upper 32 bits of PCI bus MSI address (out) - * @addr_lo: lower 32 bits of PCI bus MSI address (out) - * @data: MSI data payload (out) - * - * Description: The setup op is used to generate a PCI bus addres and - * data which the msi core will program into the card MSI capability - * registers. The setup routine is responsible for picking an initial - * cpu to target the MSI at. The setup routine is responsible for - * examining pdev to determine the MSI capabilities of the card and - * generating a suitable address/data. The setup routine is - * responsible for allocating and tracking any system resources it - * needs to route the MSI to the cpu it picks, and for associating - * those resources with the passed in vector. - * - * Returns 0 if the MSI address/data was successfully setup. - **/ - - int (*setup) (struct pci_dev *pdev, unsigned int vector, - u32 *addr_hi, u32 *addr_lo, u32 *data); - - /** - * teardown - release resources allocated by setup - * @vector: vector context for resources (in) - * - * Description: The teardown op is used to release any resources - * that were allocated in the setup routine associated with the passed - * in vector. - **/ - - void (*teardown) (unsigned int vector); - - /** - * target - retarget an MSI at a different cpu - * @vector: vector context for resources (in) - * @cpu: new cpu to direct vector at (in) - * @addr_hi: new value of PCI bus upper 32 bits (in/out) - * @addr_lo: new value of PCI bus lower 32 bits (in/out) - * - * Description: The target op is used to redirect an MSI vector - * at a different cpu. addr_hi/addr_lo coming in are the existing - * values that the MSI core has programmed into the card. The - * target code is responsible for freeing any resources (if any) - * associated with the old address, and generating a new PCI bus - * addr_hi/addr_lo that will redirect the vector at the indicated cpu. - **/ - - void (*target) (unsigned int vector, unsigned int cpu, - u32 *addr_hi, u32 *addr_lo); -}; - -extern int msi_register(struct msi_ops *ops); - #include /* - * Assume the maximum number of hot plug slots supported by the system is about - * ten. The worstcase is that each of these slots is hot-added with a device, - * which has two MSI/MSI-X capable functions. To avoid any MSI-X driver, which - * attempts to request all available vectors, NR_HP_RESERVED_VECTORS is defined - * as below to ensure at least one message is assigned to each detected MSI/ - * MSI-X device function. 
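- * (Worked out: ~10 hot-plug slots x 2 MSI/MSI-X capable functions
- * each = 20, hence the NR_HP_RESERVED_VECTORS value below.)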
- */ -#define NR_HP_RESERVED_VECTORS 20 - -extern int vector_irq[NR_VECTORS]; -extern void (*interrupt[NR_IRQS])(void); -extern int pci_vector_resources(int last, int nr_released); - -/* * MSI-X Address Register */ #define PCI_MSIX_FLAGS_QSIZE 0x7FF @@ -110,8 +34,8 @@ extern int pci_vector_resources(int last (1 << ((control & PCI_MSI_FLAGS_QMASK) >> 1)) #define multi_msi_enable(control, num) \ control |= (((num >> 1) << 4) & PCI_MSI_FLAGS_QSIZE); -#define is_64bit_address(control) (control & PCI_MSI_FLAGS_64BIT) -#define is_mask_bit_support(control) (control & PCI_MSI_FLAGS_MASKBIT) +#define is_64bit_address(control) (!!(control & PCI_MSI_FLAGS_64BIT)) +#define is_mask_bit_support(control) (!!(control & PCI_MSI_FLAGS_MASKBIT)) #define msi_enable(control, num) multi_msi_enable(control, num); \ control |= PCI_MSI_FLAGS_ENABLE @@ -130,10 +54,10 @@ struct msi_desc { __u8 type : 5; /* {0: unused, 5h:MSI, 11h:MSI-X} */ __u8 maskbit : 1; /* mask-pending bit supported ? */ __u8 state : 1; /* {0: free, 1: busy} */ - __u8 reserved: 1; /* reserved */ - __u8 entry_nr; /* specific enabled entry */ - __u8 default_vector; /* default pre-assigned vector */ - __u8 unused; /* formerly unused destination cpu*/ + __u8 is_64 : 1; /* Address size: 0=32bit 1=64bit */ + __u8 pos; /* Location of the msi capability */ + __u16 entry_nr; /* specific enabled entry */ + unsigned default_irq; /* default pre-assigned irq */ }msi_attrib; struct { @@ -146,10 +70,7 @@ struct msi_desc { #ifdef CONFIG_PM /* PM save area for MSIX address/data */ - - u32 address_hi_save; - u32 address_lo_save; - u32 data_save; + struct msi_msg msg_save; #endif }; Index: linux/drivers/scsi/aacraid/aacraid.h =================================================================== --- linux.orig/drivers/scsi/aacraid/aacraid.h +++ linux/drivers/scsi/aacraid/aacraid.h @@ -744,7 +744,7 @@ struct aac_fib_context { u32 unique; // unique value representing this context ulong jiffies; // used for cleanup - dmb changed to ulong struct list_head next; // used to link context's into a linked list - struct semaphore wait_sem; // this is used to wait for the next fib to arrive. + struct compat_semaphore wait_sem; // this is used to wait for the next fib to arrive. int wait; // Set to true when thread is in WaitForSingleObject unsigned long count; // total number of FIBs on FibList struct list_head fib_list; // this holds fibs and their attachd hw_fibs @@ -814,7 +814,7 @@ struct fib { * This is the event the sendfib routine will wait on if the * caller did not pass one and this is synch io. 
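 *
 * (Editor's note, assuming the usual -rt semantics: a plain
 * "struct semaphore" becomes a sleeping rt_mutex-style lock on
 * PREEMPT_RT, which must be released by the task that took it. A
 * semaphore used as an event - down() in the submitter, up() from
 * the completion path - needs classic counting behaviour, hence
 * "struct compat_semaphore". The init/down/up calls are assumed to
 * be type-generic in the -rt tree:)
 *
 *	struct compat_semaphore event_wait;
 *
 *	sema_init(&event_wait, 0);	/* start blocked */
 *	...
 *	down(&event_wait);		/* submitting task sleeps here */
 *	...
 *	up(&event_wait);		/* completion/irq path wakes it */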
*/ - struct semaphore event_wait; + struct compat_semaphore event_wait; spinlock_t event_lock; u32 done; /* gets set to 1 when fib is complete */ Index: linux/drivers/scsi/qla2xxx/qla_def.h =================================================================== --- linux.orig/drivers/scsi/qla2xxx/qla_def.h +++ linux/drivers/scsi/qla2xxx/qla_def.h @@ -2284,7 +2284,7 @@ typedef struct scsi_qla_host { spinlock_t mbx_reg_lock; /* Mbx Cmd Register Lock */ struct semaphore mbx_cmd_sem; /* Serialialize mbx access */ - struct semaphore mbx_intr_sem; /* Used for completion notification */ + struct compat_semaphore mbx_intr_sem; /* Used for completion notification */ uint32_t mbx_flags; #define MBX_IN_PROGRESS BIT_0 Index: linux/drivers/serial/8250.c =================================================================== --- linux.orig/drivers/serial/8250.c +++ linux/drivers/serial/8250.c @@ -2252,14 +2252,10 @@ serial8250_console_write(struct console touch_nmi_watchdog(); - local_irq_save(flags); - if (up->port.sysrq) { - /* serial8250_handle_port() already took the lock */ - locked = 0; - } else if (oops_in_progress) { - locked = spin_trylock(&up->port.lock); - } else - spin_lock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); + else + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -2281,8 +2277,7 @@ serial8250_console_write(struct console serial_out(up, UART_IER, ier); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int serial8250_console_setup(struct console *co, char *options) Index: linux/drivers/usb/core/devio.c =================================================================== --- linux.orig/drivers/usb/core/devio.c +++ linux/drivers/usb/core/devio.c @@ -304,10 +304,11 @@ static void async_completed(struct urb * struct async *as = (struct async *)urb->context; struct dev_state *ps = as->ps; struct siginfo sinfo; + unsigned long flags; - spin_lock(&ps->lock); - list_move_tail(&as->asynclist, &ps->async_completed); - spin_unlock(&ps->lock); + spin_lock_irqsave(&ps->lock, flags); + list_move_tail(&as->asynclist, &ps->async_completed); + spin_unlock_irqrestore(&ps->lock, flags); if (as->signr) { sinfo.si_signo = as->signr; sinfo.si_errno = as->urb->status; Index: linux/drivers/usb/core/hcd.c =================================================================== --- linux.orig/drivers/usb/core/hcd.c +++ linux/drivers/usb/core/hcd.c @@ -515,13 +515,11 @@ error: } /* any errors get returned through the urb completion */ - local_irq_save (flags); - spin_lock (&urb->lock); + spin_lock_irqsave(&urb->lock, flags); if (urb->status == -EINPROGRESS) urb->status = status; - spin_unlock (&urb->lock); + spin_unlock_irqrestore(&urb->lock, flags); usb_hcd_giveback_urb (hcd, urb, NULL); - local_irq_restore (flags); return 0; } @@ -549,8 +547,7 @@ void usb_hcd_poll_rh_status(struct usb_h if (length > 0) { /* try to complete the status urb */ - local_irq_save (flags); - spin_lock(&hcd_root_hub_lock); + spin_lock_irqsave(&hcd_root_hub_lock, flags); urb = hcd->status_urb; if (urb) { spin_lock(&urb->lock); @@ -566,14 +563,13 @@ void usb_hcd_poll_rh_status(struct usb_h spin_unlock(&urb->lock); } else length = 0; - spin_unlock(&hcd_root_hub_lock); + spin_unlock_irqrestore(&hcd_root_hub_lock, flags); /* local irqs are always blocked in completions */ if (length > 0) usb_hcd_giveback_urb (hcd, urb, NULL); else hcd->poll_pending = 1; - 
local_irq_restore (flags); } /* The USB 2.0 spec says 256 ms. This is close enough and won't @@ -656,17 +652,15 @@ static int usb_rh_urb_dequeue (struct us } else { /* Status URB */ if (!hcd->uses_new_polling) del_timer_sync (&hcd->rh_timer); - local_irq_disable (); - spin_lock (&hcd_root_hub_lock); + spin_lock_irq(&hcd_root_hub_lock); if (urb == hcd->status_urb) { hcd->status_urb = NULL; urb->hcpriv = NULL; } else urb = NULL; /* wasn't fully queued */ - spin_unlock (&hcd_root_hub_lock); + spin_unlock_irq(&hcd_root_hub_lock); if (urb) usb_hcd_giveback_urb (hcd, urb, NULL); - local_irq_enable (); } return 0; @@ -1371,15 +1365,13 @@ hcd_endpoint_disable (struct usb_device WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT && udev->state != USB_STATE_NOTATTACHED); - local_irq_disable (); - /* FIXME move most of this into message.c as part of its * endpoint disable logic */ /* ep is already gone from udev->ep_{in,out}[]; no more submits */ rescan: - spin_lock (&hcd_data_lock); + spin_lock_irq(&hcd_data_lock); list_for_each_entry (urb, &ep->urb_list, urb_list) { int tmp; @@ -1392,13 +1384,13 @@ rescan: if (urb->status != -EINPROGRESS) continue; usb_get_urb (urb); - spin_unlock (&hcd_data_lock); + spin_unlock_irq(&hcd_data_lock); - spin_lock (&urb->lock); + spin_lock_irq(&urb->lock); tmp = urb->status; if (tmp == -EINPROGRESS) urb->status = -ESHUTDOWN; - spin_unlock (&urb->lock); + spin_unlock_irq(&urb->lock); /* kick hcd unless it's already returning this */ if (tmp == -EINPROGRESS) { @@ -1421,8 +1413,7 @@ rescan: /* list contents may have changed */ goto rescan; } - spin_unlock (&hcd_data_lock); - local_irq_enable (); + spin_unlock_irq(&hcd_data_lock); /* synchronize with the hardware, so old configuration state * clears out immediately (and will be freed). Index: linux/drivers/usb/core/message.c =================================================================== --- linux.orig/drivers/usb/core/message.c +++ linux/drivers/usb/core/message.c @@ -264,8 +264,9 @@ static void sg_clean (struct usb_sg_requ static void sg_complete (struct urb *urb, struct pt_regs *regs) { struct usb_sg_request *io = (struct usb_sg_request *) urb->context; + unsigned long flags; - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); /* In 2.5 we require hcds' endpoint queues not to progress after fault * reports, until the completion callback (this!) returns. That lets @@ -299,7 +300,7 @@ static void sg_complete (struct urb *urb * unlink pending urbs so they won't rx/tx bad data. * careful: unlink can sometimes be synchronous... 
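 *
 * (Editor's note: the same conversion pattern recurs throughout this
 * series - in the 8250.c, devio.c and hcd.c hunks above as well.
 * Because URB completion may run in thread context on -rt, callers
 * can no longer assume hardirqs are off, so separate
 * local_irq_save()/spin_lock() pairs are folded into the
 * irq-state-saving lock form:)
 *
 *	unsigned long flags;
 *
 *	spin_lock_irqsave(&io->lock, flags);
 *	/* ... critical section, safe in any context ... */
 *	spin_unlock_irqrestore(&io->lock, flags);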
*/ - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); for (i = 0, found = 0; i < io->entries; i++) { if (!io->urbs [i] || !io->urbs [i]->dev) continue; @@ -314,7 +315,7 @@ static void sg_complete (struct urb *urb } else if (urb == io->urbs [i]) found = 1; } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } urb->dev = NULL; @@ -324,7 +325,7 @@ static void sg_complete (struct urb *urb if (!io->count) complete (&io->complete); - spin_unlock (&io->lock); + spin_unlock_irqrestore (&io->lock, flags); } @@ -586,7 +587,7 @@ void usb_sg_cancel (struct usb_sg_reques dev_warn (&io->dev->dev, "%s, unlink --> %d\n", __FUNCTION__, retval); } - spin_lock (&io->lock); + spin_lock_irqsave (&io->lock, flags); } spin_unlock_irqrestore (&io->lock, flags); } Index: linux/drivers/usb/net/usbnet.c =================================================================== --- linux.orig/drivers/usb/net/usbnet.c +++ linux/drivers/usb/net/usbnet.c @@ -818,6 +818,8 @@ static void tx_complete (struct urb *urb urb->dev = NULL; entry->state = tx_done; + spin_lock_rt(&dev->txq.lock); + spin_unlock_rt(&dev->txq.lock); defer_bh(dev, skb, &dev->txq); } Index: linux/drivers/usb/storage/usb.h =================================================================== --- linux.orig/drivers/usb/storage/usb.h +++ linux/drivers/usb/storage/usb.h @@ -147,7 +147,7 @@ struct us_data { dma_addr_t iobuf_dma; /* mutual exclusion and synchronization structures */ - struct semaphore sema; /* to sleep thread on */ + struct compat_semaphore sema; /* to sleep thread on */ struct completion notify; /* thread begin/end */ wait_queue_head_t delay_wait; /* wait during scan, reset */ Index: linux/drivers/video/console/fbcon.c =================================================================== --- linux.orig/drivers/video/console/fbcon.c +++ linux/drivers/video/console/fbcon.c @@ -1247,7 +1247,6 @@ static void fbcon_clear(struct vc_data * { struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]]; struct fbcon_ops *ops = info->fbcon_par; - struct display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1276,10 +1275,11 @@ static void fbcon_putcs(struct vc_data * struct display *p = &fb_display[vc->vc_num]; struct fbcon_ops *ops = info->fbcon_par; - if (!fbcon_is_inactive(vc, info)) + if (!fbcon_is_inactive(vc, info)) { ops->putcs(vc, info, s, count, real_y(p, ypos), xpos, get_color(vc, info, scr_readw(s), 1), get_color(vc, info, scr_readw(s), 0)); + } } static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos) @@ -3079,6 +3079,7 @@ static const struct consw fb_con = { .con_screen_pos = fbcon_screen_pos, .con_getxy = fbcon_getxy, .con_resize = fbcon_resize, + .con_preemptible = 1, }; static struct notifier_block fbcon_event_notifier = { Index: linux/drivers/video/console/vgacon.c =================================================================== --- linux.orig/drivers/video/console/vgacon.c +++ linux/drivers/video/console/vgacon.c @@ -52,7 +52,7 @@ #include